All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdkfd: don't use dqm lock during device reset/suspend/resume
@ 2019-10-21 21:04 Yang, Philip
       [not found] ` <20191021210436.10571-1-Philip.Yang-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 6+ messages in thread
From: Yang, Philip @ 2019-10-21 21:04 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Yang, Philip

If device reset/suspend/resume failed for some reason, dqm lock is
hold forever and this causes deadlock. Below is a kernel backtrace when
application open kfd after suspend/resume failed.

Instead of holding dqm lock in pre_reset and releasing dqm lock in
post_reset, add dqm->device_stopped flag which is modified in
dqm->ops.start and dqm->ops.stop. The flag doesn't need lock protection
because write/read are all inside dqm lock.

For HWS case, map_queues_cpsch and unmap_queues_cpsch checks
device_stopped flag before sending the updated runlist.

Backtrace of dqm lock deadlock:

[Thu Oct 17 16:43:37 2019] INFO: task rocminfo:3024 blocked for more
than 120 seconds.
[Thu Oct 17 16:43:37 2019]       Not tainted
5.0.0-rc1-kfd-compute-rocm-dkms-no-npi-1131 #1
[Thu Oct 17 16:43:37 2019] "echo 0 >
/proc/sys/kernel/hung_task_timeout_secs" disables this message.
[Thu Oct 17 16:43:37 2019] rocminfo        D    0  3024   2947
0x80000000
[Thu Oct 17 16:43:37 2019] Call Trace:
[Thu Oct 17 16:43:37 2019]  ? __schedule+0x3d9/0x8a0
[Thu Oct 17 16:43:37 2019]  schedule+0x32/0x70
[Thu Oct 17 16:43:37 2019]  schedule_preempt_disabled+0xa/0x10
[Thu Oct 17 16:43:37 2019]  __mutex_lock.isra.9+0x1e3/0x4e0
[Thu Oct 17 16:43:37 2019]  ? __call_srcu+0x264/0x3b0
[Thu Oct 17 16:43:37 2019]  ? process_termination_cpsch+0x24/0x2f0
[amdgpu]
[Thu Oct 17 16:43:37 2019]  process_termination_cpsch+0x24/0x2f0
[amdgpu]
[Thu Oct 17 16:43:37 2019]
kfd_process_dequeue_from_all_devices+0x42/0x60 [amdgpu]
[Thu Oct 17 16:43:37 2019]  kfd_process_notifier_release+0x1be/0x220
[amdgpu]
[Thu Oct 17 16:43:37 2019]  __mmu_notifier_release+0x3e/0xc0
[Thu Oct 17 16:43:37 2019]  exit_mmap+0x160/0x1a0
[Thu Oct 17 16:43:37 2019]  ? __handle_mm_fault+0xba3/0x1200
[Thu Oct 17 16:43:37 2019]  ? exit_robust_list+0x5a/0x110
[Thu Oct 17 16:43:37 2019]  mmput+0x4a/0x120
[Thu Oct 17 16:43:37 2019]  do_exit+0x284/0xb20
[Thu Oct 17 16:43:37 2019]  ? handle_mm_fault+0xfa/0x200
[Thu Oct 17 16:43:37 2019]  do_group_exit+0x3a/0xa0
[Thu Oct 17 16:43:37 2019]  __x64_sys_exit_group+0x14/0x20
[Thu Oct 17 16:43:37 2019]  do_syscall_64+0x4f/0x100
[Thu Oct 17 16:43:37 2019]  entry_SYSCALL_64_after_hwframe+0x44/0xa9

Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Philip Yang <Philip.Yang@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c            |  6 +++---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c             |  5 -----
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c   | 13 ++++++++++---
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.h   |  1 +
 4 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index d9e36dbf13d5..40d75c39f08e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -120,13 +120,13 @@ static int kfd_open(struct inode *inode, struct file *filep)
 		return -EPERM;
 	}
 
+	if (kfd_is_locked())
+		return -EAGAIN;
+
 	process = kfd_create_process(filep);
 	if (IS_ERR(process))
 		return PTR_ERR(process);
 
-	if (kfd_is_locked())
-		return -EAGAIN;
-
 	dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n",
 		process->pasid, process->is_32bit_user_mode);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 8f4b24e84964..4fa8834ce7cb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -730,9 +730,6 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
 		return 0;
 	kgd2kfd_suspend(kfd);
 
-	/* hold dqm->lock to prevent further execution*/
-	dqm_lock(kfd->dqm);
-
 	kfd_signal_reset_event(kfd);
 	return 0;
 }
@@ -750,8 +747,6 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
 	if (!kfd->init_complete)
 		return 0;
 
-	dqm_unlock(kfd->dqm);
-
 	ret = kfd_resume(kfd);
 	if (ret)
 		return ret;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 81fb545cf42c..04a40fabe9d7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -915,7 +915,8 @@ static int start_nocpsch(struct device_queue_manager *dqm)
 	
 	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
 		return pm_init(&dqm->packets, dqm);
-	
+	dqm->device_stopped = false;
+
 	return 0;
 }
 
@@ -923,7 +924,8 @@ static int stop_nocpsch(struct device_queue_manager *dqm)
 {
 	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
 		pm_uninit(&dqm->packets);
-	
+	dqm->device_stopped = true;
+
 	return 0;
 }
 
@@ -1074,6 +1076,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
 	dqm_lock(dqm);
 	/* clear hang status when driver try to start the hw scheduler */
 	dqm->is_hws_hang = false;
+	dqm->device_stopped = false;
 	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
 	dqm_unlock(dqm);
 
@@ -1089,6 +1092,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)
 {
 	dqm_lock(dqm);
 	unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
+	dqm->device_stopped = true;
 	dqm_unlock(dqm);
 
 	kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
@@ -1275,9 +1279,10 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
 {
 	int retval;
 
+	if (dqm->device_stopped)
+		return 0;
 	if (dqm->queue_count <= 0 || dqm->processes_count <= 0)
 		return 0;
-
 	if (dqm->active_runlist)
 		return 0;
 
@@ -1299,6 +1304,8 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 {
 	int retval = 0;
 
+	if (dqm->device_stopped)
+		return 0;
 	if (dqm->is_hws_hang)
 		return -EIO;
 	if (!dqm->active_runlist)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 2eaea6b04cbe..44ecdf999ca8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -201,6 +201,7 @@ struct device_queue_manager {
 	bool			is_hws_hang;
 	struct work_struct	hw_exception_work;
 	struct kfd_mem_obj	hiq_sdma_mqd;
+	bool			device_stopped;
 };
 
 void device_queue_manager_init_cik(
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* RE: [PATCH] drm/amdkfd: don't use dqm lock during device reset/suspend/resume
       [not found] ` <20191021210436.10571-1-Philip.Yang-5C7GfCeVMHo@public.gmane.org>
@ 2019-10-21 23:34   ` Zeng, Oak
  2019-10-22  1:03   ` Kuehling, Felix
  1 sibling, 0 replies; 6+ messages in thread
From: Zeng, Oak @ 2019-10-21 23:34 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Yang, Philip

Nice fix. Reviewed-by: Oak Zeng <Oak.Zeng@amd.com>

Regards,
Oak

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Yang, Philip
Sent: Monday, October 21, 2019 5:05 PM
To: amd-gfx@lists.freedesktop.org
Cc: Yang, Philip <Philip.Yang@amd.com>
Subject: [PATCH] drm/amdkfd: don't use dqm lock during device reset/suspend/resume

If device reset/suspend/resume failed for some reason, dqm lock is hold forever and this causes deadlock. Below is a kernel backtrace when application open kfd after suspend/resume failed.

Instead of holding dqm lock in pre_reset and releasing dqm lock in post_reset, add dqm->device_stopped flag which is modified in
dqm->ops.start and dqm->ops.stop. The flag doesn't need lock protection
because write/read are all inside dqm lock.

For HWS case, map_queues_cpsch and unmap_queues_cpsch checks device_stopped flag before sending the updated runlist.

Backtrace of dqm lock deadlock:

[Thu Oct 17 16:43:37 2019] INFO: task rocminfo:3024 blocked for more than 120 seconds.
[Thu Oct 17 16:43:37 2019]       Not tainted
5.0.0-rc1-kfd-compute-rocm-dkms-no-npi-1131 #1 [Thu Oct 17 16:43:37 2019] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[Thu Oct 17 16:43:37 2019] rocminfo        D    0  3024   2947
0x80000000
[Thu Oct 17 16:43:37 2019] Call Trace:
[Thu Oct 17 16:43:37 2019]  ? __schedule+0x3d9/0x8a0 [Thu Oct 17 16:43:37 2019]  schedule+0x32/0x70 [Thu Oct 17 16:43:37 2019]  schedule_preempt_disabled+0xa/0x10
[Thu Oct 17 16:43:37 2019]  __mutex_lock.isra.9+0x1e3/0x4e0 [Thu Oct 17 16:43:37 2019]  ? __call_srcu+0x264/0x3b0 [Thu Oct 17 16:43:37 2019]  ? process_termination_cpsch+0x24/0x2f0
[amdgpu]
[Thu Oct 17 16:43:37 2019]  process_termination_cpsch+0x24/0x2f0
[amdgpu]
[Thu Oct 17 16:43:37 2019]
kfd_process_dequeue_from_all_devices+0x42/0x60 [amdgpu] [Thu Oct 17 16:43:37 2019]  kfd_process_notifier_release+0x1be/0x220
[amdgpu]
[Thu Oct 17 16:43:37 2019]  __mmu_notifier_release+0x3e/0xc0 [Thu Oct 17 16:43:37 2019]  exit_mmap+0x160/0x1a0 [Thu Oct 17 16:43:37 2019]  ? __handle_mm_fault+0xba3/0x1200 [Thu Oct 17 16:43:37 2019]  ? exit_robust_list+0x5a/0x110 [Thu Oct 17 16:43:37 2019]  mmput+0x4a/0x120 [Thu Oct 17 16:43:37 2019]  do_exit+0x284/0xb20 [Thu Oct 17 16:43:37 2019]  ? handle_mm_fault+0xfa/0x200 [Thu Oct 17 16:43:37 2019]  do_group_exit+0x3a/0xa0 [Thu Oct 17 16:43:37 2019]  __x64_sys_exit_group+0x14/0x20 [Thu Oct 17 16:43:37 2019]  do_syscall_64+0x4f/0x100 [Thu Oct 17 16:43:37 2019]  entry_SYSCALL_64_after_hwframe+0x44/0xa9

Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Philip Yang <Philip.Yang@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c            |  6 +++---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c             |  5 -----
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c   | 13 ++++++++++---
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.h   |  1 +
 4 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index d9e36dbf13d5..40d75c39f08e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -120,13 +120,13 @@ static int kfd_open(struct inode *inode, struct file *filep)
 		return -EPERM;
 	}
 
+	if (kfd_is_locked())
+		return -EAGAIN;
+
 	process = kfd_create_process(filep);
 	if (IS_ERR(process))
 		return PTR_ERR(process);
 
-	if (kfd_is_locked())
-		return -EAGAIN;
-
 	dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n",
 		process->pasid, process->is_32bit_user_mode);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 8f4b24e84964..4fa8834ce7cb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -730,9 +730,6 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
 		return 0;
 	kgd2kfd_suspend(kfd);
 
-	/* hold dqm->lock to prevent further execution*/
-	dqm_lock(kfd->dqm);
-
 	kfd_signal_reset_event(kfd);
 	return 0;
 }
@@ -750,8 +747,6 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
 	if (!kfd->init_complete)
 		return 0;
 
-	dqm_unlock(kfd->dqm);
-
 	ret = kfd_resume(kfd);
 	if (ret)
 		return ret;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 81fb545cf42c..04a40fabe9d7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -915,7 +915,8 @@ static int start_nocpsch(struct device_queue_manager *dqm)
 	
 	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
 		return pm_init(&dqm->packets, dqm);
-	
+	dqm->device_stopped = false;
+
 	return 0;
 }
 
@@ -923,7 +924,8 @@ static int stop_nocpsch(struct device_queue_manager *dqm)  {
 	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
 		pm_uninit(&dqm->packets);
-	
+	dqm->device_stopped = true;
+
 	return 0;
 }
 
@@ -1074,6 +1076,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
 	dqm_lock(dqm);
 	/* clear hang status when driver try to start the hw scheduler */
 	dqm->is_hws_hang = false;
+	dqm->device_stopped = false;
 	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
 	dqm_unlock(dqm);
 
@@ -1089,6 +1092,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)  {
 	dqm_lock(dqm);
 	unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
+	dqm->device_stopped = true;
 	dqm_unlock(dqm);
 
 	kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); @@ -1275,9 +1279,10 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)  {
 	int retval;
 
+	if (dqm->device_stopped)
+		return 0;
 	if (dqm->queue_count <= 0 || dqm->processes_count <= 0)
 		return 0;
-
 	if (dqm->active_runlist)
 		return 0;
 
@@ -1299,6 +1304,8 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,  {
 	int retval = 0;
 
+	if (dqm->device_stopped)
+		return 0;
 	if (dqm->is_hws_hang)
 		return -EIO;
 	if (!dqm->active_runlist)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 2eaea6b04cbe..44ecdf999ca8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -201,6 +201,7 @@ struct device_queue_manager {
 	bool			is_hws_hang;
 	struct work_struct	hw_exception_work;
 	struct kfd_mem_obj	hiq_sdma_mqd;
+	bool			device_stopped;
 };
 
 void device_queue_manager_init_cik(
--
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH] drm/amdkfd: don't use dqm lock during device reset/suspend/resume
       [not found] ` <20191021210436.10571-1-Philip.Yang-5C7GfCeVMHo@public.gmane.org>
  2019-10-21 23:34   ` Zeng, Oak
@ 2019-10-22  1:03   ` Kuehling, Felix
       [not found]     ` <fbdfee2b-e9f9-598c-14f3-719429b9f374-5C7GfCeVMHo@public.gmane.org>
  1 sibling, 1 reply; 6+ messages in thread
From: Kuehling, Felix @ 2019-10-22  1:03 UTC (permalink / raw)
  To: Yang, Philip, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


On 2019-10-21 5:04 p.m., Yang, Philip wrote:
> If device reset/suspend/resume failed for some reason, dqm lock is
> hold forever and this causes deadlock. Below is a kernel backtrace when
> application open kfd after suspend/resume failed.
>
> Instead of holding dqm lock in pre_reset and releasing dqm lock in
> post_reset, add dqm->device_stopped flag which is modified in
> dqm->ops.start and dqm->ops.stop. The flag doesn't need lock protection
> because write/read are all inside dqm lock.
>
> For HWS case, map_queues_cpsch and unmap_queues_cpsch checks
> device_stopped flag before sending the updated runlist.

What about the non-HWS case?

In theory in non-HWS case new queues should be created in evicted state 
while the device (and all processes) are suspended. So we should never 
try to map or unmap queues to HQDs during suspend. But I'd feel better 
with a WARN_ON and error return in the right places to make sure we're 
not missing anything. Basically, we can't call any of the 
load/destroy_mqd functions while suspended.

That reminds me, we also have to add some checks in the debugfs code to 
avoid dumping HQDs of a DQM that's stopped.

Last comment: dqm->device_stopped must be initialized as true. It will 
get set to false when the device is first started. It may be easier to 
reverse the logic, something like dqm->sched_running that gets 
implicitly initialized as false.

Regards,
   Felix


>
> Backtrace of dqm lock deadlock:
>
> [Thu Oct 17 16:43:37 2019] INFO: task rocminfo:3024 blocked for more
> than 120 seconds.
> [Thu Oct 17 16:43:37 2019]       Not tainted
> 5.0.0-rc1-kfd-compute-rocm-dkms-no-npi-1131 #1
> [Thu Oct 17 16:43:37 2019] "echo 0 >
> /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [Thu Oct 17 16:43:37 2019] rocminfo        D    0  3024   2947
> 0x80000000
> [Thu Oct 17 16:43:37 2019] Call Trace:
> [Thu Oct 17 16:43:37 2019]  ? __schedule+0x3d9/0x8a0
> [Thu Oct 17 16:43:37 2019]  schedule+0x32/0x70
> [Thu Oct 17 16:43:37 2019]  schedule_preempt_disabled+0xa/0x10
> [Thu Oct 17 16:43:37 2019]  __mutex_lock.isra.9+0x1e3/0x4e0
> [Thu Oct 17 16:43:37 2019]  ? __call_srcu+0x264/0x3b0
> [Thu Oct 17 16:43:37 2019]  ? process_termination_cpsch+0x24/0x2f0
> [amdgpu]
> [Thu Oct 17 16:43:37 2019]  process_termination_cpsch+0x24/0x2f0
> [amdgpu]
> [Thu Oct 17 16:43:37 2019]
> kfd_process_dequeue_from_all_devices+0x42/0x60 [amdgpu]
> [Thu Oct 17 16:43:37 2019]  kfd_process_notifier_release+0x1be/0x220
> [amdgpu]
> [Thu Oct 17 16:43:37 2019]  __mmu_notifier_release+0x3e/0xc0
> [Thu Oct 17 16:43:37 2019]  exit_mmap+0x160/0x1a0
> [Thu Oct 17 16:43:37 2019]  ? __handle_mm_fault+0xba3/0x1200
> [Thu Oct 17 16:43:37 2019]  ? exit_robust_list+0x5a/0x110
> [Thu Oct 17 16:43:37 2019]  mmput+0x4a/0x120
> [Thu Oct 17 16:43:37 2019]  do_exit+0x284/0xb20
> [Thu Oct 17 16:43:37 2019]  ? handle_mm_fault+0xfa/0x200
> [Thu Oct 17 16:43:37 2019]  do_group_exit+0x3a/0xa0
> [Thu Oct 17 16:43:37 2019]  __x64_sys_exit_group+0x14/0x20
> [Thu Oct 17 16:43:37 2019]  do_syscall_64+0x4f/0x100
> [Thu Oct 17 16:43:37 2019]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
>
> Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com>
> Signed-off-by: Philip Yang <Philip.Yang@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c            |  6 +++---
>   drivers/gpu/drm/amd/amdkfd/kfd_device.c             |  5 -----
>   .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c   | 13 ++++++++++---
>   .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.h   |  1 +
>   4 files changed, 14 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index d9e36dbf13d5..40d75c39f08e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -120,13 +120,13 @@ static int kfd_open(struct inode *inode, struct file *filep)
>   		return -EPERM;
>   	}
>   
> +	if (kfd_is_locked())
> +		return -EAGAIN;
> +
>   	process = kfd_create_process(filep);
>   	if (IS_ERR(process))
>   		return PTR_ERR(process);
>   
> -	if (kfd_is_locked())
> -		return -EAGAIN;
> -
>   	dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n",
>   		process->pasid, process->is_32bit_user_mode);
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 8f4b24e84964..4fa8834ce7cb 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -730,9 +730,6 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
>   		return 0;
>   	kgd2kfd_suspend(kfd);
>   
> -	/* hold dqm->lock to prevent further execution*/
> -	dqm_lock(kfd->dqm);
> -
>   	kfd_signal_reset_event(kfd);
>   	return 0;
>   }
> @@ -750,8 +747,6 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
>   	if (!kfd->init_complete)
>   		return 0;
>   
> -	dqm_unlock(kfd->dqm);
> -
>   	ret = kfd_resume(kfd);
>   	if (ret)
>   		return ret;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 81fb545cf42c..04a40fabe9d7 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -915,7 +915,8 @@ static int start_nocpsch(struct device_queue_manager *dqm)
>   	
>   	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>   		return pm_init(&dqm->packets, dqm);
> -	
> +	dqm->device_stopped = false;
> +
>   	return 0;
>   }
>   
> @@ -923,7 +924,8 @@ static int stop_nocpsch(struct device_queue_manager *dqm)
>   {
>   	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>   		pm_uninit(&dqm->packets);
> -	
> +	dqm->device_stopped = true;
> +
>   	return 0;
>   }
>   
> @@ -1074,6 +1076,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
>   	dqm_lock(dqm);
>   	/* clear hang status when driver try to start the hw scheduler */
>   	dqm->is_hws_hang = false;
> +	dqm->device_stopped = false;
>   	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
>   	dqm_unlock(dqm);
>   
> @@ -1089,6 +1092,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)
>   {
>   	dqm_lock(dqm);
>   	unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
> +	dqm->device_stopped = true;
>   	dqm_unlock(dqm);
>   
>   	kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
> @@ -1275,9 +1279,10 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
>   {
>   	int retval;
>   
> +	if (dqm->device_stopped)
> +		return 0;
>   	if (dqm->queue_count <= 0 || dqm->processes_count <= 0)
>   		return 0;
> -
>   	if (dqm->active_runlist)
>   		return 0;
>   
> @@ -1299,6 +1304,8 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
>   {
>   	int retval = 0;
>   
> +	if (dqm->device_stopped)
> +		return 0;
>   	if (dqm->is_hws_hang)
>   		return -EIO;
>   	if (!dqm->active_runlist)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> index 2eaea6b04cbe..44ecdf999ca8 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> @@ -201,6 +201,7 @@ struct device_queue_manager {
>   	bool			is_hws_hang;
>   	struct work_struct	hw_exception_work;
>   	struct kfd_mem_obj	hiq_sdma_mqd;
> +	bool			device_stopped;
>   };
>   
>   void device_queue_manager_init_cik(
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* RE: [PATCH] drm/amdkfd: don't use dqm lock during device reset/suspend/resume
       [not found]     ` <fbdfee2b-e9f9-598c-14f3-719429b9f374-5C7GfCeVMHo@public.gmane.org>
@ 2019-10-22  2:02       ` Zeng, Oak
       [not found]         ` <BL0PR12MB2580AD0DAF7CCB42A29F82E980680-b4cIHhjg/p/XzH18dTCKOgdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  2019-10-22 15:57       ` Yang, Philip
  1 sibling, 1 reply; 6+ messages in thread
From: Zeng, Oak @ 2019-10-22  2:02 UTC (permalink / raw)
  To: Kuehling, Felix, Yang, Philip, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

If we decline the queue creation request in suspend state by returning -EAGAIN, then this approach works for both hws and non-hws. This way the driver is clean but application need to re-create queue later when it get a EAGAIN. Currently application is not aware of the suspend/resume state, so it is hard for application to know when to re-create queue.

The main benefit to allowing create queue in suspend state is that it is easier for application writer. But no actual performance gain as no task will be executed in suspend state.

Regards,
Oak

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Kuehling, Felix
Sent: Monday, October 21, 2019 9:04 PM
To: Yang, Philip <Philip.Yang@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdkfd: don't use dqm lock during device reset/suspend/resume


On 2019-10-21 5:04 p.m., Yang, Philip wrote:
> If device reset/suspend/resume failed for some reason, dqm lock is 
> hold forever and this causes deadlock. Below is a kernel backtrace 
> when application open kfd after suspend/resume failed.
>
> Instead of holding dqm lock in pre_reset and releasing dqm lock in 
> post_reset, add dqm->device_stopped flag which is modified in
> dqm->ops.start and dqm->ops.stop. The flag doesn't need lock 
> dqm->protection
> because write/read are all inside dqm lock.
>
> For HWS case, map_queues_cpsch and unmap_queues_cpsch checks 
> device_stopped flag before sending the updated runlist.

What about the non-HWS case?

In theory in non-HWS case new queues should be created in evicted state while the device (and all processes) are suspended. So we should never try to map or unmap queues to HQDs during suspend. But I'd feel better with a WARN_ON and error return in the right places to make sure we're not missing anything. Basically, we can't call any of the load/destroy_mqd functions while suspended.

That reminds me, we also have to add some checks in the debugfs code to avoid dumping HQDs of a DQM that's stopped.

Last comment: dqm->device_stopped must be initialized as true. It will get set to false when the device is first started. It may be easier to reverse the logic, something like dqm->sched_running that gets implicitly initialized as false.

Regards,
   Felix


>
> Backtrace of dqm lock deadlock:
>
> [Thu Oct 17 16:43:37 2019] INFO: task rocminfo:3024 blocked for more 
> than 120 seconds.
> [Thu Oct 17 16:43:37 2019]       Not tainted
> 5.0.0-rc1-kfd-compute-rocm-dkms-no-npi-1131 #1 [Thu Oct 17 16:43:37 
> 2019] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this 
> message.
> [Thu Oct 17 16:43:37 2019] rocminfo        D    0  3024   2947
> 0x80000000
> [Thu Oct 17 16:43:37 2019] Call Trace:
> [Thu Oct 17 16:43:37 2019]  ? __schedule+0x3d9/0x8a0 [Thu Oct 17 
> 16:43:37 2019]  schedule+0x32/0x70 [Thu Oct 17 16:43:37 2019]  
> schedule_preempt_disabled+0xa/0x10
> [Thu Oct 17 16:43:37 2019]  __mutex_lock.isra.9+0x1e3/0x4e0 [Thu Oct 
> 17 16:43:37 2019]  ? __call_srcu+0x264/0x3b0 [Thu Oct 17 16:43:37 
> 2019]  ? process_termination_cpsch+0x24/0x2f0
> [amdgpu]
> [Thu Oct 17 16:43:37 2019]  process_termination_cpsch+0x24/0x2f0
> [amdgpu]
> [Thu Oct 17 16:43:37 2019]
> kfd_process_dequeue_from_all_devices+0x42/0x60 [amdgpu] [Thu Oct 17 
> 16:43:37 2019]  kfd_process_notifier_release+0x1be/0x220
> [amdgpu]
> [Thu Oct 17 16:43:37 2019]  __mmu_notifier_release+0x3e/0xc0 [Thu Oct 
> 17 16:43:37 2019]  exit_mmap+0x160/0x1a0 [Thu Oct 17 16:43:37 2019]  ? 
> __handle_mm_fault+0xba3/0x1200 [Thu Oct 17 16:43:37 2019]  ? 
> exit_robust_list+0x5a/0x110 [Thu Oct 17 16:43:37 2019]  
> mmput+0x4a/0x120 [Thu Oct 17 16:43:37 2019]  do_exit+0x284/0xb20 [Thu 
> Oct 17 16:43:37 2019]  ? handle_mm_fault+0xfa/0x200 [Thu Oct 17 
> 16:43:37 2019]  do_group_exit+0x3a/0xa0 [Thu Oct 17 16:43:37 2019]  
> __x64_sys_exit_group+0x14/0x20 [Thu Oct 17 16:43:37 2019]  
> do_syscall_64+0x4f/0x100 [Thu Oct 17 16:43:37 2019]  
> entry_SYSCALL_64_after_hwframe+0x44/0xa9
>
> Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com>
> Signed-off-by: Philip Yang <Philip.Yang@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c            |  6 +++---
>   drivers/gpu/drm/amd/amdkfd/kfd_device.c             |  5 -----
>   .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c   | 13 ++++++++++---
>   .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.h   |  1 +
>   4 files changed, 14 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index d9e36dbf13d5..40d75c39f08e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -120,13 +120,13 @@ static int kfd_open(struct inode *inode, struct file *filep)
>   		return -EPERM;
>   	}
>   
> +	if (kfd_is_locked())
> +		return -EAGAIN;
> +
>   	process = kfd_create_process(filep);
>   	if (IS_ERR(process))
>   		return PTR_ERR(process);
>   
> -	if (kfd_is_locked())
> -		return -EAGAIN;
> -
>   	dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n",
>   		process->pasid, process->is_32bit_user_mode);
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 8f4b24e84964..4fa8834ce7cb 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -730,9 +730,6 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
>   		return 0;
>   	kgd2kfd_suspend(kfd);
>   
> -	/* hold dqm->lock to prevent further execution*/
> -	dqm_lock(kfd->dqm);
> -
>   	kfd_signal_reset_event(kfd);
>   	return 0;
>   }
> @@ -750,8 +747,6 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
>   	if (!kfd->init_complete)
>   		return 0;
>   
> -	dqm_unlock(kfd->dqm);
> -
>   	ret = kfd_resume(kfd);
>   	if (ret)
>   		return ret;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 81fb545cf42c..04a40fabe9d7 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -915,7 +915,8 @@ static int start_nocpsch(struct 
> device_queue_manager *dqm)
>   	
>   	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>   		return pm_init(&dqm->packets, dqm);
> -	
> +	dqm->device_stopped = false;
> +
>   	return 0;
>   }
>   
> @@ -923,7 +924,8 @@ static int stop_nocpsch(struct device_queue_manager *dqm)
>   {
>   	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>   		pm_uninit(&dqm->packets);
> -	
> +	dqm->device_stopped = true;
> +
>   	return 0;
>   }
>   
> @@ -1074,6 +1076,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
>   	dqm_lock(dqm);
>   	/* clear hang status when driver try to start the hw scheduler */
>   	dqm->is_hws_hang = false;
> +	dqm->device_stopped = false;
>   	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
>   	dqm_unlock(dqm);
>   
> @@ -1089,6 +1092,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)
>   {
>   	dqm_lock(dqm);
>   	unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
> +	dqm->device_stopped = true;
>   	dqm_unlock(dqm);
>   
>   	kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); @@ -1275,9 +1279,10 @@ 
> static int map_queues_cpsch(struct device_queue_manager *dqm)
>   {
>   	int retval;
>   
> +	if (dqm->device_stopped)
> +		return 0;
>   	if (dqm->queue_count <= 0 || dqm->processes_count <= 0)
>   		return 0;
> -
>   	if (dqm->active_runlist)
>   		return 0;
>   
> @@ -1299,6 +1304,8 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
>   {
>   	int retval = 0;
>   
> +	if (dqm->device_stopped)
> +		return 0;
>   	if (dqm->is_hws_hang)
>   		return -EIO;
>   	if (!dqm->active_runlist)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> index 2eaea6b04cbe..44ecdf999ca8 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> @@ -201,6 +201,7 @@ struct device_queue_manager {
>   	bool			is_hws_hang;
>   	struct work_struct	hw_exception_work;
>   	struct kfd_mem_obj	hiq_sdma_mqd;
> +	bool			device_stopped;
>   };
>   
>   void device_queue_manager_init_cik(
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] drm/amdkfd: don't use dqm lock during device reset/suspend/resume
       [not found]         ` <BL0PR12MB2580AD0DAF7CCB42A29F82E980680-b4cIHhjg/p/XzH18dTCKOgdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2019-10-22 13:46           ` Kuehling, Felix
  0 siblings, 0 replies; 6+ messages in thread
From: Kuehling, Felix @ 2019-10-22 13:46 UTC (permalink / raw)
  To: Zeng, Oak, Yang, Philip, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On 2019-10-21 22:02, Zeng, Oak wrote:
> If we decline the queue creation request in suspend state by returning -EAGAIN, then this approach works for both hws and non-hws. This way the driver is clean but application need to re-create queue later when it get a EAGAIN. Currently application is not aware of the suspend/resume state, so it is hard for application to know when to re-create queue.
>
> The main benefit to allowing create queue in suspend state is that it is easier for application writer. But no actual performance gain as no task will be executed in suspend state.

We should not need to prevent queue creation while suspended. The 
processes are suspended. That means new queues will be created in 
evicted state:

         /*
          * Eviction state logic: mark all queues as evicted, even ones
          * not currently active. Restoring inactive queues later only
          * updates the is_evicted flag but is a no-op otherwise.
          */
         q->properties.is_evicted = !!qpd->evicted;

mqd_mgr->load_mqd will only be called for active queues. So even in the 
non-HWS case we should not be touching the HW while suspended. But I'd 
like to see some safeguards in place to make sure those assumptions are 
never violated.

Regards,
   Felix


>
> Regards,
> Oak
>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Kuehling, Felix
> Sent: Monday, October 21, 2019 9:04 PM
> To: Yang, Philip <Philip.Yang@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdkfd: don't use dqm lock during device reset/suspend/resume
>
>
> On 2019-10-21 5:04 p.m., Yang, Philip wrote:
>> If device reset/suspend/resume failed for some reason, dqm lock is
>> hold forever and this causes deadlock. Below is a kernel backtrace
>> when application open kfd after suspend/resume failed.
>>
>> Instead of holding dqm lock in pre_reset and releasing dqm lock in
>> post_reset, add dqm->device_stopped flag which is modified in
>> dqm->ops.start and dqm->ops.stop. The flag doesn't need lock
>> dqm->protection
>> because write/read are all inside dqm lock.
>>
>> For HWS case, map_queues_cpsch and unmap_queues_cpsch checks
>> device_stopped flag before sending the updated runlist.
> What about the non-HWS case?
>
> In theory in non-HWS case new queues should be created in evicted state while the device (and all processes) are suspended. So we should never try to map or unmap queues to HQDs during suspend. But I'd feel better with a WARN_ON and error return in the right places to make sure we're not missing anything. Basically, we can't call any of the load/destroy_mqd functions while suspended.
>
> That reminds me, we also have to add some checks in the debugfs code to avoid dumping HQDs of a DQM that's stopped.
>
> Last comment: dqm->device_stopped must be initialized as true. It will get set to false when the device is first started. It may be easier to reverse the logic, something like dqm->sched_running that gets implicitly initialized as false.
>
> Regards,
>     Felix
>
>
>> Backtrace of dqm lock deadlock:
>>
>> [Thu Oct 17 16:43:37 2019] INFO: task rocminfo:3024 blocked for more
>> than 120 seconds.
>> [Thu Oct 17 16:43:37 2019]       Not tainted
>> 5.0.0-rc1-kfd-compute-rocm-dkms-no-npi-1131 #1 [Thu Oct 17 16:43:37
>> 2019] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this
>> message.
>> [Thu Oct 17 16:43:37 2019] rocminfo        D    0  3024   2947
>> 0x80000000
>> [Thu Oct 17 16:43:37 2019] Call Trace:
>> [Thu Oct 17 16:43:37 2019]  ? __schedule+0x3d9/0x8a0 [Thu Oct 17
>> 16:43:37 2019]  schedule+0x32/0x70 [Thu Oct 17 16:43:37 2019]
>> schedule_preempt_disabled+0xa/0x10
>> [Thu Oct 17 16:43:37 2019]  __mutex_lock.isra.9+0x1e3/0x4e0 [Thu Oct
>> 17 16:43:37 2019]  ? __call_srcu+0x264/0x3b0 [Thu Oct 17 16:43:37
>> 2019]  ? process_termination_cpsch+0x24/0x2f0
>> [amdgpu]
>> [Thu Oct 17 16:43:37 2019]  process_termination_cpsch+0x24/0x2f0
>> [amdgpu]
>> [Thu Oct 17 16:43:37 2019]
>> kfd_process_dequeue_from_all_devices+0x42/0x60 [amdgpu] [Thu Oct 17
>> 16:43:37 2019]  kfd_process_notifier_release+0x1be/0x220
>> [amdgpu]
>> [Thu Oct 17 16:43:37 2019]  __mmu_notifier_release+0x3e/0xc0 [Thu Oct
>> 17 16:43:37 2019]  exit_mmap+0x160/0x1a0 [Thu Oct 17 16:43:37 2019]  ?
>> __handle_mm_fault+0xba3/0x1200 [Thu Oct 17 16:43:37 2019]  ?
>> exit_robust_list+0x5a/0x110 [Thu Oct 17 16:43:37 2019]
>> mmput+0x4a/0x120 [Thu Oct 17 16:43:37 2019]  do_exit+0x284/0xb20 [Thu
>> Oct 17 16:43:37 2019]  ? handle_mm_fault+0xfa/0x200 [Thu Oct 17
>> 16:43:37 2019]  do_group_exit+0x3a/0xa0 [Thu Oct 17 16:43:37 2019]
>> __x64_sys_exit_group+0x14/0x20 [Thu Oct 17 16:43:37 2019]
>> do_syscall_64+0x4f/0x100 [Thu Oct 17 16:43:37 2019]
>> entry_SYSCALL_64_after_hwframe+0x44/0xa9
>>
>> Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com>
>> Signed-off-by: Philip Yang <Philip.Yang@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdkfd/kfd_chardev.c            |  6 +++---
>>    drivers/gpu/drm/amd/amdkfd/kfd_device.c             |  5 -----
>>    .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c   | 13 ++++++++++---
>>    .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.h   |  1 +
>>    4 files changed, 14 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> index d9e36dbf13d5..40d75c39f08e 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> @@ -120,13 +120,13 @@ static int kfd_open(struct inode *inode, struct file *filep)
>>    		return -EPERM;
>>    	}
>>    
>> +	if (kfd_is_locked())
>> +		return -EAGAIN;
>> +
>>    	process = kfd_create_process(filep);
>>    	if (IS_ERR(process))
>>    		return PTR_ERR(process);
>>    
>> -	if (kfd_is_locked())
>> -		return -EAGAIN;
>> -
>>    	dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n",
>>    		process->pasid, process->is_32bit_user_mode);
>>    
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> index 8f4b24e84964..4fa8834ce7cb 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> @@ -730,9 +730,6 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
>>    		return 0;
>>    	kgd2kfd_suspend(kfd);
>>    
>> -	/* hold dqm->lock to prevent further execution*/
>> -	dqm_lock(kfd->dqm);
>> -
>>    	kfd_signal_reset_event(kfd);
>>    	return 0;
>>    }
>> @@ -750,8 +747,6 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
>>    	if (!kfd->init_complete)
>>    		return 0;
>>    
>> -	dqm_unlock(kfd->dqm);
>> -
>>    	ret = kfd_resume(kfd);
>>    	if (ret)
>>    		return ret;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> index 81fb545cf42c..04a40fabe9d7 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> @@ -915,7 +915,8 @@ static int start_nocpsch(struct
>> device_queue_manager *dqm)
>>    	
>>    	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>>    		return pm_init(&dqm->packets, dqm);
>> -	
>> +	dqm->device_stopped = false;
>> +
>>    	return 0;
>>    }
>>    
>> @@ -923,7 +924,8 @@ static int stop_nocpsch(struct device_queue_manager *dqm)
>>    {
>>    	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>>    		pm_uninit(&dqm->packets);
>> -	
>> +	dqm->device_stopped = true;
>> +
>>    	return 0;
>>    }
>>    
>> @@ -1074,6 +1076,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
>>    	dqm_lock(dqm);
>>    	/* clear hang status when driver try to start the hw scheduler */
>>    	dqm->is_hws_hang = false;
>> +	dqm->device_stopped = false;
>>    	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
>>    	dqm_unlock(dqm);
>>    
>> @@ -1089,6 +1092,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)
>>    {
>>    	dqm_lock(dqm);
>>    	unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>> +	dqm->device_stopped = true;
>>    	dqm_unlock(dqm);
>>    
>>    	kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); @@ -1275,9 +1279,10 @@
>> static int map_queues_cpsch(struct device_queue_manager *dqm)
>>    {
>>    	int retval;
>>    
>> +	if (dqm->device_stopped)
>> +		return 0;
>>    	if (dqm->queue_count <= 0 || dqm->processes_count <= 0)
>>    		return 0;
>> -
>>    	if (dqm->active_runlist)
>>    		return 0;
>>    
>> @@ -1299,6 +1304,8 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
>>    {
>>    	int retval = 0;
>>    
>> +	if (dqm->device_stopped)
>> +		return 0;
>>    	if (dqm->is_hws_hang)
>>    		return -EIO;
>>    	if (!dqm->active_runlist)
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>> index 2eaea6b04cbe..44ecdf999ca8 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>> @@ -201,6 +201,7 @@ struct device_queue_manager {
>>    	bool			is_hws_hang;
>>    	struct work_struct	hw_exception_work;
>>    	struct kfd_mem_obj	hiq_sdma_mqd;
>> +	bool			device_stopped;
>>    };
>>    
>>    void device_queue_manager_init_cik(
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] drm/amdkfd: don't use dqm lock during device reset/suspend/resume
       [not found]     ` <fbdfee2b-e9f9-598c-14f3-719429b9f374-5C7GfCeVMHo@public.gmane.org>
  2019-10-22  2:02       ` Zeng, Oak
@ 2019-10-22 15:57       ` Yang, Philip
  1 sibling, 0 replies; 6+ messages in thread
From: Yang, Philip @ 2019-10-22 15:57 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW



On 2019-10-21 9:03 p.m., Kuehling, Felix wrote:
> 
> On 2019-10-21 5:04 p.m., Yang, Philip wrote:
>> If device reset/suspend/resume failed for some reason, dqm lock is
>> hold forever and this causes deadlock. Below is a kernel backtrace when
>> application open kfd after suspend/resume failed.
>>
>> Instead of holding dqm lock in pre_reset and releasing dqm lock in
>> post_reset, add dqm->device_stopped flag which is modified in
>> dqm->ops.start and dqm->ops.stop. The flag doesn't need lock protection
>> because write/read are all inside dqm lock.
>>
>> For HWS case, map_queues_cpsch and unmap_queues_cpsch checks
>> device_stopped flag before sending the updated runlist.
> 
> What about the non-HWS case?
> 
> In theory in non-HWS case new queues should be created in evicted state
> while the device (and all processes) are suspended. So we should never
> try to map or unmap queues to HQDs during suspend. But I'd feel better
> with a WARN_ON and error return in the right places to make sure we're
> not missing anything. Basically, we can't call any of the
> load/destroy_mqd functions while suspended.
> 
v2 patch add non-HWS case.

> That reminds me, we also have to add some checks in the debugfs code to
> avoid dumping HQDs of a DQM that's stopped.
>
Thanks, done in v2 patch

> Last comment: dqm->device_stopped must be initialized as true. It will
> get set to false when the device is first started. It may be easier to
> reverse the logic, something like dqm->sched_running that gets
> implicitly initialized as false.
> 
Change to dqm->sched_running in v2 patch.

Regards,
Philip

> Regards,
>     Felix
> 
> 
>>
>> Backtrace of dqm lock deadlock:
>>
>> [Thu Oct 17 16:43:37 2019] INFO: task rocminfo:3024 blocked for more
>> than 120 seconds.
>> [Thu Oct 17 16:43:37 2019]       Not tainted
>> 5.0.0-rc1-kfd-compute-rocm-dkms-no-npi-1131 #1
>> [Thu Oct 17 16:43:37 2019] "echo 0 >
>> /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>> [Thu Oct 17 16:43:37 2019] rocminfo        D    0  3024   2947
>> 0x80000000
>> [Thu Oct 17 16:43:37 2019] Call Trace:
>> [Thu Oct 17 16:43:37 2019]  ? __schedule+0x3d9/0x8a0
>> [Thu Oct 17 16:43:37 2019]  schedule+0x32/0x70
>> [Thu Oct 17 16:43:37 2019]  schedule_preempt_disabled+0xa/0x10
>> [Thu Oct 17 16:43:37 2019]  __mutex_lock.isra.9+0x1e3/0x4e0
>> [Thu Oct 17 16:43:37 2019]  ? __call_srcu+0x264/0x3b0
>> [Thu Oct 17 16:43:37 2019]  ? process_termination_cpsch+0x24/0x2f0
>> [amdgpu]
>> [Thu Oct 17 16:43:37 2019]  process_termination_cpsch+0x24/0x2f0
>> [amdgpu]
>> [Thu Oct 17 16:43:37 2019]
>> kfd_process_dequeue_from_all_devices+0x42/0x60 [amdgpu]
>> [Thu Oct 17 16:43:37 2019]  kfd_process_notifier_release+0x1be/0x220
>> [amdgpu]
>> [Thu Oct 17 16:43:37 2019]  __mmu_notifier_release+0x3e/0xc0
>> [Thu Oct 17 16:43:37 2019]  exit_mmap+0x160/0x1a0
>> [Thu Oct 17 16:43:37 2019]  ? __handle_mm_fault+0xba3/0x1200
>> [Thu Oct 17 16:43:37 2019]  ? exit_robust_list+0x5a/0x110
>> [Thu Oct 17 16:43:37 2019]  mmput+0x4a/0x120
>> [Thu Oct 17 16:43:37 2019]  do_exit+0x284/0xb20
>> [Thu Oct 17 16:43:37 2019]  ? handle_mm_fault+0xfa/0x200
>> [Thu Oct 17 16:43:37 2019]  do_group_exit+0x3a/0xa0
>> [Thu Oct 17 16:43:37 2019]  __x64_sys_exit_group+0x14/0x20
>> [Thu Oct 17 16:43:37 2019]  do_syscall_64+0x4f/0x100
>> [Thu Oct 17 16:43:37 2019]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
>>
>> Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com>
>> Signed-off-by: Philip Yang <Philip.Yang@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdkfd/kfd_chardev.c            |  6 +++---
>>    drivers/gpu/drm/amd/amdkfd/kfd_device.c             |  5 -----
>>    .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c   | 13 ++++++++++---
>>    .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.h   |  1 +
>>    4 files changed, 14 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> index d9e36dbf13d5..40d75c39f08e 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> @@ -120,13 +120,13 @@ static int kfd_open(struct inode *inode, struct file *filep)
>>    		return -EPERM;
>>    	}
>>    
>> +	if (kfd_is_locked())
>> +		return -EAGAIN;
>> +
>>    	process = kfd_create_process(filep);
>>    	if (IS_ERR(process))
>>    		return PTR_ERR(process);
>>    
>> -	if (kfd_is_locked())
>> -		return -EAGAIN;
>> -
>>    	dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n",
>>    		process->pasid, process->is_32bit_user_mode);
>>    
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> index 8f4b24e84964..4fa8834ce7cb 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> @@ -730,9 +730,6 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
>>    		return 0;
>>    	kgd2kfd_suspend(kfd);
>>    
>> -	/* hold dqm->lock to prevent further execution*/
>> -	dqm_lock(kfd->dqm);
>> -
>>    	kfd_signal_reset_event(kfd);
>>    	return 0;
>>    }
>> @@ -750,8 +747,6 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
>>    	if (!kfd->init_complete)
>>    		return 0;
>>    
>> -	dqm_unlock(kfd->dqm);
>> -
>>    	ret = kfd_resume(kfd);
>>    	if (ret)
>>    		return ret;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> index 81fb545cf42c..04a40fabe9d7 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> @@ -915,7 +915,8 @@ static int start_nocpsch(struct device_queue_manager *dqm)
>>    	
>>    	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>>    		return pm_init(&dqm->packets, dqm);
>> -	
>> +	dqm->device_stopped = false;
>> +
>>    	return 0;
>>    }
>>    
>> @@ -923,7 +924,8 @@ static int stop_nocpsch(struct device_queue_manager *dqm)
>>    {
>>    	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>>    		pm_uninit(&dqm->packets);
>> -	
>> +	dqm->device_stopped = true;
>> +
>>    	return 0;
>>    }
>>    
>> @@ -1074,6 +1076,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
>>    	dqm_lock(dqm);
>>    	/* clear hang status when driver try to start the hw scheduler */
>>    	dqm->is_hws_hang = false;
>> +	dqm->device_stopped = false;
>>    	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
>>    	dqm_unlock(dqm);
>>    
>> @@ -1089,6 +1092,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)
>>    {
>>    	dqm_lock(dqm);
>>    	unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>> +	dqm->device_stopped = true;
>>    	dqm_unlock(dqm);
>>    
>>    	kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
>> @@ -1275,9 +1279,10 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
>>    {
>>    	int retval;
>>    
>> +	if (dqm->device_stopped)
>> +		return 0;
>>    	if (dqm->queue_count <= 0 || dqm->processes_count <= 0)
>>    		return 0;
>> -
>>    	if (dqm->active_runlist)
>>    		return 0;
>>    
>> @@ -1299,6 +1304,8 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
>>    {
>>    	int retval = 0;
>>    
>> +	if (dqm->device_stopped)
>> +		return 0;
>>    	if (dqm->is_hws_hang)
>>    		return -EIO;
>>    	if (!dqm->active_runlist)
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>> index 2eaea6b04cbe..44ecdf999ca8 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>> @@ -201,6 +201,7 @@ struct device_queue_manager {
>>    	bool			is_hws_hang;
>>    	struct work_struct	hw_exception_work;
>>    	struct kfd_mem_obj	hiq_sdma_mqd;
>> +	bool			device_stopped;
>>    };
>>    
>>    void device_queue_manager_init_cik(
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2019-10-22 15:57 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-10-21 21:04 [PATCH] drm/amdkfd: don't use dqm lock during device reset/suspend/resume Yang, Philip
     [not found] ` <20191021210436.10571-1-Philip.Yang-5C7GfCeVMHo@public.gmane.org>
2019-10-21 23:34   ` Zeng, Oak
2019-10-22  1:03   ` Kuehling, Felix
     [not found]     ` <fbdfee2b-e9f9-598c-14f3-719429b9f374-5C7GfCeVMHo@public.gmane.org>
2019-10-22  2:02       ` Zeng, Oak
     [not found]         ` <BL0PR12MB2580AD0DAF7CCB42A29F82E980680-b4cIHhjg/p/XzH18dTCKOgdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2019-10-22 13:46           ` Kuehling, Felix
2019-10-22 15:57       ` Yang, Philip

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.