All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] drm/amdkfd: don't use dqm lock during device reset/suspend/resume
@ 2019-10-22 18:28 Yang, Philip
       [not found] ` <20191022182804.23845-1-Philip.Yang-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Yang, Philip @ 2019-10-22 18:28 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Yang, Philip

If device reset/suspend/resume failed for some reason, dqm lock is
hold forever and this causes deadlock. Below is a kernel backtrace when
application open kfd after suspend/resume failed.

Instead of holding dqm lock in pre_reset and releasing dqm lock in
post_reset, add dqm->device_stopped flag which is modified in
dqm->ops.start and dqm->ops.stop. The flag doesn't need lock protection
because write/read are all inside dqm lock.

For HWS case, map_queues_cpsch and unmap_queues_cpsch checks
device_stopped flag before sending the updated runlist.

v2: For no-HWS case, when device is stopped, don't call
load/destroy_mqd for eviction, restore and create queue, and avoid
debugfs dump hdqs.

Backtrace of dqm lock deadlock:

[Thu Oct 17 16:43:37 2019] INFO: task rocminfo:3024 blocked for more
than 120 seconds.
[Thu Oct 17 16:43:37 2019]       Not tainted
5.0.0-rc1-kfd-compute-rocm-dkms-no-npi-1131 #1
[Thu Oct 17 16:43:37 2019] "echo 0 >
/proc/sys/kernel/hung_task_timeout_secs" disables this message.
[Thu Oct 17 16:43:37 2019] rocminfo        D    0  3024   2947
0x80000000
[Thu Oct 17 16:43:37 2019] Call Trace:
[Thu Oct 17 16:43:37 2019]  ? __schedule+0x3d9/0x8a0
[Thu Oct 17 16:43:37 2019]  schedule+0x32/0x70
[Thu Oct 17 16:43:37 2019]  schedule_preempt_disabled+0xa/0x10
[Thu Oct 17 16:43:37 2019]  __mutex_lock.isra.9+0x1e3/0x4e0
[Thu Oct 17 16:43:37 2019]  ? __call_srcu+0x264/0x3b0
[Thu Oct 17 16:43:37 2019]  ? process_termination_cpsch+0x24/0x2f0
[amdgpu]
[Thu Oct 17 16:43:37 2019]  process_termination_cpsch+0x24/0x2f0
[amdgpu]
[Thu Oct 17 16:43:37 2019]
kfd_process_dequeue_from_all_devices+0x42/0x60 [amdgpu]
[Thu Oct 17 16:43:37 2019]  kfd_process_notifier_release+0x1be/0x220
[amdgpu]
[Thu Oct 17 16:43:37 2019]  __mmu_notifier_release+0x3e/0xc0
[Thu Oct 17 16:43:37 2019]  exit_mmap+0x160/0x1a0
[Thu Oct 17 16:43:37 2019]  ? __handle_mm_fault+0xba3/0x1200
[Thu Oct 17 16:43:37 2019]  ? exit_robust_list+0x5a/0x110
[Thu Oct 17 16:43:37 2019]  mmput+0x4a/0x120
[Thu Oct 17 16:43:37 2019]  do_exit+0x284/0xb20
[Thu Oct 17 16:43:37 2019]  ? handle_mm_fault+0xfa/0x200
[Thu Oct 17 16:43:37 2019]  do_group_exit+0x3a/0xa0
[Thu Oct 17 16:43:37 2019]  __x64_sys_exit_group+0x14/0x20
[Thu Oct 17 16:43:37 2019]  do_syscall_64+0x4f/0x100
[Thu Oct 17 16:43:37 2019]  entry_SYSCALL_64_after_hwframe+0x44/0xa9

Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Philip Yang <Philip.Yang@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  6 +--
 drivers/gpu/drm/amd/amdkfd/kfd_device.c       |  5 --
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 47 +++++++++++++++++--
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |  1 +
 4 files changed, 46 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index d9e36dbf13d5..40d75c39f08e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -120,13 +120,13 @@ static int kfd_open(struct inode *inode, struct file *filep)
 		return -EPERM;
 	}
 
+	if (kfd_is_locked())
+		return -EAGAIN;
+
 	process = kfd_create_process(filep);
 	if (IS_ERR(process))
 		return PTR_ERR(process);
 
-	if (kfd_is_locked())
-		return -EAGAIN;
-
 	dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n",
 		process->pasid, process->is_32bit_user_mode);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 8f4b24e84964..4fa8834ce7cb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -730,9 +730,6 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
 		return 0;
 	kgd2kfd_suspend(kfd);
 
-	/* hold dqm->lock to prevent further execution*/
-	dqm_lock(kfd->dqm);
-
 	kfd_signal_reset_event(kfd);
 	return 0;
 }
@@ -750,8 +747,6 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
 	if (!kfd->init_complete)
 		return 0;
 
-	dqm_unlock(kfd->dqm);
-
 	ret = kfd_resume(kfd);
 	if (ret)
 		return ret;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 81fb545cf42c..82e1c6280d13 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -340,6 +340,10 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
 	mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj,
 				&q->gart_mqd_addr, &q->properties);
 	if (q->properties.is_active) {
+		if (!dqm->sched_running) {
+			WARN_ONCE(1, "Load non-HWS mqd while stopped\n");
+			goto add_queue_to_list;
+		}
 
 		if (WARN(q->process->mm != current->mm,
 					"should only run in user thread"))
@@ -351,6 +355,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
 			goto out_free_mqd;
 	}
 
+add_queue_to_list:
 	list_add(&q->list, &qpd->queues_list);
 	qpd->queue_count++;
 	if (q->properties.is_active)
@@ -458,6 +463,11 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
 
 	deallocate_doorbell(qpd, q);
 
+	if (!dqm->sched_running) {
+		WARN_ONCE(1, "Destroy non-HWS queue while stopped\n");
+		return 0;
+	}
+
 	retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
 				KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
 				KFD_UNMAP_LATENCY_MS,
@@ -533,6 +543,12 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
 		   (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
 		    q->properties.type == KFD_QUEUE_TYPE_SDMA ||
 		    q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
+
+		if (!dqm->sched_running) {
+			WARN_ONCE(1, "Update non-HWS queue while stopped\n");
+			goto out_unlock;
+		}
+
 		retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
 				KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
 				KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
@@ -602,6 +618,11 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
 		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
 				q->properties.type)];
 		q->properties.is_active = false;
+		dqm->queue_count--;
+
+		if (!dqm->sched_running)
+			continue;
+
 		retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
 				KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
 				KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
@@ -610,7 +631,6 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
 			 * maintain a consistent eviction state
 			 */
 			ret = retval;
-		dqm->queue_count--;
 	}
 
 out:
@@ -711,6 +731,11 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
 		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
 				q->properties.type)];
 		q->properties.is_active = true;
+		dqm->queue_count++;
+
+		if (!dqm->sched_running)
+			continue;
+
 		retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe,
 				       q->queue, &q->properties, mm);
 		if (retval && !ret)
@@ -718,7 +743,6 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
 			 * maintain a consistent eviction state
 			 */
 			ret = retval;
-		dqm->queue_count++;
 	}
 	qpd->evicted = 0;
 out:
@@ -915,7 +939,8 @@ static int start_nocpsch(struct device_queue_manager *dqm)
 	
 	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
 		return pm_init(&dqm->packets, dqm);
-	
+	dqm->sched_running = true;
+
 	return 0;
 }
 
@@ -923,7 +948,8 @@ static int stop_nocpsch(struct device_queue_manager *dqm)
 {
 	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
 		pm_uninit(&dqm->packets);
-	
+	dqm->sched_running = false;
+
 	return 0;
 }
 
@@ -1074,6 +1100,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
 	dqm_lock(dqm);
 	/* clear hang status when driver try to start the hw scheduler */
 	dqm->is_hws_hang = false;
+	dqm->sched_running = true;
 	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
 	dqm_unlock(dqm);
 
@@ -1089,6 +1116,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)
 {
 	dqm_lock(dqm);
 	unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
+	dqm->sched_running = false;
 	dqm_unlock(dqm);
 
 	kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
@@ -1275,9 +1303,10 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
 {
 	int retval;
 
+	if (!dqm->sched_running)
+		return 0;
 	if (dqm->queue_count <= 0 || dqm->processes_count <= 0)
 		return 0;
-
 	if (dqm->active_runlist)
 		return 0;
 
@@ -1299,6 +1328,8 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 {
 	int retval = 0;
 
+	if (!dqm->sched_running)
+		return 0;
 	if (dqm->is_hws_hang)
 		return -EIO;
 	if (!dqm->active_runlist)
@@ -1903,6 +1934,12 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data)
 	int pipe, queue;
 	int r = 0;
 
+	if (!dqm->sched_running) {
+		seq_printf(m, " Device is stopped\n");
+
+		return 0;
+	}
+
 	r = dqm->dev->kfd2kgd->hqd_dump(dqm->dev->kgd,
 					KFD_CIK_HIQ_PIPE, KFD_CIK_HIQ_QUEUE,
 					&dump, &n_regs);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 2eaea6b04cbe..a8c37e6da027 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -201,6 +201,7 @@ struct device_queue_manager {
 	bool			is_hws_hang;
 	struct work_struct	hw_exception_work;
 	struct kfd_mem_obj	hiq_sdma_mqd;
+	bool			sched_running;
 };
 
 void device_queue_manager_init_cik(
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH v2] drm/amdkfd: don't use dqm lock during device reset/suspend/resume
       [not found] ` <20191022182804.23845-1-Philip.Yang-5C7GfCeVMHo@public.gmane.org>
@ 2019-10-22 18:38   ` Grodzovsky, Andrey
       [not found]     ` <83bfc853-cb0e-4285-d310-232683130751-5C7GfCeVMHo@public.gmane.org>
  2019-10-22 18:44   ` Kuehling, Felix
  1 sibling, 1 reply; 9+ messages in thread
From: Grodzovsky, Andrey @ 2019-10-22 18:38 UTC (permalink / raw)
  To: Yang, Philip, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On 10/22/19 2:28 PM, Yang, Philip wrote:
> If device reset/suspend/resume failed for some reason, dqm lock is
> hold forever and this causes deadlock. Below is a kernel backtrace when
> application open kfd after suspend/resume failed.
>
> Instead of holding dqm lock in pre_reset and releasing dqm lock in
> post_reset, add dqm->device_stopped flag which is modified in
> dqm->ops.start and dqm->ops.stop. The flag doesn't need lock protection
> because write/read are all inside dqm lock.
>
> For HWS case, map_queues_cpsch and unmap_queues_cpsch checks
> device_stopped flag before sending the updated runlist.


Is there a chance of race condition here where dqm->device_stopped 
returns true for some operation (e.g.map_queues_cpsch) but just as it 
proceeds GPU reset starts  ?

Andrey


>
> v2: For no-HWS case, when device is stopped, don't call
> load/destroy_mqd for eviction, restore and create queue, and avoid
> debugfs dump hdqs.
>
> Backtrace of dqm lock deadlock:
>
> [Thu Oct 17 16:43:37 2019] INFO: task rocminfo:3024 blocked for more
> than 120 seconds.
> [Thu Oct 17 16:43:37 2019]       Not tainted
> 5.0.0-rc1-kfd-compute-rocm-dkms-no-npi-1131 #1
> [Thu Oct 17 16:43:37 2019] "echo 0 >
> /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [Thu Oct 17 16:43:37 2019] rocminfo        D    0  3024   2947
> 0x80000000
> [Thu Oct 17 16:43:37 2019] Call Trace:
> [Thu Oct 17 16:43:37 2019]  ? __schedule+0x3d9/0x8a0
> [Thu Oct 17 16:43:37 2019]  schedule+0x32/0x70
> [Thu Oct 17 16:43:37 2019]  schedule_preempt_disabled+0xa/0x10
> [Thu Oct 17 16:43:37 2019]  __mutex_lock.isra.9+0x1e3/0x4e0
> [Thu Oct 17 16:43:37 2019]  ? __call_srcu+0x264/0x3b0
> [Thu Oct 17 16:43:37 2019]  ? process_termination_cpsch+0x24/0x2f0
> [amdgpu]
> [Thu Oct 17 16:43:37 2019]  process_termination_cpsch+0x24/0x2f0
> [amdgpu]
> [Thu Oct 17 16:43:37 2019]
> kfd_process_dequeue_from_all_devices+0x42/0x60 [amdgpu]
> [Thu Oct 17 16:43:37 2019]  kfd_process_notifier_release+0x1be/0x220
> [amdgpu]
> [Thu Oct 17 16:43:37 2019]  __mmu_notifier_release+0x3e/0xc0
> [Thu Oct 17 16:43:37 2019]  exit_mmap+0x160/0x1a0
> [Thu Oct 17 16:43:37 2019]  ? __handle_mm_fault+0xba3/0x1200
> [Thu Oct 17 16:43:37 2019]  ? exit_robust_list+0x5a/0x110
> [Thu Oct 17 16:43:37 2019]  mmput+0x4a/0x120
> [Thu Oct 17 16:43:37 2019]  do_exit+0x284/0xb20
> [Thu Oct 17 16:43:37 2019]  ? handle_mm_fault+0xfa/0x200
> [Thu Oct 17 16:43:37 2019]  do_group_exit+0x3a/0xa0
> [Thu Oct 17 16:43:37 2019]  __x64_sys_exit_group+0x14/0x20
> [Thu Oct 17 16:43:37 2019]  do_syscall_64+0x4f/0x100
> [Thu Oct 17 16:43:37 2019]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
>
> Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com>
> Signed-off-by: Philip Yang <Philip.Yang@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  6 +--
>   drivers/gpu/drm/amd/amdkfd/kfd_device.c       |  5 --
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 47 +++++++++++++++++--
>   .../drm/amd/amdkfd/kfd_device_queue_manager.h |  1 +
>   4 files changed, 46 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index d9e36dbf13d5..40d75c39f08e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -120,13 +120,13 @@ static int kfd_open(struct inode *inode, struct file *filep)
>   		return -EPERM;
>   	}
>   
> +	if (kfd_is_locked())
> +		return -EAGAIN;
> +
>   	process = kfd_create_process(filep);
>   	if (IS_ERR(process))
>   		return PTR_ERR(process);
>   
> -	if (kfd_is_locked())
> -		return -EAGAIN;
> -
>   	dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n",
>   		process->pasid, process->is_32bit_user_mode);
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 8f4b24e84964..4fa8834ce7cb 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -730,9 +730,6 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
>   		return 0;
>   	kgd2kfd_suspend(kfd);
>   
> -	/* hold dqm->lock to prevent further execution*/
> -	dqm_lock(kfd->dqm);
> -
>   	kfd_signal_reset_event(kfd);
>   	return 0;
>   }
> @@ -750,8 +747,6 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
>   	if (!kfd->init_complete)
>   		return 0;
>   
> -	dqm_unlock(kfd->dqm);
> -
>   	ret = kfd_resume(kfd);
>   	if (ret)
>   		return ret;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 81fb545cf42c..82e1c6280d13 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -340,6 +340,10 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>   	mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj,
>   				&q->gart_mqd_addr, &q->properties);
>   	if (q->properties.is_active) {
> +		if (!dqm->sched_running) {
> +			WARN_ONCE(1, "Load non-HWS mqd while stopped\n");
> +			goto add_queue_to_list;
> +		}
>   
>   		if (WARN(q->process->mm != current->mm,
>   					"should only run in user thread"))
> @@ -351,6 +355,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>   			goto out_free_mqd;
>   	}
>   
> +add_queue_to_list:
>   	list_add(&q->list, &qpd->queues_list);
>   	qpd->queue_count++;
>   	if (q->properties.is_active)
> @@ -458,6 +463,11 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
>   
>   	deallocate_doorbell(qpd, q);
>   
> +	if (!dqm->sched_running) {
> +		WARN_ONCE(1, "Destroy non-HWS queue while stopped\n");
> +		return 0;
> +	}
> +
>   	retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
>   				KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
>   				KFD_UNMAP_LATENCY_MS,
> @@ -533,6 +543,12 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
>   		   (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
>   		    q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>   		    q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
> +
> +		if (!dqm->sched_running) {
> +			WARN_ONCE(1, "Update non-HWS queue while stopped\n");
> +			goto out_unlock;
> +		}
> +
>   		retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
>   				KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
>   				KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
> @@ -602,6 +618,11 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
>   		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
>   				q->properties.type)];
>   		q->properties.is_active = false;
> +		dqm->queue_count--;
> +
> +		if (!dqm->sched_running)
> +			continue;
> +
>   		retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
>   				KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
>   				KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
> @@ -610,7 +631,6 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
>   			 * maintain a consistent eviction state
>   			 */
>   			ret = retval;
> -		dqm->queue_count--;
>   	}
>   
>   out:
> @@ -711,6 +731,11 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
>   		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
>   				q->properties.type)];
>   		q->properties.is_active = true;
> +		dqm->queue_count++;
> +
> +		if (!dqm->sched_running)
> +			continue;
> +
>   		retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe,
>   				       q->queue, &q->properties, mm);
>   		if (retval && !ret)
> @@ -718,7 +743,6 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
>   			 * maintain a consistent eviction state
>   			 */
>   			ret = retval;
> -		dqm->queue_count++;
>   	}
>   	qpd->evicted = 0;
>   out:
> @@ -915,7 +939,8 @@ static int start_nocpsch(struct device_queue_manager *dqm)
>   	
>   	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>   		return pm_init(&dqm->packets, dqm);
> -	
> +	dqm->sched_running = true;
> +
>   	return 0;
>   }
>   
> @@ -923,7 +948,8 @@ static int stop_nocpsch(struct device_queue_manager *dqm)
>   {
>   	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>   		pm_uninit(&dqm->packets);
> -	
> +	dqm->sched_running = false;
> +
>   	return 0;
>   }
>   
> @@ -1074,6 +1100,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
>   	dqm_lock(dqm);
>   	/* clear hang status when driver try to start the hw scheduler */
>   	dqm->is_hws_hang = false;
> +	dqm->sched_running = true;
>   	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
>   	dqm_unlock(dqm);
>   
> @@ -1089,6 +1116,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)
>   {
>   	dqm_lock(dqm);
>   	unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
> +	dqm->sched_running = false;
>   	dqm_unlock(dqm);
>   
>   	kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
> @@ -1275,9 +1303,10 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
>   {
>   	int retval;
>   
> +	if (!dqm->sched_running)
> +		return 0;
>   	if (dqm->queue_count <= 0 || dqm->processes_count <= 0)
>   		return 0;
> -
>   	if (dqm->active_runlist)
>   		return 0;
>   
> @@ -1299,6 +1328,8 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
>   {
>   	int retval = 0;
>   
> +	if (!dqm->sched_running)
> +		return 0;
>   	if (dqm->is_hws_hang)
>   		return -EIO;
>   	if (!dqm->active_runlist)
> @@ -1903,6 +1934,12 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data)
>   	int pipe, queue;
>   	int r = 0;
>   
> +	if (!dqm->sched_running) {
> +		seq_printf(m, " Device is stopped\n");
> +
> +		return 0;
> +	}
> +
>   	r = dqm->dev->kfd2kgd->hqd_dump(dqm->dev->kgd,
>   					KFD_CIK_HIQ_PIPE, KFD_CIK_HIQ_QUEUE,
>   					&dump, &n_regs);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> index 2eaea6b04cbe..a8c37e6da027 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> @@ -201,6 +201,7 @@ struct device_queue_manager {
>   	bool			is_hws_hang;
>   	struct work_struct	hw_exception_work;
>   	struct kfd_mem_obj	hiq_sdma_mqd;
> +	bool			sched_running;
>   };
>   
>   void device_queue_manager_init_cik(
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2] drm/amdkfd: don't use dqm lock during device reset/suspend/resume
       [not found]     ` <83bfc853-cb0e-4285-d310-232683130751-5C7GfCeVMHo@public.gmane.org>
@ 2019-10-22 18:40       ` Grodzovsky, Andrey
       [not found]         ` <b7c3ad13-0b36-5fbb-5079-4a72a8dd853a-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Grodzovsky, Andrey @ 2019-10-22 18:40 UTC (permalink / raw)
  To: Yang, Philip, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


On 10/22/19 2:38 PM, Grodzovsky, Andrey wrote:
> On 10/22/19 2:28 PM, Yang, Philip wrote:
>> If device reset/suspend/resume failed for some reason, dqm lock is
>> hold forever and this causes deadlock. Below is a kernel backtrace when
>> application open kfd after suspend/resume failed.
>>
>> Instead of holding dqm lock in pre_reset and releasing dqm lock in
>> post_reset, add dqm->device_stopped flag which is modified in
>> dqm->ops.start and dqm->ops.stop. The flag doesn't need lock protection
>> because write/read are all inside dqm lock.
>>
>> For HWS case, map_queues_cpsch and unmap_queues_cpsch checks
>> device_stopped flag before sending the updated runlist.
>
> Is there a chance of race condition here where dqm->device_stopped
> returns true for some operation (e.g.map_queues_cpsch) but just as it
> proceeds GPU reset starts  ?
>
> Andrey


Correction -

dqm->device_stopped returns FALSE

Andrey

>
>
>> v2: For no-HWS case, when device is stopped, don't call
>> load/destroy_mqd for eviction, restore and create queue, and avoid
>> debugfs dump hdqs.
>>
>> Backtrace of dqm lock deadlock:
>>
>> [Thu Oct 17 16:43:37 2019] INFO: task rocminfo:3024 blocked for more
>> than 120 seconds.
>> [Thu Oct 17 16:43:37 2019]       Not tainted
>> 5.0.0-rc1-kfd-compute-rocm-dkms-no-npi-1131 #1
>> [Thu Oct 17 16:43:37 2019] "echo 0 >
>> /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>> [Thu Oct 17 16:43:37 2019] rocminfo        D    0  3024   2947
>> 0x80000000
>> [Thu Oct 17 16:43:37 2019] Call Trace:
>> [Thu Oct 17 16:43:37 2019]  ? __schedule+0x3d9/0x8a0
>> [Thu Oct 17 16:43:37 2019]  schedule+0x32/0x70
>> [Thu Oct 17 16:43:37 2019]  schedule_preempt_disabled+0xa/0x10
>> [Thu Oct 17 16:43:37 2019]  __mutex_lock.isra.9+0x1e3/0x4e0
>> [Thu Oct 17 16:43:37 2019]  ? __call_srcu+0x264/0x3b0
>> [Thu Oct 17 16:43:37 2019]  ? process_termination_cpsch+0x24/0x2f0
>> [amdgpu]
>> [Thu Oct 17 16:43:37 2019]  process_termination_cpsch+0x24/0x2f0
>> [amdgpu]
>> [Thu Oct 17 16:43:37 2019]
>> kfd_process_dequeue_from_all_devices+0x42/0x60 [amdgpu]
>> [Thu Oct 17 16:43:37 2019]  kfd_process_notifier_release+0x1be/0x220
>> [amdgpu]
>> [Thu Oct 17 16:43:37 2019]  __mmu_notifier_release+0x3e/0xc0
>> [Thu Oct 17 16:43:37 2019]  exit_mmap+0x160/0x1a0
>> [Thu Oct 17 16:43:37 2019]  ? __handle_mm_fault+0xba3/0x1200
>> [Thu Oct 17 16:43:37 2019]  ? exit_robust_list+0x5a/0x110
>> [Thu Oct 17 16:43:37 2019]  mmput+0x4a/0x120
>> [Thu Oct 17 16:43:37 2019]  do_exit+0x284/0xb20
>> [Thu Oct 17 16:43:37 2019]  ? handle_mm_fault+0xfa/0x200
>> [Thu Oct 17 16:43:37 2019]  do_group_exit+0x3a/0xa0
>> [Thu Oct 17 16:43:37 2019]  __x64_sys_exit_group+0x14/0x20
>> [Thu Oct 17 16:43:37 2019]  do_syscall_64+0x4f/0x100
>> [Thu Oct 17 16:43:37 2019]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
>>
>> Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com>
>> Signed-off-by: Philip Yang <Philip.Yang@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  6 +--
>>    drivers/gpu/drm/amd/amdkfd/kfd_device.c       |  5 --
>>    .../drm/amd/amdkfd/kfd_device_queue_manager.c | 47 +++++++++++++++++--
>>    .../drm/amd/amdkfd/kfd_device_queue_manager.h |  1 +
>>    4 files changed, 46 insertions(+), 13 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> index d9e36dbf13d5..40d75c39f08e 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> @@ -120,13 +120,13 @@ static int kfd_open(struct inode *inode, struct file *filep)
>>    		return -EPERM;
>>    	}
>>    
>> +	if (kfd_is_locked())
>> +		return -EAGAIN;
>> +
>>    	process = kfd_create_process(filep);
>>    	if (IS_ERR(process))
>>    		return PTR_ERR(process);
>>    
>> -	if (kfd_is_locked())
>> -		return -EAGAIN;
>> -
>>    	dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n",
>>    		process->pasid, process->is_32bit_user_mode);
>>    
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> index 8f4b24e84964..4fa8834ce7cb 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> @@ -730,9 +730,6 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
>>    		return 0;
>>    	kgd2kfd_suspend(kfd);
>>    
>> -	/* hold dqm->lock to prevent further execution*/
>> -	dqm_lock(kfd->dqm);
>> -
>>    	kfd_signal_reset_event(kfd);
>>    	return 0;
>>    }
>> @@ -750,8 +747,6 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
>>    	if (!kfd->init_complete)
>>    		return 0;
>>    
>> -	dqm_unlock(kfd->dqm);
>> -
>>    	ret = kfd_resume(kfd);
>>    	if (ret)
>>    		return ret;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> index 81fb545cf42c..82e1c6280d13 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> @@ -340,6 +340,10 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>>    	mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj,
>>    				&q->gart_mqd_addr, &q->properties);
>>    	if (q->properties.is_active) {
>> +		if (!dqm->sched_running) {
>> +			WARN_ONCE(1, "Load non-HWS mqd while stopped\n");
>> +			goto add_queue_to_list;
>> +		}
>>    
>>    		if (WARN(q->process->mm != current->mm,
>>    					"should only run in user thread"))
>> @@ -351,6 +355,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>>    			goto out_free_mqd;
>>    	}
>>    
>> +add_queue_to_list:
>>    	list_add(&q->list, &qpd->queues_list);
>>    	qpd->queue_count++;
>>    	if (q->properties.is_active)
>> @@ -458,6 +463,11 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
>>    
>>    	deallocate_doorbell(qpd, q);
>>    
>> +	if (!dqm->sched_running) {
>> +		WARN_ONCE(1, "Destroy non-HWS queue while stopped\n");
>> +		return 0;
>> +	}
>> +
>>    	retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
>>    				KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
>>    				KFD_UNMAP_LATENCY_MS,
>> @@ -533,6 +543,12 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
>>    		   (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
>>    		    q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>>    		    q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
>> +
>> +		if (!dqm->sched_running) {
>> +			WARN_ONCE(1, "Update non-HWS queue while stopped\n");
>> +			goto out_unlock;
>> +		}
>> +
>>    		retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
>>    				KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
>>    				KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
>> @@ -602,6 +618,11 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
>>    		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
>>    				q->properties.type)];
>>    		q->properties.is_active = false;
>> +		dqm->queue_count--;
>> +
>> +		if (!dqm->sched_running)
>> +			continue;
>> +
>>    		retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
>>    				KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
>>    				KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
>> @@ -610,7 +631,6 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
>>    			 * maintain a consistent eviction state
>>    			 */
>>    			ret = retval;
>> -		dqm->queue_count--;
>>    	}
>>    
>>    out:
>> @@ -711,6 +731,11 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
>>    		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
>>    				q->properties.type)];
>>    		q->properties.is_active = true;
>> +		dqm->queue_count++;
>> +
>> +		if (!dqm->sched_running)
>> +			continue;
>> +
>>    		retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe,
>>    				       q->queue, &q->properties, mm);
>>    		if (retval && !ret)
>> @@ -718,7 +743,6 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
>>    			 * maintain a consistent eviction state
>>    			 */
>>    			ret = retval;
>> -		dqm->queue_count++;
>>    	}
>>    	qpd->evicted = 0;
>>    out:
>> @@ -915,7 +939,8 @@ static int start_nocpsch(struct device_queue_manager *dqm)
>>    	
>>    	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>>    		return pm_init(&dqm->packets, dqm);
>> -	
>> +	dqm->sched_running = true;
>> +
>>    	return 0;
>>    }
>>    
>> @@ -923,7 +948,8 @@ static int stop_nocpsch(struct device_queue_manager *dqm)
>>    {
>>    	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>>    		pm_uninit(&dqm->packets);
>> -	
>> +	dqm->sched_running = false;
>> +
>>    	return 0;
>>    }
>>    
>> @@ -1074,6 +1100,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
>>    	dqm_lock(dqm);
>>    	/* clear hang status when driver try to start the hw scheduler */
>>    	dqm->is_hws_hang = false;
>> +	dqm->sched_running = true;
>>    	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
>>    	dqm_unlock(dqm);
>>    
>> @@ -1089,6 +1116,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)
>>    {
>>    	dqm_lock(dqm);
>>    	unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>> +	dqm->sched_running = false;
>>    	dqm_unlock(dqm);
>>    
>>    	kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
>> @@ -1275,9 +1303,10 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
>>    {
>>    	int retval;
>>    
>> +	if (!dqm->sched_running)
>> +		return 0;
>>    	if (dqm->queue_count <= 0 || dqm->processes_count <= 0)
>>    		return 0;
>> -
>>    	if (dqm->active_runlist)
>>    		return 0;
>>    
>> @@ -1299,6 +1328,8 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
>>    {
>>    	int retval = 0;
>>    
>> +	if (!dqm->sched_running)
>> +		return 0;
>>    	if (dqm->is_hws_hang)
>>    		return -EIO;
>>    	if (!dqm->active_runlist)
>> @@ -1903,6 +1934,12 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data)
>>    	int pipe, queue;
>>    	int r = 0;
>>    
>> +	if (!dqm->sched_running) {
>> +		seq_printf(m, " Device is stopped\n");
>> +
>> +		return 0;
>> +	}
>> +
>>    	r = dqm->dev->kfd2kgd->hqd_dump(dqm->dev->kgd,
>>    					KFD_CIK_HIQ_PIPE, KFD_CIK_HIQ_QUEUE,
>>    					&dump, &n_regs);
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>> index 2eaea6b04cbe..a8c37e6da027 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>> @@ -201,6 +201,7 @@ struct device_queue_manager {
>>    	bool			is_hws_hang;
>>    	struct work_struct	hw_exception_work;
>>    	struct kfd_mem_obj	hiq_sdma_mqd;
>> +	bool			sched_running;
>>    };
>>    
>>    void device_queue_manager_init_cik(
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2] drm/amdkfd: don't use dqm lock during device reset/suspend/resume
       [not found] ` <20191022182804.23845-1-Philip.Yang-5C7GfCeVMHo@public.gmane.org>
  2019-10-22 18:38   ` Grodzovsky, Andrey
@ 2019-10-22 18:44   ` Kuehling, Felix
       [not found]     ` <558b2dd3-8095-81e0-547b-788b74c9b329-5C7GfCeVMHo@public.gmane.org>
  1 sibling, 1 reply; 9+ messages in thread
From: Kuehling, Felix @ 2019-10-22 18:44 UTC (permalink / raw)
  To: Yang, Philip, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On 2019-10-22 14:28, Yang, Philip wrote:
> If device reset/suspend/resume failed for some reason, dqm lock is
> hold forever and this causes deadlock. Below is a kernel backtrace when
> application open kfd after suspend/resume failed.
>
> Instead of holding dqm lock in pre_reset and releasing dqm lock in
> post_reset, add dqm->device_stopped flag which is modified in
> dqm->ops.start and dqm->ops.stop. The flag doesn't need lock protection
> because write/read are all inside dqm lock.
>
> For HWS case, map_queues_cpsch and unmap_queues_cpsch checks
> device_stopped flag before sending the updated runlist.
>
> v2: For no-HWS case, when device is stopped, don't call
> load/destroy_mqd for eviction, restore and create queue, and avoid
> debugfs dump hdqs.
>
> Backtrace of dqm lock deadlock:
>
> [Thu Oct 17 16:43:37 2019] INFO: task rocminfo:3024 blocked for more
> than 120 seconds.
> [Thu Oct 17 16:43:37 2019]       Not tainted
> 5.0.0-rc1-kfd-compute-rocm-dkms-no-npi-1131 #1
> [Thu Oct 17 16:43:37 2019] "echo 0 >
> /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [Thu Oct 17 16:43:37 2019] rocminfo        D    0  3024   2947
> 0x80000000
> [Thu Oct 17 16:43:37 2019] Call Trace:
> [Thu Oct 17 16:43:37 2019]  ? __schedule+0x3d9/0x8a0
> [Thu Oct 17 16:43:37 2019]  schedule+0x32/0x70
> [Thu Oct 17 16:43:37 2019]  schedule_preempt_disabled+0xa/0x10
> [Thu Oct 17 16:43:37 2019]  __mutex_lock.isra.9+0x1e3/0x4e0
> [Thu Oct 17 16:43:37 2019]  ? __call_srcu+0x264/0x3b0
> [Thu Oct 17 16:43:37 2019]  ? process_termination_cpsch+0x24/0x2f0
> [amdgpu]
> [Thu Oct 17 16:43:37 2019]  process_termination_cpsch+0x24/0x2f0
> [amdgpu]
> [Thu Oct 17 16:43:37 2019]
> kfd_process_dequeue_from_all_devices+0x42/0x60 [amdgpu]
> [Thu Oct 17 16:43:37 2019]  kfd_process_notifier_release+0x1be/0x220
> [amdgpu]
> [Thu Oct 17 16:43:37 2019]  __mmu_notifier_release+0x3e/0xc0
> [Thu Oct 17 16:43:37 2019]  exit_mmap+0x160/0x1a0
> [Thu Oct 17 16:43:37 2019]  ? __handle_mm_fault+0xba3/0x1200
> [Thu Oct 17 16:43:37 2019]  ? exit_robust_list+0x5a/0x110
> [Thu Oct 17 16:43:37 2019]  mmput+0x4a/0x120
> [Thu Oct 17 16:43:37 2019]  do_exit+0x284/0xb20
> [Thu Oct 17 16:43:37 2019]  ? handle_mm_fault+0xfa/0x200
> [Thu Oct 17 16:43:37 2019]  do_group_exit+0x3a/0xa0
> [Thu Oct 17 16:43:37 2019]  __x64_sys_exit_group+0x14/0x20
> [Thu Oct 17 16:43:37 2019]  do_syscall_64+0x4f/0x100
> [Thu Oct 17 16:43:37 2019]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
>
> Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com>
> Signed-off-by: Philip Yang <Philip.Yang@amd.com>

Three more comments inline. With those comments addressed, this patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  6 +--
>   drivers/gpu/drm/amd/amdkfd/kfd_device.c       |  5 --
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 47 +++++++++++++++++--
>   .../drm/amd/amdkfd/kfd_device_queue_manager.h |  1 +
>   4 files changed, 46 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index d9e36dbf13d5..40d75c39f08e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -120,13 +120,13 @@ static int kfd_open(struct inode *inode, struct file *filep)
>   		return -EPERM;
>   	}
>   
> +	if (kfd_is_locked())
> +		return -EAGAIN;
> +
>   	process = kfd_create_process(filep);
>   	if (IS_ERR(process))
>   		return PTR_ERR(process);
>   
> -	if (kfd_is_locked())
> -		return -EAGAIN;
> -

Is this part of the change still needed? I remember that this sequence 
was a bit tricky with some potential race condition when Shaoyun was 
working on it. This may have unintended side effects.


>   	dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n",
>   		process->pasid, process->is_32bit_user_mode);
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 8f4b24e84964..4fa8834ce7cb 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -730,9 +730,6 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
>   		return 0;
>   	kgd2kfd_suspend(kfd);
>   
> -	/* hold dqm->lock to prevent further execution*/
> -	dqm_lock(kfd->dqm);
> -
>   	kfd_signal_reset_event(kfd);
>   	return 0;
>   }
> @@ -750,8 +747,6 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
>   	if (!kfd->init_complete)
>   		return 0;
>   
> -	dqm_unlock(kfd->dqm);
> -
>   	ret = kfd_resume(kfd);
>   	if (ret)
>   		return ret;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 81fb545cf42c..82e1c6280d13 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -340,6 +340,10 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>   	mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj,
>   				&q->gart_mqd_addr, &q->properties);
>   	if (q->properties.is_active) {
> +		if (!dqm->sched_running) {
> +			WARN_ONCE(1, "Load non-HWS mqd while stopped\n");
> +			goto add_queue_to_list;
> +		}
>   
>   		if (WARN(q->process->mm != current->mm,
>   					"should only run in user thread"))
> @@ -351,6 +355,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>   			goto out_free_mqd;
>   	}
>   
> +add_queue_to_list:
>   	list_add(&q->list, &qpd->queues_list);
>   	qpd->queue_count++;
>   	if (q->properties.is_active)
> @@ -458,6 +463,11 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
>   
>   	deallocate_doorbell(qpd, q);
>   
> +	if (!dqm->sched_running) {
> +		WARN_ONCE(1, "Destroy non-HWS queue while stopped\n");
> +		return 0;
> +	}
> +
>   	retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
>   				KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
>   				KFD_UNMAP_LATENCY_MS,
> @@ -533,6 +543,12 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
>   		   (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
>   		    q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>   		    q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
> +
> +		if (!dqm->sched_running) {
> +			WARN_ONCE(1, "Update non-HWS queue while stopped\n");
> +			goto out_unlock;
> +		}
> +
>   		retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
>   				KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
>   				KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
> @@ -602,6 +618,11 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
>   		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
>   				q->properties.type)];
>   		q->properties.is_active = false;
> +		dqm->queue_count--;
> +
> +		if (!dqm->sched_running)
> +			continue;

Add a WARN here as well. When the scheduler is not running the queues 
should already be evicted.


> +
>   		retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
>   				KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
>   				KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
> @@ -610,7 +631,6 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
>   			 * maintain a consistent eviction state
>   			 */
>   			ret = retval;
> -		dqm->queue_count--;
>   	}
>   
>   out:
> @@ -711,6 +731,11 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
>   		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
>   				q->properties.type)];
>   		q->properties.is_active = true;
> +		dqm->queue_count++;
> +
> +		if (!dqm->sched_running)
> +			continue;

Add a WARN here as well. This should not happen while we're suspended.

Regards,
   Felix

> +
>   		retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe,
>   				       q->queue, &q->properties, mm);
>   		if (retval && !ret)
> @@ -718,7 +743,6 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
>   			 * maintain a consistent eviction state
>   			 */
>   			ret = retval;
> -		dqm->queue_count++;
>   	}
>   	qpd->evicted = 0;
>   out:
> @@ -915,7 +939,8 @@ static int start_nocpsch(struct device_queue_manager *dqm)
>   	
>   	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>   		return pm_init(&dqm->packets, dqm);
> -	
> +	dqm->sched_running = true;
> +
>   	return 0;
>   }
>   
> @@ -923,7 +948,8 @@ static int stop_nocpsch(struct device_queue_manager *dqm)
>   {
>   	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>   		pm_uninit(&dqm->packets);
> -	
> +	dqm->sched_running = false;
> +
>   	return 0;
>   }
>   
> @@ -1074,6 +1100,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
>   	dqm_lock(dqm);
>   	/* clear hang status when driver try to start the hw scheduler */
>   	dqm->is_hws_hang = false;
> +	dqm->sched_running = true;
>   	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
>   	dqm_unlock(dqm);
>   
> @@ -1089,6 +1116,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)
>   {
>   	dqm_lock(dqm);
>   	unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
> +	dqm->sched_running = false;
>   	dqm_unlock(dqm);
>   
>   	kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
> @@ -1275,9 +1303,10 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
>   {
>   	int retval;
>   
> +	if (!dqm->sched_running)
> +		return 0;
>   	if (dqm->queue_count <= 0 || dqm->processes_count <= 0)
>   		return 0;
> -
>   	if (dqm->active_runlist)
>   		return 0;
>   
> @@ -1299,6 +1328,8 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
>   {
>   	int retval = 0;
>   
> +	if (!dqm->sched_running)
> +		return 0;
>   	if (dqm->is_hws_hang)
>   		return -EIO;
>   	if (!dqm->active_runlist)
> @@ -1903,6 +1934,12 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data)
>   	int pipe, queue;
>   	int r = 0;
>   
> +	if (!dqm->sched_running) {
> +		seq_printf(m, " Device is stopped\n");
> +
> +		return 0;
> +	}
> +
>   	r = dqm->dev->kfd2kgd->hqd_dump(dqm->dev->kgd,
>   					KFD_CIK_HIQ_PIPE, KFD_CIK_HIQ_QUEUE,
>   					&dump, &n_regs);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> index 2eaea6b04cbe..a8c37e6da027 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> @@ -201,6 +201,7 @@ struct device_queue_manager {
>   	bool			is_hws_hang;
>   	struct work_struct	hw_exception_work;
>   	struct kfd_mem_obj	hiq_sdma_mqd;
> +	bool			sched_running;
>   };
>   
>   void device_queue_manager_init_cik(
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2] drm/amdkfd: don't use dqm lock during device reset/suspend/resume
       [not found]         ` <b7c3ad13-0b36-5fbb-5079-4a72a8dd853a-5C7GfCeVMHo@public.gmane.org>
@ 2019-10-22 19:19           ` Yang, Philip
       [not found]             ` <92e6ce2b-5a2c-62ef-9c03-51c3ea28be1e-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Yang, Philip @ 2019-10-22 19:19 UTC (permalink / raw)
  To: Grodzovsky, Andrey, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW



On 2019-10-22 2:40 p.m., Grodzovsky, Andrey wrote:
> 
> On 10/22/19 2:38 PM, Grodzovsky, Andrey wrote:
>> On 10/22/19 2:28 PM, Yang, Philip wrote:
>>> If device reset/suspend/resume failed for some reason, dqm lock is
>>> hold forever and this causes deadlock. Below is a kernel backtrace when
>>> application open kfd after suspend/resume failed.
>>>
>>> Instead of holding dqm lock in pre_reset and releasing dqm lock in
>>> post_reset, add dqm->device_stopped flag which is modified in
>>> dqm->ops.start and dqm->ops.stop. The flag doesn't need lock protection
>>> because write/read are all inside dqm lock.
>>>
>>> For HWS case, map_queues_cpsch and unmap_queues_cpsch checks
>>> device_stopped flag before sending the updated runlist.
>>
>> Is there a chance of race condition here where dqm->device_stopped
>> returns true for some operation (e.g.map_queues_cpsch) but just as it
>> proceeds GPU reset starts  ?
>>
>> Andrey
> 
> 
> Correction -
> 
> dqm->device_stopped returns FALSE
> 
No race condition here, dqm->device_stopped is set to FALSE in 
kgd2kfd_post_reset -> dqm->ops.start(), which the last step of 
amdgpu_device_gpu_recover, so it's safe to do map_queue_cpsch.

Regards,
Philip

> Andrey
> 
>>
>>
>>> v2: For no-HWS case, when device is stopped, don't call
>>> load/destroy_mqd for eviction, restore and create queue, and avoid
>>> debugfs dump hdqs.
>>>
>>> Backtrace of dqm lock deadlock:
>>>
>>> [Thu Oct 17 16:43:37 2019] INFO: task rocminfo:3024 blocked for more
>>> than 120 seconds.
>>> [Thu Oct 17 16:43:37 2019]       Not tainted
>>> 5.0.0-rc1-kfd-compute-rocm-dkms-no-npi-1131 #1
>>> [Thu Oct 17 16:43:37 2019] "echo 0 >
>>> /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>>> [Thu Oct 17 16:43:37 2019] rocminfo        D    0  3024   2947
>>> 0x80000000
>>> [Thu Oct 17 16:43:37 2019] Call Trace:
>>> [Thu Oct 17 16:43:37 2019]  ? __schedule+0x3d9/0x8a0
>>> [Thu Oct 17 16:43:37 2019]  schedule+0x32/0x70
>>> [Thu Oct 17 16:43:37 2019]  schedule_preempt_disabled+0xa/0x10
>>> [Thu Oct 17 16:43:37 2019]  __mutex_lock.isra.9+0x1e3/0x4e0
>>> [Thu Oct 17 16:43:37 2019]  ? __call_srcu+0x264/0x3b0
>>> [Thu Oct 17 16:43:37 2019]  ? process_termination_cpsch+0x24/0x2f0
>>> [amdgpu]
>>> [Thu Oct 17 16:43:37 2019]  process_termination_cpsch+0x24/0x2f0
>>> [amdgpu]
>>> [Thu Oct 17 16:43:37 2019]
>>> kfd_process_dequeue_from_all_devices+0x42/0x60 [amdgpu]
>>> [Thu Oct 17 16:43:37 2019]  kfd_process_notifier_release+0x1be/0x220
>>> [amdgpu]
>>> [Thu Oct 17 16:43:37 2019]  __mmu_notifier_release+0x3e/0xc0
>>> [Thu Oct 17 16:43:37 2019]  exit_mmap+0x160/0x1a0
>>> [Thu Oct 17 16:43:37 2019]  ? __handle_mm_fault+0xba3/0x1200
>>> [Thu Oct 17 16:43:37 2019]  ? exit_robust_list+0x5a/0x110
>>> [Thu Oct 17 16:43:37 2019]  mmput+0x4a/0x120
>>> [Thu Oct 17 16:43:37 2019]  do_exit+0x284/0xb20
>>> [Thu Oct 17 16:43:37 2019]  ? handle_mm_fault+0xfa/0x200
>>> [Thu Oct 17 16:43:37 2019]  do_group_exit+0x3a/0xa0
>>> [Thu Oct 17 16:43:37 2019]  __x64_sys_exit_group+0x14/0x20
>>> [Thu Oct 17 16:43:37 2019]  do_syscall_64+0x4f/0x100
>>> [Thu Oct 17 16:43:37 2019]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
>>>
>>> Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>> Signed-off-by: Philip Yang <Philip.Yang@amd.com>
>>> ---
>>>     drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  6 +--
>>>     drivers/gpu/drm/amd/amdkfd/kfd_device.c       |  5 --
>>>     .../drm/amd/amdkfd/kfd_device_queue_manager.c | 47 +++++++++++++++++--
>>>     .../drm/amd/amdkfd/kfd_device_queue_manager.h |  1 +
>>>     4 files changed, 46 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> index d9e36dbf13d5..40d75c39f08e 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> @@ -120,13 +120,13 @@ static int kfd_open(struct inode *inode, struct file *filep)
>>>     		return -EPERM;
>>>     	}
>>>     
>>> +	if (kfd_is_locked())
>>> +		return -EAGAIN;
>>> +
>>>     	process = kfd_create_process(filep);
>>>     	if (IS_ERR(process))
>>>     		return PTR_ERR(process);
>>>     
>>> -	if (kfd_is_locked())
>>> -		return -EAGAIN;
>>> -
>>>     	dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n",
>>>     		process->pasid, process->is_32bit_user_mode);
>>>     
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> index 8f4b24e84964..4fa8834ce7cb 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> @@ -730,9 +730,6 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
>>>     		return 0;
>>>     	kgd2kfd_suspend(kfd);
>>>     
>>> -	/* hold dqm->lock to prevent further execution*/
>>> -	dqm_lock(kfd->dqm);
>>> -
>>>     	kfd_signal_reset_event(kfd);
>>>     	return 0;
>>>     }
>>> @@ -750,8 +747,6 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
>>>     	if (!kfd->init_complete)
>>>     		return 0;
>>>     
>>> -	dqm_unlock(kfd->dqm);
>>> -
>>>     	ret = kfd_resume(kfd);
>>>     	if (ret)
>>>     		return ret;
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> index 81fb545cf42c..82e1c6280d13 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> @@ -340,6 +340,10 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>>>     	mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj,
>>>     				&q->gart_mqd_addr, &q->properties);
>>>     	if (q->properties.is_active) {
>>> +		if (!dqm->sched_running) {
>>> +			WARN_ONCE(1, "Load non-HWS mqd while stopped\n");
>>> +			goto add_queue_to_list;
>>> +		}
>>>     
>>>     		if (WARN(q->process->mm != current->mm,
>>>     					"should only run in user thread"))
>>> @@ -351,6 +355,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>>>     			goto out_free_mqd;
>>>     	}
>>>     
>>> +add_queue_to_list:
>>>     	list_add(&q->list, &qpd->queues_list);
>>>     	qpd->queue_count++;
>>>     	if (q->properties.is_active)
>>> @@ -458,6 +463,11 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
>>>     
>>>     	deallocate_doorbell(qpd, q);
>>>     
>>> +	if (!dqm->sched_running) {
>>> +		WARN_ONCE(1, "Destroy non-HWS queue while stopped\n");
>>> +		return 0;
>>> +	}
>>> +
>>>     	retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
>>>     				KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
>>>     				KFD_UNMAP_LATENCY_MS,
>>> @@ -533,6 +543,12 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
>>>     		   (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
>>>     		    q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>>>     		    q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
>>> +
>>> +		if (!dqm->sched_running) {
>>> +			WARN_ONCE(1, "Update non-HWS queue while stopped\n");
>>> +			goto out_unlock;
>>> +		}
>>> +
>>>     		retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
>>>     				KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
>>>     				KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
>>> @@ -602,6 +618,11 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
>>>     		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
>>>     				q->properties.type)];
>>>     		q->properties.is_active = false;
>>> +		dqm->queue_count--;
>>> +
>>> +		if (!dqm->sched_running)
>>> +			continue;
>>> +
>>>     		retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
>>>     				KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
>>>     				KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
>>> @@ -610,7 +631,6 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
>>>     			 * maintain a consistent eviction state
>>>     			 */
>>>     			ret = retval;
>>> -		dqm->queue_count--;
>>>     	}
>>>     
>>>     out:
>>> @@ -711,6 +731,11 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
>>>     		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
>>>     				q->properties.type)];
>>>     		q->properties.is_active = true;
>>> +		dqm->queue_count++;
>>> +
>>> +		if (!dqm->sched_running)
>>> +			continue;
>>> +
>>>     		retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe,
>>>     				       q->queue, &q->properties, mm);
>>>     		if (retval && !ret)
>>> @@ -718,7 +743,6 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
>>>     			 * maintain a consistent eviction state
>>>     			 */
>>>     			ret = retval;
>>> -		dqm->queue_count++;
>>>     	}
>>>     	qpd->evicted = 0;
>>>     out:
>>> @@ -915,7 +939,8 @@ static int start_nocpsch(struct device_queue_manager *dqm)
>>>     	
>>>     	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>>>     		return pm_init(&dqm->packets, dqm);
>>> -	
>>> +	dqm->sched_running = true;
>>> +
>>>     	return 0;
>>>     }
>>>     
>>> @@ -923,7 +948,8 @@ static int stop_nocpsch(struct device_queue_manager *dqm)
>>>     {
>>>     	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>>>     		pm_uninit(&dqm->packets);
>>> -	
>>> +	dqm->sched_running = false;
>>> +
>>>     	return 0;
>>>     }
>>>     
>>> @@ -1074,6 +1100,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
>>>     	dqm_lock(dqm);
>>>     	/* clear hang status when driver try to start the hw scheduler */
>>>     	dqm->is_hws_hang = false;
>>> +	dqm->sched_running = true;
>>>     	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
>>>     	dqm_unlock(dqm);
>>>     
>>> @@ -1089,6 +1116,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)
>>>     {
>>>     	dqm_lock(dqm);
>>>     	unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>>> +	dqm->sched_running = false;
>>>     	dqm_unlock(dqm);
>>>     
>>>     	kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
>>> @@ -1275,9 +1303,10 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
>>>     {
>>>     	int retval;
>>>     
>>> +	if (!dqm->sched_running)
>>> +		return 0;
>>>     	if (dqm->queue_count <= 0 || dqm->processes_count <= 0)
>>>     		return 0;
>>> -
>>>     	if (dqm->active_runlist)
>>>     		return 0;
>>>     
>>> @@ -1299,6 +1328,8 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
>>>     {
>>>     	int retval = 0;
>>>     
>>> +	if (!dqm->sched_running)
>>> +		return 0;
>>>     	if (dqm->is_hws_hang)
>>>     		return -EIO;
>>>     	if (!dqm->active_runlist)
>>> @@ -1903,6 +1934,12 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data)
>>>     	int pipe, queue;
>>>     	int r = 0;
>>>     
>>> +	if (!dqm->sched_running) {
>>> +		seq_printf(m, " Device is stopped\n");
>>> +
>>> +		return 0;
>>> +	}
>>> +
>>>     	r = dqm->dev->kfd2kgd->hqd_dump(dqm->dev->kgd,
>>>     					KFD_CIK_HIQ_PIPE, KFD_CIK_HIQ_QUEUE,
>>>     					&dump, &n_regs);
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>>> index 2eaea6b04cbe..a8c37e6da027 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>>> @@ -201,6 +201,7 @@ struct device_queue_manager {
>>>     	bool			is_hws_hang;
>>>     	struct work_struct	hw_exception_work;
>>>     	struct kfd_mem_obj	hiq_sdma_mqd;
>>> +	bool			sched_running;
>>>     };
>>>     
>>>     void device_queue_manager_init_cik(
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2] drm/amdkfd: don't use dqm lock during device reset/suspend/resume
       [not found]             ` <92e6ce2b-5a2c-62ef-9c03-51c3ea28be1e-5C7GfCeVMHo@public.gmane.org>
@ 2019-10-22 19:36               ` Grodzovsky, Andrey
       [not found]                 ` <b51add2e-ded9-0de0-962b-96cf458340be-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Grodzovsky, Andrey @ 2019-10-22 19:36 UTC (permalink / raw)
  To: Yang, Philip, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


On 10/22/19 3:19 PM, Yang, Philip wrote:
>
> On 2019-10-22 2:40 p.m., Grodzovsky, Andrey wrote:
>> On 10/22/19 2:38 PM, Grodzovsky, Andrey wrote:
>>> On 10/22/19 2:28 PM, Yang, Philip wrote:
>>>> If device reset/suspend/resume failed for some reason, dqm lock is
>>>> hold forever and this causes deadlock. Below is a kernel backtrace when
>>>> application open kfd after suspend/resume failed.
>>>>
>>>> Instead of holding dqm lock in pre_reset and releasing dqm lock in
>>>> post_reset, add dqm->device_stopped flag which is modified in
>>>> dqm->ops.start and dqm->ops.stop. The flag doesn't need lock protection
>>>> because write/read are all inside dqm lock.
>>>>
>>>> For HWS case, map_queues_cpsch and unmap_queues_cpsch checks
>>>> device_stopped flag before sending the updated runlist.
>>> Is there a chance of race condition here where dqm->device_stopped
>>> returns true for some operation (e.g.map_queues_cpsch) but just as it
>>> proceeds GPU reset starts  ?
>>>
>>> Andrey
>>
>> Correction -
>>
>> dqm->device_stopped returns FALSE
>>
> No race condition here, dqm->device_stopped is set to FALSE in
> kgd2kfd_post_reset -> dqm->ops.start(), which the last step of
> amdgpu_device_gpu_recover, so it's safe to do map_queue_cpsch.
>
> Regards,
> Philip

Sorry - i was confused by the commit description vs. body - in the 
description it's called device_stopped flag while in the body it's  
sched_running - probably the description needs to be fixed.

So i mean the switch true->false when GPU reset just begins - you get an 
IOCTL to map a queue (if i understood KFD code correctly), check  
dqm->sched_running == true and continue, what if right then GPU reset 
started due to some issue like job timeout or user triggered reset - 
dqm->sched_running becomes false but you already past that checkpoint in 
the IOCTL, no ?

Andrey

>
>> Andrey
>>
>>>
>>>> v2: For no-HWS case, when device is stopped, don't call
>>>> load/destroy_mqd for eviction, restore and create queue, and avoid
>>>> debugfs dump hdqs.
>>>>
>>>> Backtrace of dqm lock deadlock:
>>>>
>>>> [Thu Oct 17 16:43:37 2019] INFO: task rocminfo:3024 blocked for more
>>>> than 120 seconds.
>>>> [Thu Oct 17 16:43:37 2019]       Not tainted
>>>> 5.0.0-rc1-kfd-compute-rocm-dkms-no-npi-1131 #1
>>>> [Thu Oct 17 16:43:37 2019] "echo 0 >
>>>> /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>>>> [Thu Oct 17 16:43:37 2019] rocminfo        D    0  3024   2947
>>>> 0x80000000
>>>> [Thu Oct 17 16:43:37 2019] Call Trace:
>>>> [Thu Oct 17 16:43:37 2019]  ? __schedule+0x3d9/0x8a0
>>>> [Thu Oct 17 16:43:37 2019]  schedule+0x32/0x70
>>>> [Thu Oct 17 16:43:37 2019]  schedule_preempt_disabled+0xa/0x10
>>>> [Thu Oct 17 16:43:37 2019]  __mutex_lock.isra.9+0x1e3/0x4e0
>>>> [Thu Oct 17 16:43:37 2019]  ? __call_srcu+0x264/0x3b0
>>>> [Thu Oct 17 16:43:37 2019]  ? process_termination_cpsch+0x24/0x2f0
>>>> [amdgpu]
>>>> [Thu Oct 17 16:43:37 2019]  process_termination_cpsch+0x24/0x2f0
>>>> [amdgpu]
>>>> [Thu Oct 17 16:43:37 2019]
>>>> kfd_process_dequeue_from_all_devices+0x42/0x60 [amdgpu]
>>>> [Thu Oct 17 16:43:37 2019]  kfd_process_notifier_release+0x1be/0x220
>>>> [amdgpu]
>>>> [Thu Oct 17 16:43:37 2019]  __mmu_notifier_release+0x3e/0xc0
>>>> [Thu Oct 17 16:43:37 2019]  exit_mmap+0x160/0x1a0
>>>> [Thu Oct 17 16:43:37 2019]  ? __handle_mm_fault+0xba3/0x1200
>>>> [Thu Oct 17 16:43:37 2019]  ? exit_robust_list+0x5a/0x110
>>>> [Thu Oct 17 16:43:37 2019]  mmput+0x4a/0x120
>>>> [Thu Oct 17 16:43:37 2019]  do_exit+0x284/0xb20
>>>> [Thu Oct 17 16:43:37 2019]  ? handle_mm_fault+0xfa/0x200
>>>> [Thu Oct 17 16:43:37 2019]  do_group_exit+0x3a/0xa0
>>>> [Thu Oct 17 16:43:37 2019]  __x64_sys_exit_group+0x14/0x20
>>>> [Thu Oct 17 16:43:37 2019]  do_syscall_64+0x4f/0x100
>>>> [Thu Oct 17 16:43:37 2019]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
>>>>
>>>> Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>>> Signed-off-by: Philip Yang <Philip.Yang@amd.com>
>>>> ---
>>>>      drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  6 +--
>>>>      drivers/gpu/drm/amd/amdkfd/kfd_device.c       |  5 --
>>>>      .../drm/amd/amdkfd/kfd_device_queue_manager.c | 47 +++++++++++++++++--
>>>>      .../drm/amd/amdkfd/kfd_device_queue_manager.h |  1 +
>>>>      4 files changed, 46 insertions(+), 13 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>> index d9e36dbf13d5..40d75c39f08e 100644
>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>> @@ -120,13 +120,13 @@ static int kfd_open(struct inode *inode, struct file *filep)
>>>>      		return -EPERM;
>>>>      	}
>>>>      
>>>> +	if (kfd_is_locked())
>>>> +		return -EAGAIN;
>>>> +
>>>>      	process = kfd_create_process(filep);
>>>>      	if (IS_ERR(process))
>>>>      		return PTR_ERR(process);
>>>>      
>>>> -	if (kfd_is_locked())
>>>> -		return -EAGAIN;
>>>> -
>>>>      	dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n",
>>>>      		process->pasid, process->is_32bit_user_mode);
>>>>      
>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>>> index 8f4b24e84964..4fa8834ce7cb 100644
>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>>> @@ -730,9 +730,6 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
>>>>      		return 0;
>>>>      	kgd2kfd_suspend(kfd);
>>>>      
>>>> -	/* hold dqm->lock to prevent further execution*/
>>>> -	dqm_lock(kfd->dqm);
>>>> -
>>>>      	kfd_signal_reset_event(kfd);
>>>>      	return 0;
>>>>      }
>>>> @@ -750,8 +747,6 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
>>>>      	if (!kfd->init_complete)
>>>>      		return 0;
>>>>      
>>>> -	dqm_unlock(kfd->dqm);
>>>> -
>>>>      	ret = kfd_resume(kfd);
>>>>      	if (ret)
>>>>      		return ret;
>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>> index 81fb545cf42c..82e1c6280d13 100644
>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>> @@ -340,6 +340,10 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>>>>      	mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj,
>>>>      				&q->gart_mqd_addr, &q->properties);
>>>>      	if (q->properties.is_active) {
>>>> +		if (!dqm->sched_running) {
>>>> +			WARN_ONCE(1, "Load non-HWS mqd while stopped\n");
>>>> +			goto add_queue_to_list;
>>>> +		}
>>>>      
>>>>      		if (WARN(q->process->mm != current->mm,
>>>>      					"should only run in user thread"))
>>>> @@ -351,6 +355,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>>>>      			goto out_free_mqd;
>>>>      	}
>>>>      
>>>> +add_queue_to_list:
>>>>      	list_add(&q->list, &qpd->queues_list);
>>>>      	qpd->queue_count++;
>>>>      	if (q->properties.is_active)
>>>> @@ -458,6 +463,11 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
>>>>      
>>>>      	deallocate_doorbell(qpd, q);
>>>>      
>>>> +	if (!dqm->sched_running) {
>>>> +		WARN_ONCE(1, "Destroy non-HWS queue while stopped\n");
>>>> +		return 0;
>>>> +	}
>>>> +
>>>>      	retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
>>>>      				KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
>>>>      				KFD_UNMAP_LATENCY_MS,
>>>> @@ -533,6 +543,12 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
>>>>      		   (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
>>>>      		    q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>>>>      		    q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
>>>> +
>>>> +		if (!dqm->sched_running) {
>>>> +			WARN_ONCE(1, "Update non-HWS queue while stopped\n");
>>>> +			goto out_unlock;
>>>> +		}
>>>> +
>>>>      		retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
>>>>      				KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
>>>>      				KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
>>>> @@ -602,6 +618,11 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
>>>>      		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
>>>>      				q->properties.type)];
>>>>      		q->properties.is_active = false;
>>>> +		dqm->queue_count--;
>>>> +
>>>> +		if (!dqm->sched_running)
>>>> +			continue;
>>>> +
>>>>      		retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
>>>>      				KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
>>>>      				KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
>>>> @@ -610,7 +631,6 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
>>>>      			 * maintain a consistent eviction state
>>>>      			 */
>>>>      			ret = retval;
>>>> -		dqm->queue_count--;
>>>>      	}
>>>>      
>>>>      out:
>>>> @@ -711,6 +731,11 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
>>>>      		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
>>>>      				q->properties.type)];
>>>>      		q->properties.is_active = true;
>>>> +		dqm->queue_count++;
>>>> +
>>>> +		if (!dqm->sched_running)
>>>> +			continue;
>>>> +
>>>>      		retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe,
>>>>      				       q->queue, &q->properties, mm);
>>>>      		if (retval && !ret)
>>>> @@ -718,7 +743,6 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
>>>>      			 * maintain a consistent eviction state
>>>>      			 */
>>>>      			ret = retval;
>>>> -		dqm->queue_count++;
>>>>      	}
>>>>      	qpd->evicted = 0;
>>>>      out:
>>>> @@ -915,7 +939,8 @@ static int start_nocpsch(struct device_queue_manager *dqm)
>>>>      	
>>>>      	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>>>>      		return pm_init(&dqm->packets, dqm);
>>>> -	
>>>> +	dqm->sched_running = true;
>>>> +
>>>>      	return 0;
>>>>      }
>>>>      
>>>> @@ -923,7 +948,8 @@ static int stop_nocpsch(struct device_queue_manager *dqm)
>>>>      {
>>>>      	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>>>>      		pm_uninit(&dqm->packets);
>>>> -	
>>>> +	dqm->sched_running = false;
>>>> +
>>>>      	return 0;
>>>>      }
>>>>      
>>>> @@ -1074,6 +1100,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
>>>>      	dqm_lock(dqm);
>>>>      	/* clear hang status when driver try to start the hw scheduler */
>>>>      	dqm->is_hws_hang = false;
>>>> +	dqm->sched_running = true;
>>>>      	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
>>>>      	dqm_unlock(dqm);
>>>>      
>>>> @@ -1089,6 +1116,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)
>>>>      {
>>>>      	dqm_lock(dqm);
>>>>      	unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>>>> +	dqm->sched_running = false;
>>>>      	dqm_unlock(dqm);
>>>>      
>>>>      	kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
>>>> @@ -1275,9 +1303,10 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
>>>>      {
>>>>      	int retval;
>>>>      
>>>> +	if (!dqm->sched_running)
>>>> +		return 0;
>>>>      	if (dqm->queue_count <= 0 || dqm->processes_count <= 0)
>>>>      		return 0;
>>>> -
>>>>      	if (dqm->active_runlist)
>>>>      		return 0;
>>>>      
>>>> @@ -1299,6 +1328,8 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
>>>>      {
>>>>      	int retval = 0;
>>>>      
>>>> +	if (!dqm->sched_running)
>>>> +		return 0;
>>>>      	if (dqm->is_hws_hang)
>>>>      		return -EIO;
>>>>      	if (!dqm->active_runlist)
>>>> @@ -1903,6 +1934,12 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data)
>>>>      	int pipe, queue;
>>>>      	int r = 0;
>>>>      
>>>> +	if (!dqm->sched_running) {
>>>> +		seq_printf(m, " Device is stopped\n");
>>>> +
>>>> +		return 0;
>>>> +	}
>>>> +
>>>>      	r = dqm->dev->kfd2kgd->hqd_dump(dqm->dev->kgd,
>>>>      					KFD_CIK_HIQ_PIPE, KFD_CIK_HIQ_QUEUE,
>>>>      					&dump, &n_regs);
>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>>>> index 2eaea6b04cbe..a8c37e6da027 100644
>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>>>> @@ -201,6 +201,7 @@ struct device_queue_manager {
>>>>      	bool			is_hws_hang;
>>>>      	struct work_struct	hw_exception_work;
>>>>      	struct kfd_mem_obj	hiq_sdma_mqd;
>>>> +	bool			sched_running;
>>>>      };
>>>>      
>>>>      void device_queue_manager_init_cik(
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2] drm/amdkfd: don't use dqm lock during device reset/suspend/resume
       [not found]     ` <558b2dd3-8095-81e0-547b-788b74c9b329-5C7GfCeVMHo@public.gmane.org>
@ 2019-10-22 19:52       ` Yang, Philip
  0 siblings, 0 replies; 9+ messages in thread
From: Yang, Philip @ 2019-10-22 19:52 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On 2019-10-22 2:44 p.m., Kuehling, Felix wrote:
> On 2019-10-22 14:28, Yang, Philip wrote:
>> If device reset/suspend/resume failed for some reason, dqm lock is
>> hold forever and this causes deadlock. Below is a kernel backtrace when
>> application open kfd after suspend/resume failed.
>>
>> Instead of holding dqm lock in pre_reset and releasing dqm lock in
>> post_reset, add dqm->device_stopped flag which is modified in
>> dqm->ops.start and dqm->ops.stop. The flag doesn't need lock protection
>> because write/read are all inside dqm lock.
>>
>> For HWS case, map_queues_cpsch and unmap_queues_cpsch checks
>> device_stopped flag before sending the updated runlist.
>>
>> v2: For no-HWS case, when device is stopped, don't call
>> load/destroy_mqd for eviction, restore and create queue, and avoid
>> debugfs dump hdqs.
>>
>> Backtrace of dqm lock deadlock:
>>
>> [Thu Oct 17 16:43:37 2019] INFO: task rocminfo:3024 blocked for more
>> than 120 seconds.
>> [Thu Oct 17 16:43:37 2019]       Not tainted
>> 5.0.0-rc1-kfd-compute-rocm-dkms-no-npi-1131 #1
>> [Thu Oct 17 16:43:37 2019] "echo 0 >
>> /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>> [Thu Oct 17 16:43:37 2019] rocminfo        D    0  3024   2947
>> 0x80000000
>> [Thu Oct 17 16:43:37 2019] Call Trace:
>> [Thu Oct 17 16:43:37 2019]  ? __schedule+0x3d9/0x8a0
>> [Thu Oct 17 16:43:37 2019]  schedule+0x32/0x70
>> [Thu Oct 17 16:43:37 2019]  schedule_preempt_disabled+0xa/0x10
>> [Thu Oct 17 16:43:37 2019]  __mutex_lock.isra.9+0x1e3/0x4e0
>> [Thu Oct 17 16:43:37 2019]  ? __call_srcu+0x264/0x3b0
>> [Thu Oct 17 16:43:37 2019]  ? process_termination_cpsch+0x24/0x2f0
>> [amdgpu]
>> [Thu Oct 17 16:43:37 2019]  process_termination_cpsch+0x24/0x2f0
>> [amdgpu]
>> [Thu Oct 17 16:43:37 2019]
>> kfd_process_dequeue_from_all_devices+0x42/0x60 [amdgpu]
>> [Thu Oct 17 16:43:37 2019]  kfd_process_notifier_release+0x1be/0x220
>> [amdgpu]
>> [Thu Oct 17 16:43:37 2019]  __mmu_notifier_release+0x3e/0xc0
>> [Thu Oct 17 16:43:37 2019]  exit_mmap+0x160/0x1a0
>> [Thu Oct 17 16:43:37 2019]  ? __handle_mm_fault+0xba3/0x1200
>> [Thu Oct 17 16:43:37 2019]  ? exit_robust_list+0x5a/0x110
>> [Thu Oct 17 16:43:37 2019]  mmput+0x4a/0x120
>> [Thu Oct 17 16:43:37 2019]  do_exit+0x284/0xb20
>> [Thu Oct 17 16:43:37 2019]  ? handle_mm_fault+0xfa/0x200
>> [Thu Oct 17 16:43:37 2019]  do_group_exit+0x3a/0xa0
>> [Thu Oct 17 16:43:37 2019]  __x64_sys_exit_group+0x14/0x20
>> [Thu Oct 17 16:43:37 2019]  do_syscall_64+0x4f/0x100
>> [Thu Oct 17 16:43:37 2019]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
>>
>> Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com>
>> Signed-off-by: Philip Yang <Philip.Yang@amd.com>
> 
> Three more comments inline. With those comments addressed, this patch is
> 
> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
> 
> 
>> ---
>>    drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  6 +--
>>    drivers/gpu/drm/amd/amdkfd/kfd_device.c       |  5 --
>>    .../drm/amd/amdkfd/kfd_device_queue_manager.c | 47 +++++++++++++++++--
>>    .../drm/amd/amdkfd/kfd_device_queue_manager.h |  1 +
>>    4 files changed, 46 insertions(+), 13 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> index d9e36dbf13d5..40d75c39f08e 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> @@ -120,13 +120,13 @@ static int kfd_open(struct inode *inode, struct file *filep)
>>    		return -EPERM;
>>    	}
>>    
>> +	if (kfd_is_locked())
>> +		return -EAGAIN;
>> +
>>    	process = kfd_create_process(filep);
>>    	if (IS_ERR(process))
>>    		return PTR_ERR(process);
>>    
>> -	if (kfd_is_locked())
>> -		return -EAGAIN;
>> -
> 
> Is this part of the change still needed? I remember that this sequence
> was a bit tricky with some potential race condition when Shaoyun was
> working on it. This may have unintended side effects.
> 
> 
Revert this change to be safe, it is not needed and not clear if there 
are side effects.

>>    	dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n",
>>    		process->pasid, process->is_32bit_user_mode);
>>    
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> index 8f4b24e84964..4fa8834ce7cb 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> @@ -730,9 +730,6 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
>>    		return 0;
>>    	kgd2kfd_suspend(kfd);
>>    
>> -	/* hold dqm->lock to prevent further execution*/
>> -	dqm_lock(kfd->dqm);
>> -
>>    	kfd_signal_reset_event(kfd);
>>    	return 0;
>>    }
>> @@ -750,8 +747,6 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
>>    	if (!kfd->init_complete)
>>    		return 0;
>>    
>> -	dqm_unlock(kfd->dqm);
>> -
>>    	ret = kfd_resume(kfd);
>>    	if (ret)
>>    		return ret;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> index 81fb545cf42c..82e1c6280d13 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> @@ -340,6 +340,10 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>>    	mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj,
>>    				&q->gart_mqd_addr, &q->properties);
>>    	if (q->properties.is_active) {
>> +		if (!dqm->sched_running) {
>> +			WARN_ONCE(1, "Load non-HWS mqd while stopped\n");
>> +			goto add_queue_to_list;
>> +		}
>>    
>>    		if (WARN(q->process->mm != current->mm,
>>    					"should only run in user thread"))
>> @@ -351,6 +355,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>>    			goto out_free_mqd;
>>    	}
>>    
>> +add_queue_to_list:
>>    	list_add(&q->list, &qpd->queues_list);
>>    	qpd->queue_count++;
>>    	if (q->properties.is_active)
>> @@ -458,6 +463,11 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
>>    
>>    	deallocate_doorbell(qpd, q);
>>    
>> +	if (!dqm->sched_running) {
>> +		WARN_ONCE(1, "Destroy non-HWS queue while stopped\n");
>> +		return 0;
>> +	}
>> +
>>    	retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
>>    				KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
>>    				KFD_UNMAP_LATENCY_MS,
>> @@ -533,6 +543,12 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
>>    		   (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
>>    		    q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>>    		    q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
>> +
>> +		if (!dqm->sched_running) {
>> +			WARN_ONCE(1, "Update non-HWS queue while stopped\n");
>> +			goto out_unlock;
>> +		}
>> +
>>    		retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
>>    				KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
>>    				KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
>> @@ -602,6 +618,11 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
>>    		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
>>    				q->properties.type)];
>>    		q->properties.is_active = false;
>> +		dqm->queue_count--;
>> +
>> +		if (!dqm->sched_running)
>> +			continue;
> 
> Add a WARN here as well. When the scheduler is not running the queues
> should already be evicted.
>
Done

> 
>> +
>>    		retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
>>    				KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
>>    				KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
>> @@ -610,7 +631,6 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
>>    			 * maintain a consistent eviction state
>>    			 */
>>    			ret = retval;
>> -		dqm->queue_count--;
>>    	}
>>    
>>    out:
>> @@ -711,6 +731,11 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
>>    		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
>>    				q->properties.type)];
>>    		q->properties.is_active = true;
>> +		dqm->queue_count++;
>> +
>> +		if (!dqm->sched_running)
>> +			continue;
> 
> Add a WARN here as well. This should not happen while we're suspended.
> 
Done.

> Regards,
>     Felix
> 
>> +
>>    		retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe,
>>    				       q->queue, &q->properties, mm);
>>    		if (retval && !ret)
>> @@ -718,7 +743,6 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
>>    			 * maintain a consistent eviction state
>>    			 */
>>    			ret = retval;
>> -		dqm->queue_count++;
>>    	}
>>    	qpd->evicted = 0;
>>    out:
>> @@ -915,7 +939,8 @@ static int start_nocpsch(struct device_queue_manager *dqm)
>>    	
>>    	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>>    		return pm_init(&dqm->packets, dqm);
>> -	
>> +	dqm->sched_running = true;
>> +
>>    	return 0;
>>    }
>>    
>> @@ -923,7 +948,8 @@ static int stop_nocpsch(struct device_queue_manager *dqm)
>>    {
>>    	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>>    		pm_uninit(&dqm->packets);
>> -	
>> +	dqm->sched_running = false;
>> +
>>    	return 0;
>>    }
>>    
>> @@ -1074,6 +1100,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
>>    	dqm_lock(dqm);
>>    	/* clear hang status when driver try to start the hw scheduler */
>>    	dqm->is_hws_hang = false;
>> +	dqm->sched_running = true;
>>    	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
>>    	dqm_unlock(dqm);
>>    
>> @@ -1089,6 +1116,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)
>>    {
>>    	dqm_lock(dqm);
>>    	unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>> +	dqm->sched_running = false;
>>    	dqm_unlock(dqm);
>>    
>>    	kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
>> @@ -1275,9 +1303,10 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
>>    {
>>    	int retval;
>>    
>> +	if (!dqm->sched_running)
>> +		return 0;
>>    	if (dqm->queue_count <= 0 || dqm->processes_count <= 0)
>>    		return 0;
>> -
>>    	if (dqm->active_runlist)
>>    		return 0;
>>    
>> @@ -1299,6 +1328,8 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
>>    {
>>    	int retval = 0;
>>    
>> +	if (!dqm->sched_running)
>> +		return 0;
>>    	if (dqm->is_hws_hang)
>>    		return -EIO;
>>    	if (!dqm->active_runlist)
>> @@ -1903,6 +1934,12 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data)
>>    	int pipe, queue;
>>    	int r = 0;
>>    
>> +	if (!dqm->sched_running) {
>> +		seq_printf(m, " Device is stopped\n");
>> +
>> +		return 0;
>> +	}
>> +
>>    	r = dqm->dev->kfd2kgd->hqd_dump(dqm->dev->kgd,
>>    					KFD_CIK_HIQ_PIPE, KFD_CIK_HIQ_QUEUE,
>>    					&dump, &n_regs);
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>> index 2eaea6b04cbe..a8c37e6da027 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>> @@ -201,6 +201,7 @@ struct device_queue_manager {
>>    	bool			is_hws_hang;
>>    	struct work_struct	hw_exception_work;
>>    	struct kfd_mem_obj	hiq_sdma_mqd;
>> +	bool			sched_running;
>>    };
>>    
>>    void device_queue_manager_init_cik(
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2] drm/amdkfd: don't use dqm lock during device reset/suspend/resume
       [not found]                 ` <b51add2e-ded9-0de0-962b-96cf458340be-5C7GfCeVMHo@public.gmane.org>
@ 2019-10-22 20:04                   ` Yang, Philip
       [not found]                     ` <a400a3cf-b8e2-18d7-7460-14c304a9c7d4-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Yang, Philip @ 2019-10-22 20:04 UTC (permalink / raw)
  To: Grodzovsky, Andrey, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW



On 2019-10-22 3:36 p.m., Grodzovsky, Andrey wrote:
> 
> On 10/22/19 3:19 PM, Yang, Philip wrote:
>>
>> On 2019-10-22 2:40 p.m., Grodzovsky, Andrey wrote:
>>> On 10/22/19 2:38 PM, Grodzovsky, Andrey wrote:
>>>> On 10/22/19 2:28 PM, Yang, Philip wrote:
>>>>> If device reset/suspend/resume failed for some reason, dqm lock is
>>>>> hold forever and this causes deadlock. Below is a kernel backtrace when
>>>>> application open kfd after suspend/resume failed.
>>>>>
>>>>> Instead of holding dqm lock in pre_reset and releasing dqm lock in
>>>>> post_reset, add dqm->device_stopped flag which is modified in
>>>>> dqm->ops.start and dqm->ops.stop. The flag doesn't need lock protection
>>>>> because write/read are all inside dqm lock.
>>>>>
>>>>> For HWS case, map_queues_cpsch and unmap_queues_cpsch checks
>>>>> device_stopped flag before sending the updated runlist.
>>>> Is there a chance of race condition here where dqm->device_stopped
>>>> returns true for some operation (e.g.map_queues_cpsch) but just as it
>>>> proceeds GPU reset starts  ?
>>>>
>>>> Andrey
>>>
>>> Correction -
>>>
>>> dqm->device_stopped returns FALSE
>>>
>> No race condition here, dqm->device_stopped is set to FALSE in
>> kgd2kfd_post_reset -> dqm->ops.start(), which the last step of
>> amdgpu_device_gpu_recover, so it's safe to do map_queue_cpsch.
>>
>> Regards,
>> Philip
> 
> Sorry - i was confused by the commit description vs. body - in the
> description it's called device_stopped flag while in the body it's
> sched_running - probably the description needs to be fixed.
> 
Thanks, commit description is updated.

> So i mean the switch true->false when GPU reset just begins - you get an
> IOCTL to map a queue (if i understood KFD code correctly), check
> dqm->sched_running == true and continue, what if right then GPU reset
> started due to some issue like job timeout or user triggered reset -
> dqm->sched_running becomes false but you already past that checkpoint in
> the IOCTL, no ?
>
map a queue is done under dqm lock, to send the updated runlist to HWS, 
then relase dqm lock. GPU reset start pre_reset will unmap all queues 
first under dqm lock, then set dqm->sched_running to false, release dqm 
lock, and then continue to reset HW. dqm lock guarantee the two 
operations are serialized.

Philip

> Andrey
> 
>>
>>> Andrey
>>>
>>>>
>>>>> v2: For no-HWS case, when device is stopped, don't call
>>>>> load/destroy_mqd for eviction, restore and create queue, and avoid
>>>>> debugfs dump hdqs.
>>>>>
>>>>> Backtrace of dqm lock deadlock:
>>>>>
>>>>> [Thu Oct 17 16:43:37 2019] INFO: task rocminfo:3024 blocked for more
>>>>> than 120 seconds.
>>>>> [Thu Oct 17 16:43:37 2019]       Not tainted
>>>>> 5.0.0-rc1-kfd-compute-rocm-dkms-no-npi-1131 #1
>>>>> [Thu Oct 17 16:43:37 2019] "echo 0 >
>>>>> /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>>>>> [Thu Oct 17 16:43:37 2019] rocminfo        D    0  3024   2947
>>>>> 0x80000000
>>>>> [Thu Oct 17 16:43:37 2019] Call Trace:
>>>>> [Thu Oct 17 16:43:37 2019]  ? __schedule+0x3d9/0x8a0
>>>>> [Thu Oct 17 16:43:37 2019]  schedule+0x32/0x70
>>>>> [Thu Oct 17 16:43:37 2019]  schedule_preempt_disabled+0xa/0x10
>>>>> [Thu Oct 17 16:43:37 2019]  __mutex_lock.isra.9+0x1e3/0x4e0
>>>>> [Thu Oct 17 16:43:37 2019]  ? __call_srcu+0x264/0x3b0
>>>>> [Thu Oct 17 16:43:37 2019]  ? process_termination_cpsch+0x24/0x2f0
>>>>> [amdgpu]
>>>>> [Thu Oct 17 16:43:37 2019]  process_termination_cpsch+0x24/0x2f0
>>>>> [amdgpu]
>>>>> [Thu Oct 17 16:43:37 2019]
>>>>> kfd_process_dequeue_from_all_devices+0x42/0x60 [amdgpu]
>>>>> [Thu Oct 17 16:43:37 2019]  kfd_process_notifier_release+0x1be/0x220
>>>>> [amdgpu]
>>>>> [Thu Oct 17 16:43:37 2019]  __mmu_notifier_release+0x3e/0xc0
>>>>> [Thu Oct 17 16:43:37 2019]  exit_mmap+0x160/0x1a0
>>>>> [Thu Oct 17 16:43:37 2019]  ? __handle_mm_fault+0xba3/0x1200
>>>>> [Thu Oct 17 16:43:37 2019]  ? exit_robust_list+0x5a/0x110
>>>>> [Thu Oct 17 16:43:37 2019]  mmput+0x4a/0x120
>>>>> [Thu Oct 17 16:43:37 2019]  do_exit+0x284/0xb20
>>>>> [Thu Oct 17 16:43:37 2019]  ? handle_mm_fault+0xfa/0x200
>>>>> [Thu Oct 17 16:43:37 2019]  do_group_exit+0x3a/0xa0
>>>>> [Thu Oct 17 16:43:37 2019]  __x64_sys_exit_group+0x14/0x20
>>>>> [Thu Oct 17 16:43:37 2019]  do_syscall_64+0x4f/0x100
>>>>> [Thu Oct 17 16:43:37 2019]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
>>>>>
>>>>> Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>>>> Signed-off-by: Philip Yang <Philip.Yang@amd.com>
>>>>> ---
>>>>>       drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  6 +--
>>>>>       drivers/gpu/drm/amd/amdkfd/kfd_device.c       |  5 --
>>>>>       .../drm/amd/amdkfd/kfd_device_queue_manager.c | 47 +++++++++++++++++--
>>>>>       .../drm/amd/amdkfd/kfd_device_queue_manager.h |  1 +
>>>>>       4 files changed, 46 insertions(+), 13 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>> index d9e36dbf13d5..40d75c39f08e 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>> @@ -120,13 +120,13 @@ static int kfd_open(struct inode *inode, struct file *filep)
>>>>>       		return -EPERM;
>>>>>       	}
>>>>>       
>>>>> +	if (kfd_is_locked())
>>>>> +		return -EAGAIN;
>>>>> +
>>>>>       	process = kfd_create_process(filep);
>>>>>       	if (IS_ERR(process))
>>>>>       		return PTR_ERR(process);
>>>>>       
>>>>> -	if (kfd_is_locked())
>>>>> -		return -EAGAIN;
>>>>> -
>>>>>       	dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n",
>>>>>       		process->pasid, process->is_32bit_user_mode);
>>>>>       
>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>>>> index 8f4b24e84964..4fa8834ce7cb 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>>>> @@ -730,9 +730,6 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
>>>>>       		return 0;
>>>>>       	kgd2kfd_suspend(kfd);
>>>>>       
>>>>> -	/* hold dqm->lock to prevent further execution*/
>>>>> -	dqm_lock(kfd->dqm);
>>>>> -
>>>>>       	kfd_signal_reset_event(kfd);
>>>>>       	return 0;
>>>>>       }
>>>>> @@ -750,8 +747,6 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
>>>>>       	if (!kfd->init_complete)
>>>>>       		return 0;
>>>>>       
>>>>> -	dqm_unlock(kfd->dqm);
>>>>> -
>>>>>       	ret = kfd_resume(kfd);
>>>>>       	if (ret)
>>>>>       		return ret;
>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>> index 81fb545cf42c..82e1c6280d13 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>> @@ -340,6 +340,10 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>>>>>       	mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj,
>>>>>       				&q->gart_mqd_addr, &q->properties);
>>>>>       	if (q->properties.is_active) {
>>>>> +		if (!dqm->sched_running) {
>>>>> +			WARN_ONCE(1, "Load non-HWS mqd while stopped\n");
>>>>> +			goto add_queue_to_list;
>>>>> +		}
>>>>>       
>>>>>       		if (WARN(q->process->mm != current->mm,
>>>>>       					"should only run in user thread"))
>>>>> @@ -351,6 +355,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>>>>>       			goto out_free_mqd;
>>>>>       	}
>>>>>       
>>>>> +add_queue_to_list:
>>>>>       	list_add(&q->list, &qpd->queues_list);
>>>>>       	qpd->queue_count++;
>>>>>       	if (q->properties.is_active)
>>>>> @@ -458,6 +463,11 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
>>>>>       
>>>>>       	deallocate_doorbell(qpd, q);
>>>>>       
>>>>> +	if (!dqm->sched_running) {
>>>>> +		WARN_ONCE(1, "Destroy non-HWS queue while stopped\n");
>>>>> +		return 0;
>>>>> +	}
>>>>> +
>>>>>       	retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
>>>>>       				KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
>>>>>       				KFD_UNMAP_LATENCY_MS,
>>>>> @@ -533,6 +543,12 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
>>>>>       		   (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
>>>>>       		    q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>>>>>       		    q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
>>>>> +
>>>>> +		if (!dqm->sched_running) {
>>>>> +			WARN_ONCE(1, "Update non-HWS queue while stopped\n");
>>>>> +			goto out_unlock;
>>>>> +		}
>>>>> +
>>>>>       		retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
>>>>>       				KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
>>>>>       				KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
>>>>> @@ -602,6 +618,11 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
>>>>>       		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
>>>>>       				q->properties.type)];
>>>>>       		q->properties.is_active = false;
>>>>> +		dqm->queue_count--;
>>>>> +
>>>>> +		if (!dqm->sched_running)
>>>>> +			continue;
>>>>> +
>>>>>       		retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
>>>>>       				KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
>>>>>       				KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
>>>>> @@ -610,7 +631,6 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
>>>>>       			 * maintain a consistent eviction state
>>>>>       			 */
>>>>>       			ret = retval;
>>>>> -		dqm->queue_count--;
>>>>>       	}
>>>>>       
>>>>>       out:
>>>>> @@ -711,6 +731,11 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
>>>>>       		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
>>>>>       				q->properties.type)];
>>>>>       		q->properties.is_active = true;
>>>>> +		dqm->queue_count++;
>>>>> +
>>>>> +		if (!dqm->sched_running)
>>>>> +			continue;
>>>>> +
>>>>>       		retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe,
>>>>>       				       q->queue, &q->properties, mm);
>>>>>       		if (retval && !ret)
>>>>> @@ -718,7 +743,6 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
>>>>>       			 * maintain a consistent eviction state
>>>>>       			 */
>>>>>       			ret = retval;
>>>>> -		dqm->queue_count++;
>>>>>       	}
>>>>>       	qpd->evicted = 0;
>>>>>       out:
>>>>> @@ -915,7 +939,8 @@ static int start_nocpsch(struct device_queue_manager *dqm)
>>>>>       	
>>>>>       	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>>>>>       		return pm_init(&dqm->packets, dqm);
>>>>> -	
>>>>> +	dqm->sched_running = true;
>>>>> +
>>>>>       	return 0;
>>>>>       }
>>>>>       
>>>>> @@ -923,7 +948,8 @@ static int stop_nocpsch(struct device_queue_manager *dqm)
>>>>>       {
>>>>>       	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>>>>>       		pm_uninit(&dqm->packets);
>>>>> -	
>>>>> +	dqm->sched_running = false;
>>>>> +
>>>>>       	return 0;
>>>>>       }
>>>>>       
>>>>> @@ -1074,6 +1100,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
>>>>>       	dqm_lock(dqm);
>>>>>       	/* clear hang status when driver try to start the hw scheduler */
>>>>>       	dqm->is_hws_hang = false;
>>>>> +	dqm->sched_running = true;
>>>>>       	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
>>>>>       	dqm_unlock(dqm);
>>>>>       
>>>>> @@ -1089,6 +1116,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)
>>>>>       {
>>>>>       	dqm_lock(dqm);
>>>>>       	unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>>>>> +	dqm->sched_running = false;
>>>>>       	dqm_unlock(dqm);
>>>>>       
>>>>>       	kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
>>>>> @@ -1275,9 +1303,10 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
>>>>>       {
>>>>>       	int retval;
>>>>>       
>>>>> +	if (!dqm->sched_running)
>>>>> +		return 0;
>>>>>       	if (dqm->queue_count <= 0 || dqm->processes_count <= 0)
>>>>>       		return 0;
>>>>> -
>>>>>       	if (dqm->active_runlist)
>>>>>       		return 0;
>>>>>       
>>>>> @@ -1299,6 +1328,8 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
>>>>>       {
>>>>>       	int retval = 0;
>>>>>       
>>>>> +	if (!dqm->sched_running)
>>>>> +		return 0;
>>>>>       	if (dqm->is_hws_hang)
>>>>>       		return -EIO;
>>>>>       	if (!dqm->active_runlist)
>>>>> @@ -1903,6 +1934,12 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data)
>>>>>       	int pipe, queue;
>>>>>       	int r = 0;
>>>>>       
>>>>> +	if (!dqm->sched_running) {
>>>>> +		seq_printf(m, " Device is stopped\n");
>>>>> +
>>>>> +		return 0;
>>>>> +	}
>>>>> +
>>>>>       	r = dqm->dev->kfd2kgd->hqd_dump(dqm->dev->kgd,
>>>>>       					KFD_CIK_HIQ_PIPE, KFD_CIK_HIQ_QUEUE,
>>>>>       					&dump, &n_regs);
>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>>>>> index 2eaea6b04cbe..a8c37e6da027 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>>>>> @@ -201,6 +201,7 @@ struct device_queue_manager {
>>>>>       	bool			is_hws_hang;
>>>>>       	struct work_struct	hw_exception_work;
>>>>>       	struct kfd_mem_obj	hiq_sdma_mqd;
>>>>> +	bool			sched_running;
>>>>>       };
>>>>>       
>>>>>       void device_queue_manager_init_cik(
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2] drm/amdkfd: don't use dqm lock during device reset/suspend/resume
       [not found]                     ` <a400a3cf-b8e2-18d7-7460-14c304a9c7d4-5C7GfCeVMHo@public.gmane.org>
@ 2019-10-22 20:19                       ` Grodzovsky, Andrey
  0 siblings, 0 replies; 9+ messages in thread
From: Grodzovsky, Andrey @ 2019-10-22 20:19 UTC (permalink / raw)
  To: Yang, Philip, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


On 10/22/19 4:04 PM, Yang, Philip wrote:
>
> On 2019-10-22 3:36 p.m., Grodzovsky, Andrey wrote:
>> On 10/22/19 3:19 PM, Yang, Philip wrote:
>>> On 2019-10-22 2:40 p.m., Grodzovsky, Andrey wrote:
>>>> On 10/22/19 2:38 PM, Grodzovsky, Andrey wrote:
>>>>> On 10/22/19 2:28 PM, Yang, Philip wrote:
>>>>>> If device reset/suspend/resume failed for some reason, dqm lock is
>>>>>> hold forever and this causes deadlock. Below is a kernel backtrace when
>>>>>> application open kfd after suspend/resume failed.
>>>>>>
>>>>>> Instead of holding dqm lock in pre_reset and releasing dqm lock in
>>>>>> post_reset, add dqm->device_stopped flag which is modified in
>>>>>> dqm->ops.start and dqm->ops.stop. The flag doesn't need lock protection
>>>>>> because write/read are all inside dqm lock.
>>>>>>
>>>>>> For HWS case, map_queues_cpsch and unmap_queues_cpsch checks
>>>>>> device_stopped flag before sending the updated runlist.
>>>>> Is there a chance of race condition here where dqm->device_stopped
>>>>> returns true for some operation (e.g.map_queues_cpsch) but just as it
>>>>> proceeds GPU reset starts  ?
>>>>>
>>>>> Andrey
>>>> Correction -
>>>>
>>>> dqm->device_stopped returns FALSE
>>>>
>>> No race condition here, dqm->device_stopped is set to FALSE in
>>> kgd2kfd_post_reset -> dqm->ops.start(), which the last step of
>>> amdgpu_device_gpu_recover, so it's safe to do map_queue_cpsch.
>>>
>>> Regards,
>>> Philip
>> Sorry - i was confused by the commit description vs. body - in the
>> description it's called device_stopped flag while in the body it's
>> sched_running - probably the description needs to be fixed.
>>
> Thanks, commit description is updated.
>
>> So i mean the switch true->false when GPU reset just begins - you get an
>> IOCTL to map a queue (if i understood KFD code correctly), check
>> dqm->sched_running == true and continue, what if right then GPU reset
>> started due to some issue like job timeout or user triggered reset -
>> dqm->sched_running becomes false but you already past that checkpoint in
>> the IOCTL, no ?
>>
> map a queue is done under dqm lock, to send the updated runlist to HWS,
> then relase dqm lock. GPU reset start pre_reset will unmap all queues
> first under dqm lock, then set dqm->sched_running to false, release dqm
> lock, and then continue to reset HW. dqm lock guarantee the two
> operations are serialized.
>
> Philip


I see now that it's ok.

Andrey


>
>> Andrey
>>
>>>> Andrey
>>>>
>>>>>> v2: For no-HWS case, when device is stopped, don't call
>>>>>> load/destroy_mqd for eviction, restore and create queue, and avoid
>>>>>> debugfs dump hdqs.
>>>>>>
>>>>>> Backtrace of dqm lock deadlock:
>>>>>>
>>>>>> [Thu Oct 17 16:43:37 2019] INFO: task rocminfo:3024 blocked for more
>>>>>> than 120 seconds.
>>>>>> [Thu Oct 17 16:43:37 2019]       Not tainted
>>>>>> 5.0.0-rc1-kfd-compute-rocm-dkms-no-npi-1131 #1
>>>>>> [Thu Oct 17 16:43:37 2019] "echo 0 >
>>>>>> /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>>>>>> [Thu Oct 17 16:43:37 2019] rocminfo        D    0  3024   2947
>>>>>> 0x80000000
>>>>>> [Thu Oct 17 16:43:37 2019] Call Trace:
>>>>>> [Thu Oct 17 16:43:37 2019]  ? __schedule+0x3d9/0x8a0
>>>>>> [Thu Oct 17 16:43:37 2019]  schedule+0x32/0x70
>>>>>> [Thu Oct 17 16:43:37 2019]  schedule_preempt_disabled+0xa/0x10
>>>>>> [Thu Oct 17 16:43:37 2019]  __mutex_lock.isra.9+0x1e3/0x4e0
>>>>>> [Thu Oct 17 16:43:37 2019]  ? __call_srcu+0x264/0x3b0
>>>>>> [Thu Oct 17 16:43:37 2019]  ? process_termination_cpsch+0x24/0x2f0
>>>>>> [amdgpu]
>>>>>> [Thu Oct 17 16:43:37 2019]  process_termination_cpsch+0x24/0x2f0
>>>>>> [amdgpu]
>>>>>> [Thu Oct 17 16:43:37 2019]
>>>>>> kfd_process_dequeue_from_all_devices+0x42/0x60 [amdgpu]
>>>>>> [Thu Oct 17 16:43:37 2019]  kfd_process_notifier_release+0x1be/0x220
>>>>>> [amdgpu]
>>>>>> [Thu Oct 17 16:43:37 2019]  __mmu_notifier_release+0x3e/0xc0
>>>>>> [Thu Oct 17 16:43:37 2019]  exit_mmap+0x160/0x1a0
>>>>>> [Thu Oct 17 16:43:37 2019]  ? __handle_mm_fault+0xba3/0x1200
>>>>>> [Thu Oct 17 16:43:37 2019]  ? exit_robust_list+0x5a/0x110
>>>>>> [Thu Oct 17 16:43:37 2019]  mmput+0x4a/0x120
>>>>>> [Thu Oct 17 16:43:37 2019]  do_exit+0x284/0xb20
>>>>>> [Thu Oct 17 16:43:37 2019]  ? handle_mm_fault+0xfa/0x200
>>>>>> [Thu Oct 17 16:43:37 2019]  do_group_exit+0x3a/0xa0
>>>>>> [Thu Oct 17 16:43:37 2019]  __x64_sys_exit_group+0x14/0x20
>>>>>> [Thu Oct 17 16:43:37 2019]  do_syscall_64+0x4f/0x100
>>>>>> [Thu Oct 17 16:43:37 2019]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
>>>>>>
>>>>>> Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>>>>> Signed-off-by: Philip Yang <Philip.Yang@amd.com>
>>>>>> ---
>>>>>>        drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  6 +--
>>>>>>        drivers/gpu/drm/amd/amdkfd/kfd_device.c       |  5 --
>>>>>>        .../drm/amd/amdkfd/kfd_device_queue_manager.c | 47 +++++++++++++++++--
>>>>>>        .../drm/amd/amdkfd/kfd_device_queue_manager.h |  1 +
>>>>>>        4 files changed, 46 insertions(+), 13 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>>> index d9e36dbf13d5..40d75c39f08e 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>>> @@ -120,13 +120,13 @@ static int kfd_open(struct inode *inode, struct file *filep)
>>>>>>        		return -EPERM;
>>>>>>        	}
>>>>>>        
>>>>>> +	if (kfd_is_locked())
>>>>>> +		return -EAGAIN;
>>>>>> +
>>>>>>        	process = kfd_create_process(filep);
>>>>>>        	if (IS_ERR(process))
>>>>>>        		return PTR_ERR(process);
>>>>>>        
>>>>>> -	if (kfd_is_locked())
>>>>>> -		return -EAGAIN;
>>>>>> -
>>>>>>        	dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n",
>>>>>>        		process->pasid, process->is_32bit_user_mode);
>>>>>>        
>>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>>>>> index 8f4b24e84964..4fa8834ce7cb 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>>>>> @@ -730,9 +730,6 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
>>>>>>        		return 0;
>>>>>>        	kgd2kfd_suspend(kfd);
>>>>>>        
>>>>>> -	/* hold dqm->lock to prevent further execution*/
>>>>>> -	dqm_lock(kfd->dqm);
>>>>>> -
>>>>>>        	kfd_signal_reset_event(kfd);
>>>>>>        	return 0;
>>>>>>        }
>>>>>> @@ -750,8 +747,6 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
>>>>>>        	if (!kfd->init_complete)
>>>>>>        		return 0;
>>>>>>        
>>>>>> -	dqm_unlock(kfd->dqm);
>>>>>> -
>>>>>>        	ret = kfd_resume(kfd);
>>>>>>        	if (ret)
>>>>>>        		return ret;
>>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>>> index 81fb545cf42c..82e1c6280d13 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>>> @@ -340,6 +340,10 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>>>>>>        	mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj,
>>>>>>        				&q->gart_mqd_addr, &q->properties);
>>>>>>        	if (q->properties.is_active) {
>>>>>> +		if (!dqm->sched_running) {
>>>>>> +			WARN_ONCE(1, "Load non-HWS mqd while stopped\n");
>>>>>> +			goto add_queue_to_list;
>>>>>> +		}
>>>>>>        
>>>>>>        		if (WARN(q->process->mm != current->mm,
>>>>>>        					"should only run in user thread"))
>>>>>> @@ -351,6 +355,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>>>>>>        			goto out_free_mqd;
>>>>>>        	}
>>>>>>        
>>>>>> +add_queue_to_list:
>>>>>>        	list_add(&q->list, &qpd->queues_list);
>>>>>>        	qpd->queue_count++;
>>>>>>        	if (q->properties.is_active)
>>>>>> @@ -458,6 +463,11 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
>>>>>>        
>>>>>>        	deallocate_doorbell(qpd, q);
>>>>>>        
>>>>>> +	if (!dqm->sched_running) {
>>>>>> +		WARN_ONCE(1, "Destroy non-HWS queue while stopped\n");
>>>>>> +		return 0;
>>>>>> +	}
>>>>>> +
>>>>>>        	retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
>>>>>>        				KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
>>>>>>        				KFD_UNMAP_LATENCY_MS,
>>>>>> @@ -533,6 +543,12 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
>>>>>>        		   (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
>>>>>>        		    q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>>>>>>        		    q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
>>>>>> +
>>>>>> +		if (!dqm->sched_running) {
>>>>>> +			WARN_ONCE(1, "Update non-HWS queue while stopped\n");
>>>>>> +			goto out_unlock;
>>>>>> +		}
>>>>>> +
>>>>>>        		retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
>>>>>>        				KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
>>>>>>        				KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
>>>>>> @@ -602,6 +618,11 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
>>>>>>        		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
>>>>>>        				q->properties.type)];
>>>>>>        		q->properties.is_active = false;
>>>>>> +		dqm->queue_count--;
>>>>>> +
>>>>>> +		if (!dqm->sched_running)
>>>>>> +			continue;
>>>>>> +
>>>>>>        		retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
>>>>>>        				KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
>>>>>>        				KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
>>>>>> @@ -610,7 +631,6 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
>>>>>>        			 * maintain a consistent eviction state
>>>>>>        			 */
>>>>>>        			ret = retval;
>>>>>> -		dqm->queue_count--;
>>>>>>        	}
>>>>>>        
>>>>>>        out:
>>>>>> @@ -711,6 +731,11 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
>>>>>>        		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
>>>>>>        				q->properties.type)];
>>>>>>        		q->properties.is_active = true;
>>>>>> +		dqm->queue_count++;
>>>>>> +
>>>>>> +		if (!dqm->sched_running)
>>>>>> +			continue;
>>>>>> +
>>>>>>        		retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe,
>>>>>>        				       q->queue, &q->properties, mm);
>>>>>>        		if (retval && !ret)
>>>>>> @@ -718,7 +743,6 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
>>>>>>        			 * maintain a consistent eviction state
>>>>>>        			 */
>>>>>>        			ret = retval;
>>>>>> -		dqm->queue_count++;
>>>>>>        	}
>>>>>>        	qpd->evicted = 0;
>>>>>>        out:
>>>>>> @@ -915,7 +939,8 @@ static int start_nocpsch(struct device_queue_manager *dqm)
>>>>>>        	
>>>>>>        	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>>>>>>        		return pm_init(&dqm->packets, dqm);
>>>>>> -	
>>>>>> +	dqm->sched_running = true;
>>>>>> +
>>>>>>        	return 0;
>>>>>>        }
>>>>>>        
>>>>>> @@ -923,7 +948,8 @@ static int stop_nocpsch(struct device_queue_manager *dqm)
>>>>>>        {
>>>>>>        	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>>>>>>        		pm_uninit(&dqm->packets);
>>>>>> -	
>>>>>> +	dqm->sched_running = false;
>>>>>> +
>>>>>>        	return 0;
>>>>>>        }
>>>>>>        
>>>>>> @@ -1074,6 +1100,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
>>>>>>        	dqm_lock(dqm);
>>>>>>        	/* clear hang status when driver try to start the hw scheduler */
>>>>>>        	dqm->is_hws_hang = false;
>>>>>> +	dqm->sched_running = true;
>>>>>>        	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
>>>>>>        	dqm_unlock(dqm);
>>>>>>        
>>>>>> @@ -1089,6 +1116,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)
>>>>>>        {
>>>>>>        	dqm_lock(dqm);
>>>>>>        	unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>>>>>> +	dqm->sched_running = false;
>>>>>>        	dqm_unlock(dqm);
>>>>>>        
>>>>>>        	kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
>>>>>> @@ -1275,9 +1303,10 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
>>>>>>        {
>>>>>>        	int retval;
>>>>>>        
>>>>>> +	if (!dqm->sched_running)
>>>>>> +		return 0;
>>>>>>        	if (dqm->queue_count <= 0 || dqm->processes_count <= 0)
>>>>>>        		return 0;
>>>>>> -
>>>>>>        	if (dqm->active_runlist)
>>>>>>        		return 0;
>>>>>>        
>>>>>> @@ -1299,6 +1328,8 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
>>>>>>        {
>>>>>>        	int retval = 0;
>>>>>>        
>>>>>> +	if (!dqm->sched_running)
>>>>>> +		return 0;
>>>>>>        	if (dqm->is_hws_hang)
>>>>>>        		return -EIO;
>>>>>>        	if (!dqm->active_runlist)
>>>>>> @@ -1903,6 +1934,12 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data)
>>>>>>        	int pipe, queue;
>>>>>>        	int r = 0;
>>>>>>        
>>>>>> +	if (!dqm->sched_running) {
>>>>>> +		seq_printf(m, " Device is stopped\n");
>>>>>> +
>>>>>> +		return 0;
>>>>>> +	}
>>>>>> +
>>>>>>        	r = dqm->dev->kfd2kgd->hqd_dump(dqm->dev->kgd,
>>>>>>        					KFD_CIK_HIQ_PIPE, KFD_CIK_HIQ_QUEUE,
>>>>>>        					&dump, &n_regs);
>>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>>>>>> index 2eaea6b04cbe..a8c37e6da027 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
>>>>>> @@ -201,6 +201,7 @@ struct device_queue_manager {
>>>>>>        	bool			is_hws_hang;
>>>>>>        	struct work_struct	hw_exception_work;
>>>>>>        	struct kfd_mem_obj	hiq_sdma_mqd;
>>>>>> +	bool			sched_running;
>>>>>>        };
>>>>>>        
>>>>>>        void device_queue_manager_init_cik(
>>>>> _______________________________________________
>>>>> amd-gfx mailing list
>>>>> amd-gfx@lists.freedesktop.org
>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2019-10-22 20:19 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-10-22 18:28 [PATCH v2] drm/amdkfd: don't use dqm lock during device reset/suspend/resume Yang, Philip
     [not found] ` <20191022182804.23845-1-Philip.Yang-5C7GfCeVMHo@public.gmane.org>
2019-10-22 18:38   ` Grodzovsky, Andrey
     [not found]     ` <83bfc853-cb0e-4285-d310-232683130751-5C7GfCeVMHo@public.gmane.org>
2019-10-22 18:40       ` Grodzovsky, Andrey
     [not found]         ` <b7c3ad13-0b36-5fbb-5079-4a72a8dd853a-5C7GfCeVMHo@public.gmane.org>
2019-10-22 19:19           ` Yang, Philip
     [not found]             ` <92e6ce2b-5a2c-62ef-9c03-51c3ea28be1e-5C7GfCeVMHo@public.gmane.org>
2019-10-22 19:36               ` Grodzovsky, Andrey
     [not found]                 ` <b51add2e-ded9-0de0-962b-96cf458340be-5C7GfCeVMHo@public.gmane.org>
2019-10-22 20:04                   ` Yang, Philip
     [not found]                     ` <a400a3cf-b8e2-18d7-7460-14c304a9c7d4-5C7GfCeVMHo@public.gmane.org>
2019-10-22 20:19                       ` Grodzovsky, Andrey
2019-10-22 18:44   ` Kuehling, Felix
     [not found]     ` <558b2dd3-8095-81e0-547b-788b74c9b329-5C7GfCeVMHo@public.gmane.org>
2019-10-22 19:52       ` Yang, Philip

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.