All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/4] drm/amdkfd: Fix permissions of hang_hws
@ 2019-12-20  8:29 Felix Kuehling
  2019-12-20  8:29 ` [PATCH 2/4] drm/amdkfd: Remove unused variable Felix Kuehling
                   ` (3 more replies)
  0 siblings, 4 replies; 17+ messages in thread
From: Felix Kuehling @ 2019-12-20  8:29 UTC (permalink / raw)
  To: amd-gfx

Reading from /sys/kernel/debug/kfd/hang_hws would cause a kernel
oops because we didn't implement a read callback. Set the permission
to write-only to prevent that.

Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
index 15c523027285..511712c2e382 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
@@ -93,7 +93,7 @@ void kfd_debugfs_init(void)
 			    kfd_debugfs_hqds_by_device, &kfd_debugfs_fops);
 	debugfs_create_file("rls", S_IFREG | 0444, debugfs_root,
 			    kfd_debugfs_rls_by_device, &kfd_debugfs_fops);
-	debugfs_create_file("hang_hws", S_IFREG | 0644, debugfs_root,
+	debugfs_create_file("hang_hws", S_IFREG | 0200, debugfs_root,
 			    NULL, &kfd_debugfs_hang_hws_fops);
 }
 
-- 
2.24.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 2/4] drm/amdkfd: Remove unused variable
  2019-12-20  8:29 [PATCH 1/4] drm/amdkfd: Fix permissions of hang_hws Felix Kuehling
@ 2019-12-20  8:29 ` Felix Kuehling
  2020-01-02 13:40   ` Russell, Kent
  2019-12-20  8:30 ` [PATCH 3/4] drm/amdkfd: Improve HWS hang detection and handling Felix Kuehling
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 17+ messages in thread
From: Felix Kuehling @ 2019-12-20  8:29 UTC (permalink / raw)
  To: amd-gfx

dqm->pipeline_mem wasn't used anywhere.

Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 1 -
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | 1 -
 2 files changed, 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index f7f6df40875e..558c0ad81848 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -930,7 +930,6 @@ static void uninitialize(struct device_queue_manager *dqm)
 	for (i = 0 ; i < KFD_MQD_TYPE_MAX ; i++)
 		kfree(dqm->mqd_mgrs[i]);
 	mutex_destroy(&dqm->lock_hidden);
-	kfd_gtt_sa_free(dqm->dev, dqm->pipeline_mem);
 }
 
 static int start_nocpsch(struct device_queue_manager *dqm)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index a8c37e6da027..8991120c4fa2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -190,7 +190,6 @@ struct device_queue_manager {
 	/* the pasid mapping for each kfd vmid */
 	uint16_t		vmid_pasid[VMID_NUM];
 	uint64_t		pipelines_addr;
-	struct kfd_mem_obj	*pipeline_mem;
 	uint64_t		fence_gpu_addr;
 	unsigned int		*fence_addr;
 	struct kfd_mem_obj	*fence_mem;
-- 
2.24.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 3/4] drm/amdkfd: Improve HWS hang detection and handling
  2019-12-20  8:29 [PATCH 1/4] drm/amdkfd: Fix permissions of hang_hws Felix Kuehling
  2019-12-20  8:29 ` [PATCH 2/4] drm/amdkfd: Remove unused variable Felix Kuehling
@ 2019-12-20  8:30 ` Felix Kuehling
  2019-12-20 17:18   ` Zeng, Oak
  2019-12-20  8:30 ` [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch Felix Kuehling
  2020-01-02 13:39 ` [PATCH 1/4] drm/amdkfd: Fix permissions of hang_hws Russell, Kent
  3 siblings, 1 reply; 17+ messages in thread
From: Felix Kuehling @ 2019-12-20  8:30 UTC (permalink / raw)
  To: amd-gfx

Move HWS hang detection into unmap_queues_cpsch to catch hangs in all
cases. If this happens during a reset, don't schedule another reset
because the reset already in progress is expected to take care of it.

Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c       |  3 +++
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 27 ++++++++++++++-----
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |  2 ++
 3 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index c6b6901bbda3..2a9e40131735 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -728,6 +728,9 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
 {
 	if (!kfd->init_complete)
 		return 0;
+
+	kfd->dqm->ops.pre_reset(kfd->dqm);
+
 	kgd2kfd_suspend(kfd);
 
 	kfd_signal_reset_event(kfd);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 558c0ad81848..a7e9ec1b3ce3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -952,6 +952,13 @@ static int stop_nocpsch(struct device_queue_manager *dqm)
 	return 0;
 }
 
+static void pre_reset(struct device_queue_manager *dqm)
+{
+	dqm_lock(dqm);
+	dqm->is_resetting = true;
+	dqm_unlock(dqm);
+}
+
 static int allocate_sdma_queue(struct device_queue_manager *dqm,
 				struct queue *q)
 {
@@ -1099,6 +1106,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
 	dqm_lock(dqm);
 	/* clear hang status when driver try to start the hw scheduler */
 	dqm->is_hws_hang = false;
+	dqm->is_resetting = false;
 	dqm->sched_running = true;
 	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
 	dqm_unlock(dqm);
@@ -1351,8 +1359,17 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 	/* should be timed out */
 	retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED,
 				queue_preemption_timeout_ms);
-	if (retval)
+	if (retval) {
+		pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
+		dqm->is_hws_hang = true;
+		/* It's possible we're detecting a HWS hang in the
+		 * middle of a GPU reset. No need to schedule another
+		 * reset in this case.
+		 */
+		if (!dqm->is_resetting)
+			schedule_work(&dqm->hw_exception_work);
 		return retval;
+	}
 
 	pm_release_ib(&dqm->packets);
 	dqm->active_runlist = false;
@@ -1370,12 +1387,8 @@ static int execute_queues_cpsch(struct device_queue_manager *dqm,
 	if (dqm->is_hws_hang)
 		return -EIO;
 	retval = unmap_queues_cpsch(dqm, filter, filter_param);
-	if (retval) {
-		pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
-		dqm->is_hws_hang = true;
-		schedule_work(&dqm->hw_exception_work);
+	if (retval)
 		return retval;
-	}
 
 	return map_queues_cpsch(dqm);
 }
@@ -1769,6 +1782,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
 		dqm->ops.initialize = initialize_cpsch;
 		dqm->ops.start = start_cpsch;
 		dqm->ops.stop = stop_cpsch;
+		dqm->ops.pre_reset = pre_reset;
 		dqm->ops.destroy_queue = destroy_queue_cpsch;
 		dqm->ops.update_queue = update_queue;
 		dqm->ops.register_process = register_process;
@@ -1787,6 +1801,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
 		/* initialize dqm for no cp scheduling */
 		dqm->ops.start = start_nocpsch;
 		dqm->ops.stop = stop_nocpsch;
+		dqm->ops.pre_reset = pre_reset;
 		dqm->ops.create_queue = create_queue_nocpsch;
 		dqm->ops.destroy_queue = destroy_queue_nocpsch;
 		dqm->ops.update_queue = update_queue;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 8991120c4fa2..871d3b628d2d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -104,6 +104,7 @@ struct device_queue_manager_ops {
 	int	(*initialize)(struct device_queue_manager *dqm);
 	int	(*start)(struct device_queue_manager *dqm);
 	int	(*stop)(struct device_queue_manager *dqm);
+	void	(*pre_reset)(struct device_queue_manager *dqm);
 	void	(*uninitialize)(struct device_queue_manager *dqm);
 	int	(*create_kernel_queue)(struct device_queue_manager *dqm,
 					struct kernel_queue *kq,
@@ -198,6 +199,7 @@ struct device_queue_manager {
 
 	/* hw exception  */
 	bool			is_hws_hang;
+	bool			is_resetting;
 	struct work_struct	hw_exception_work;
 	struct kfd_mem_obj	hiq_sdma_mqd;
 	bool			sched_running;
-- 
2.24.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch
  2019-12-20  8:29 [PATCH 1/4] drm/amdkfd: Fix permissions of hang_hws Felix Kuehling
  2019-12-20  8:29 ` [PATCH 2/4] drm/amdkfd: Remove unused variable Felix Kuehling
  2019-12-20  8:30 ` [PATCH 3/4] drm/amdkfd: Improve HWS hang detection and handling Felix Kuehling
@ 2019-12-20  8:30 ` Felix Kuehling
  2019-12-20 10:03   ` Deng, Emily
                     ` (2 more replies)
  2020-01-02 13:39 ` [PATCH 1/4] drm/amdkfd: Fix permissions of hang_hws Russell, Kent
  3 siblings, 3 replies; 17+ messages in thread
From: Felix Kuehling @ 2019-12-20  8:30 UTC (permalink / raw)
  To: amd-gfx

Don't use the HWS if it's known to be hanging. In a reset also
don't try to destroy the HIQ because that may hang on SRIOV if the
KIQ is unresponsive.

Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c    | 12 ++++++++----
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c        |  8 ++++----
 drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c      |  4 ++--
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h                |  4 ++--
 .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c   |  2 +-
 5 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index a7e9ec1b3ce3..d7eb6ac37f62 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -946,7 +946,7 @@ static int start_nocpsch(struct device_queue_manager *dqm)
 static int stop_nocpsch(struct device_queue_manager *dqm)
 {
 	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
-		pm_uninit(&dqm->packets);
+		pm_uninit(&dqm->packets, false);
 	dqm->sched_running = false;
 
 	return 0;
@@ -1114,20 +1114,24 @@ static int start_cpsch(struct device_queue_manager *dqm)
 	return 0;
 fail_allocate_vidmem:
 fail_set_sched_resources:
-	pm_uninit(&dqm->packets);
+	pm_uninit(&dqm->packets, false);
 fail_packet_manager_init:
 	return retval;
 }
 
 static int stop_cpsch(struct device_queue_manager *dqm)
 {
+	bool hanging;
+
 	dqm_lock(dqm);
-	unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
+	if (!dqm->is_hws_hang)
+		unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
+	hanging = dqm->is_hws_hang || dqm->is_resetting;
 	dqm->sched_running = false;
 	dqm_unlock(dqm);
 
 	kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
-	pm_uninit(&dqm->packets);
+	pm_uninit(&dqm->packets, hanging);
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
index 2d56dc534459..bae706462f96 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
@@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue *kq, struct kfd_dev *dev,
 }
 
 /* Uninitialize a kernel queue and free all its memory usages. */
-static void kq_uninitialize(struct kernel_queue *kq)
+static void kq_uninitialize(struct kernel_queue *kq, bool hanging)
 {
-	if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
+	if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !hanging)
 		kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
 					kq->queue->mqd,
 					KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
@@ -337,9 +337,9 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
 	return NULL;
 }
 
-void kernel_queue_uninit(struct kernel_queue *kq)
+void kernel_queue_uninit(struct kernel_queue *kq, bool hanging)
 {
-	kq_uninitialize(kq);
+	kq_uninitialize(kq, hanging);
 	kfree(kq);
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
index 6cabed06ef5d..dc406e6dee23 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
@@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
 	return 0;
 }
 
-void pm_uninit(struct packet_manager *pm)
+void pm_uninit(struct packet_manager *pm, bool hanging)
 {
 	mutex_destroy(&pm->lock);
-	kernel_queue_uninit(pm->priv_queue);
+	kernel_queue_uninit(pm->priv_queue, hanging);
 }
 
 int pm_send_set_resources(struct packet_manager *pm,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 087e96838997..8ac680dc90f1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -883,7 +883,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev);
 void device_queue_manager_uninit(struct device_queue_manager *dqm);
 struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
 					enum kfd_queue_type type);
-void kernel_queue_uninit(struct kernel_queue *kq);
+void kernel_queue_uninit(struct kernel_queue *kq, bool hanging);
 int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid);
 
 /* Process Queue Manager */
@@ -974,7 +974,7 @@ extern const struct packet_manager_funcs kfd_vi_pm_funcs;
 extern const struct packet_manager_funcs kfd_v9_pm_funcs;
 
 int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm);
-void pm_uninit(struct packet_manager *pm);
+void pm_uninit(struct packet_manager *pm, bool hanging);
 int pm_send_set_resources(struct packet_manager *pm,
 				struct scheduling_resources *res);
 int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index d3eacf72e8db..8fa856e6a03f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -374,7 +374,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
 		/* destroy kernel queue (DIQ) */
 		dqm = pqn->kq->dev->dqm;
 		dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
-		kernel_queue_uninit(pqn->kq);
+		kernel_queue_uninit(pqn->kq, false);
 	}
 
 	if (pqn->q) {
-- 
2.24.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* RE: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch
  2019-12-20  8:30 ` [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch Felix Kuehling
@ 2019-12-20 10:03   ` Deng, Emily
  2019-12-20 15:56   ` shaoyunl
  2019-12-20 17:22   ` Zeng, Oak
  2 siblings, 0 replies; 17+ messages in thread
From: Deng, Emily @ 2019-12-20 10:03 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx

[AMD Official Use Only - Internal Distribution Only]

Series Tested-by:  Emily Deng <Emily.Deng@amd.com> on sriov environment with vega10 about TDR-1, TDR-2 and TDR-3 test cases.

Best wishes
Emily Deng



>-----Original Message-----
>From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Felix
>Kuehling
>Sent: Friday, December 20, 2019 4:30 PM
>To: amd-gfx@lists.freedesktop.org
>Subject: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch
>
>Don't use the HWS if it's known to be hanging. In a reset also don't try to
>destroy the HIQ because that may hang on SRIOV if the KIQ is unresponsive.
>
>Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
>---
> .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c    | 12 ++++++++----
> drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c        |  8 ++++----
> drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c      |  4 ++--
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h                |  4 ++--
> .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c   |  2 +-
> 5 files changed, 17 insertions(+), 13 deletions(-)
>
>diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>index a7e9ec1b3ce3..d7eb6ac37f62 100644
>--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>@@ -946,7 +946,7 @@ static int start_nocpsch(struct device_queue_manager
>*dqm)  static int stop_nocpsch(struct device_queue_manager *dqm)  {
> 	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>-		pm_uninit(&dqm->packets);
>+		pm_uninit(&dqm->packets, false);
> 	dqm->sched_running = false;
>
> 	return 0;
>@@ -1114,20 +1114,24 @@ static int start_cpsch(struct
>device_queue_manager *dqm)
> 	return 0;
> fail_allocate_vidmem:
> fail_set_sched_resources:
>-	pm_uninit(&dqm->packets);
>+	pm_uninit(&dqm->packets, false);
> fail_packet_manager_init:
> 	return retval;
> }
>
> static int stop_cpsch(struct device_queue_manager *dqm)  {
>+	bool hanging;
>+
> 	dqm_lock(dqm);
>-	unmap_queues_cpsch(dqm,
>KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>+	if (!dqm->is_hws_hang)
>+		unmap_queues_cpsch(dqm,
>KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>+	hanging = dqm->is_hws_hang || dqm->is_resetting;
> 	dqm->sched_running = false;
> 	dqm_unlock(dqm);
>
> 	kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
>-	pm_uninit(&dqm->packets);
>+	pm_uninit(&dqm->packets, hanging);
>
> 	return 0;
> }
>diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>index 2d56dc534459..bae706462f96 100644
>--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>@@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue *kq, struct
>kfd_dev *dev,  }
>
> /* Uninitialize a kernel queue and free all its memory usages. */ -static void
>kq_uninitialize(struct kernel_queue *kq)
>+static void kq_uninitialize(struct kernel_queue *kq, bool hanging)
> {
>-	if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
>+	if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !hanging)
> 		kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
> 					kq->queue->mqd,
>
>	KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
>@@ -337,9 +337,9 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev
>*dev,
> 	return NULL;
> }
>
>-void kernel_queue_uninit(struct kernel_queue *kq)
>+void kernel_queue_uninit(struct kernel_queue *kq, bool hanging)
> {
>-	kq_uninitialize(kq);
>+	kq_uninitialize(kq, hanging);
> 	kfree(kq);
> }
>
>diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>index 6cabed06ef5d..dc406e6dee23 100644
>--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>@@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm, struct
>device_queue_manager *dqm)
> 	return 0;
> }
>
>-void pm_uninit(struct packet_manager *pm)
>+void pm_uninit(struct packet_manager *pm, bool hanging)
> {
> 	mutex_destroy(&pm->lock);
>-	kernel_queue_uninit(pm->priv_queue);
>+	kernel_queue_uninit(pm->priv_queue, hanging);
> }
>
> int pm_send_set_resources(struct packet_manager *pm, diff --git
>a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>index 087e96838997..8ac680dc90f1 100644
>--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>@@ -883,7 +883,7 @@ struct device_queue_manager
>*device_queue_manager_init(struct kfd_dev *dev);  void
>device_queue_manager_uninit(struct device_queue_manager *dqm);  struct
>kernel_queue *kernel_queue_init(struct kfd_dev *dev,
> 					enum kfd_queue_type type);
>-void kernel_queue_uninit(struct kernel_queue *kq);
>+void kernel_queue_uninit(struct kernel_queue *kq, bool hanging);
> int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int
>pasid);
>
> /* Process Queue Manager */
>@@ -974,7 +974,7 @@ extern const struct packet_manager_funcs
>kfd_vi_pm_funcs;  extern const struct packet_manager_funcs kfd_v9_pm_funcs;
>
> int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm);
>-void pm_uninit(struct packet_manager *pm);
>+void pm_uninit(struct packet_manager *pm, bool hanging);
> int pm_send_set_resources(struct packet_manager *pm,
> 				struct scheduling_resources *res);
> int pm_send_runlist(struct packet_manager *pm, struct list_head
>*dqm_queues); diff --git
>a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>index d3eacf72e8db..8fa856e6a03f 100644
>--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>@@ -374,7 +374,7 @@ int pqm_destroy_queue(struct
>process_queue_manager *pqm, unsigned int qid)
> 		/* destroy kernel queue (DIQ) */
> 		dqm = pqn->kq->dev->dqm;
> 		dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
>-		kernel_queue_uninit(pqn->kq);
>+		kernel_queue_uninit(pqn->kq, false);
> 	}
>
> 	if (pqn->q) {
>--
>2.24.1
>
>_______________________________________________
>amd-gfx mailing list
>amd-gfx@lists.freedesktop.org
>https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.fre
>edesktop.org%2Fmailman%2Flistinfo%2Famd-
>gfx&amp;data=02%7C01%7CEmily.Deng%40amd.com%7C3c77bba4d40d4bc6b
>e8508d78526dd45%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C
>637124274794842900&amp;sdata=vHNAs2FTkSpHYZ2TTux%2F66attN4lf5qSiP
>jnlBOM5y0%3D&amp;reserved=0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch
  2019-12-20  8:30 ` [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch Felix Kuehling
  2019-12-20 10:03   ` Deng, Emily
@ 2019-12-20 15:56   ` shaoyunl
  2019-12-20 16:33     ` Felix Kuehling
  2019-12-20 17:22   ` Zeng, Oak
  2 siblings, 1 reply; 17+ messages in thread
From: shaoyunl @ 2019-12-20 15:56 UTC (permalink / raw)
  To: amd-gfx

Looks like patch 2 is not related to this series, but anyway.

Patch 1,2,3 are reviewed by shaoyunl  <shaoyun.liu@amd.com>

For patch 4 ,  is it possible we directly check dqm->is_hws_hang || 
dqm->is_resetting  inside function kq_uninitialize.  so we don't need 
other interface change .

I think even Inside that kq_uninitialize function , we still can get dqm 
as  kq->dev->dqm .


shaoyun.liu


On 2019-12-20 3:30 a.m., Felix Kuehling wrote:
> Don't use the HWS if it's known to be hanging. In a reset also
> don't try to destroy the HIQ because that may hang on SRIOV if the
> KIQ is unresponsive.
>
> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
> ---
>   .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c    | 12 ++++++++----
>   drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c        |  8 ++++----
>   drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c      |  4 ++--
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h                |  4 ++--
>   .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c   |  2 +-
>   5 files changed, 17 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index a7e9ec1b3ce3..d7eb6ac37f62 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -946,7 +946,7 @@ static int start_nocpsch(struct device_queue_manager *dqm)
>   static int stop_nocpsch(struct device_queue_manager *dqm)
>   {
>   	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
> -		pm_uninit(&dqm->packets);
> +		pm_uninit(&dqm->packets, false);
>   	dqm->sched_running = false;
>   
>   	return 0;
> @@ -1114,20 +1114,24 @@ static int start_cpsch(struct device_queue_manager *dqm)
>   	return 0;
>   fail_allocate_vidmem:
>   fail_set_sched_resources:
> -	pm_uninit(&dqm->packets);
> +	pm_uninit(&dqm->packets, false);
>   fail_packet_manager_init:
>   	return retval;
>   }
>   
>   static int stop_cpsch(struct device_queue_manager *dqm)
>   {
> +	bool hanging;
> +
>
>   	dqm_lock(dqm);
> -	unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
> +	if (!dqm->is_hws_hang)
> +		unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
> +	hanging = dqm->is_hws_hang || dqm->is_resetting;
>   	dqm->sched_running = false;
>   	dqm_unlock(dqm);
>   
>   	kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
> -	pm_uninit(&dqm->packets);
> +	pm_uninit(&dqm->packets, hanging);
>   
>   	return 0;
>   }
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> index 2d56dc534459..bae706462f96 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> @@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue *kq, struct kfd_dev *dev,
>   }
>   
>   /* Uninitialize a kernel queue and free all its memory usages. */
> -static void kq_uninitialize(struct kernel_queue *kq)
> +static void kq_uninitialize(struct kernel_queue *kq, bool hanging)
>   {
> -	if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
> +	if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !hanging)
>   		kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
>   					kq->queue->mqd,
>   					KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
> @@ -337,9 +337,9 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
>   	return NULL;
>   }
>   
> -void kernel_queue_uninit(struct kernel_queue *kq)
> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging)
>   {
> -	kq_uninitialize(kq);
> +	kq_uninitialize(kq, hanging);
>   	kfree(kq);
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> index 6cabed06ef5d..dc406e6dee23 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> @@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
>   	return 0;
>   }
>   
> -void pm_uninit(struct packet_manager *pm)
> +void pm_uninit(struct packet_manager *pm, bool hanging)
>   {
>   	mutex_destroy(&pm->lock);
> -	kernel_queue_uninit(pm->priv_queue);
> +	kernel_queue_uninit(pm->priv_queue, hanging);
>   }
>   
>   int pm_send_set_resources(struct packet_manager *pm,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 087e96838997..8ac680dc90f1 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -883,7 +883,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev);
>   void device_queue_manager_uninit(struct device_queue_manager *dqm);
>   struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
>   					enum kfd_queue_type type);
> -void kernel_queue_uninit(struct kernel_queue *kq);
> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging);
>   int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid);
>   
>   /* Process Queue Manager */
> @@ -974,7 +974,7 @@ extern const struct packet_manager_funcs kfd_vi_pm_funcs;
>   extern const struct packet_manager_funcs kfd_v9_pm_funcs;
>   
>   int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm);
> -void pm_uninit(struct packet_manager *pm);
> +void pm_uninit(struct packet_manager *pm, bool hanging);
>   int pm_send_set_resources(struct packet_manager *pm,
>   				struct scheduling_resources *res);
>   int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index d3eacf72e8db..8fa856e6a03f 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -374,7 +374,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
>   		/* destroy kernel queue (DIQ) */
>   		dqm = pqn->kq->dev->dqm;
>   		dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
> -		kernel_queue_uninit(pqn->kq);
> +		kernel_queue_uninit(pqn->kq, false);
>   	}
>   
>   	if (pqn->q) {
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch
  2019-12-20 15:56   ` shaoyunl
@ 2019-12-20 16:33     ` Felix Kuehling
  2019-12-20 19:31       ` shaoyunl
  0 siblings, 1 reply; 17+ messages in thread
From: Felix Kuehling @ 2019-12-20 16:33 UTC (permalink / raw)
  To: shaoyunl, amd-gfx

dqm->is_hws_hang is protected by the DQM lock. kq_uninitialize runs 
outside that lock protection. Therefore I opted to pass in the hanging 
flag as a parameter. It also keeps the logic that decides all of that 
inside the device queue manager, which I think is cleaner.

I was trying to clean this up further by moving the pm_init/pm_uninit 
out of the start_cpsch/stop_cpsch sequence, but gave up on that idea 
when I found out that I can't create the kernel queue in the DQM 
initialize function because dev->dqm isn't initialized at that time yet.

Regards,
   Felix

On 2019-12-20 10:56, shaoyunl wrote:
> Looks like patch 2 is not related to this series , but anyway .
>
> Patch 1,2,3 are reviewed by shaoyunl  <shaoyun.liu@amd.com>
>
> For patch 4 ,  is it possible we directly check dqm->is_hws_hang || 
> dqm->is_resetting  inside function kq_uninitialize.  so we don't need 
> other interface change .
>
> I think even Inside that kq_uninitialize function , we still can get 
> dqm as  kq->dev->dqm .
>
>
> shaoyun.liu
>
>
> On 2019-12-20 3:30 a.m., Felix Kuehling wrote:
>> Don't use the HWS if it's known to be hanging. In a reset also
>> don't try to destroy the HIQ because that may hang on SRIOV if the
>> KIQ is unresponsive.
>>
>> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
>> ---
>>   .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c    | 12 ++++++++----
>>   drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c        |  8 ++++----
>>   drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c      |  4 ++--
>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h                |  4 ++--
>>   .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c   |  2 +-
>>   5 files changed, 17 insertions(+), 13 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> index a7e9ec1b3ce3..d7eb6ac37f62 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> @@ -946,7 +946,7 @@ static int start_nocpsch(struct 
>> device_queue_manager *dqm)
>>   static int stop_nocpsch(struct device_queue_manager *dqm)
>>   {
>>       if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>> -        pm_uninit(&dqm->packets);
>> +        pm_uninit(&dqm->packets, false);
>>       dqm->sched_running = false;
>>         return 0;
>> @@ -1114,20 +1114,24 @@ static int start_cpsch(struct 
>> device_queue_manager *dqm)
>>       return 0;
>>   fail_allocate_vidmem:
>>   fail_set_sched_resources:
>> -    pm_uninit(&dqm->packets);
>> +    pm_uninit(&dqm->packets, false);
>>   fail_packet_manager_init:
>>       return retval;
>>   }
>>     static int stop_cpsch(struct device_queue_manager *dqm)
>>   {
>> +    bool hanging;
>> +
>>
>>       dqm_lock(dqm);
>> -    unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>> +    if (!dqm->is_hws_hang)
>> +        unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>> +    hanging = dqm->is_hws_hang || dqm->is_resetting;
>>       dqm->sched_running = false;
>>       dqm_unlock(dqm);
>>         kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
>> -    pm_uninit(&dqm->packets);
>> +    pm_uninit(&dqm->packets, hanging);
>>         return 0;
>>   }
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>> index 2d56dc534459..bae706462f96 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>> @@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue 
>> *kq, struct kfd_dev *dev,
>>   }
>>     /* Uninitialize a kernel queue and free all its memory usages. */
>> -static void kq_uninitialize(struct kernel_queue *kq)
>> +static void kq_uninitialize(struct kernel_queue *kq, bool hanging)
>>   {
>> -    if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
>> +    if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !hanging)
>>           kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
>>                       kq->queue->mqd,
>>                       KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
>> @@ -337,9 +337,9 @@ struct kernel_queue *kernel_queue_init(struct 
>> kfd_dev *dev,
>>       return NULL;
>>   }
>>   -void kernel_queue_uninit(struct kernel_queue *kq)
>> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging)
>>   {
>> -    kq_uninitialize(kq);
>> +    kq_uninitialize(kq, hanging);
>>       kfree(kq);
>>   }
>>   diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>> index 6cabed06ef5d..dc406e6dee23 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>> @@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm, struct 
>> device_queue_manager *dqm)
>>       return 0;
>>   }
>>   -void pm_uninit(struct packet_manager *pm)
>> +void pm_uninit(struct packet_manager *pm, bool hanging)
>>   {
>>       mutex_destroy(&pm->lock);
>> -    kernel_queue_uninit(pm->priv_queue);
>> +    kernel_queue_uninit(pm->priv_queue, hanging);
>>   }
>>     int pm_send_set_resources(struct packet_manager *pm,
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> index 087e96838997..8ac680dc90f1 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> @@ -883,7 +883,7 @@ struct device_queue_manager 
>> *device_queue_manager_init(struct kfd_dev *dev);
>>   void device_queue_manager_uninit(struct device_queue_manager *dqm);
>>   struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
>>                       enum kfd_queue_type type);
>> -void kernel_queue_uninit(struct kernel_queue *kq);
>> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging);
>>   int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned 
>> int pasid);
>>     /* Process Queue Manager */
>> @@ -974,7 +974,7 @@ extern const struct packet_manager_funcs 
>> kfd_vi_pm_funcs;
>>   extern const struct packet_manager_funcs kfd_v9_pm_funcs;
>>     int pm_init(struct packet_manager *pm, struct 
>> device_queue_manager *dqm);
>> -void pm_uninit(struct packet_manager *pm);
>> +void pm_uninit(struct packet_manager *pm, bool hanging);
>>   int pm_send_set_resources(struct packet_manager *pm,
>>                   struct scheduling_resources *res);
>>   int pm_send_runlist(struct packet_manager *pm, struct list_head 
>> *dqm_queues);
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> index d3eacf72e8db..8fa856e6a03f 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> @@ -374,7 +374,7 @@ int pqm_destroy_queue(struct 
>> process_queue_manager *pqm, unsigned int qid)
>>           /* destroy kernel queue (DIQ) */
>>           dqm = pqn->kq->dev->dqm;
>>           dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
>> -        kernel_queue_uninit(pqn->kq);
>> +        kernel_queue_uninit(pqn->kq, false);
>>       }
>>         if (pqn->q) {
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7Cfelix.kuehling%40amd.com%7C6df6fef2bf6a4208704e08d785652f27%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637124542109969638&amp;sdata=V0mHmgjUSP%2BauYL3r6PGU7aqFTQz8NkKMcuA5vXSkUQ%3D&amp;reserved=0 
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* RE: [PATCH 3/4] drm/amdkfd: Improve HWS hang detection and handling
  2019-12-20  8:30 ` [PATCH 3/4] drm/amdkfd: Improve HWS hang detection and handling Felix Kuehling
@ 2019-12-20 17:18   ` Zeng, Oak
  2019-12-20 17:49     ` Felix Kuehling
  0 siblings, 1 reply; 17+ messages in thread
From: Zeng, Oak @ 2019-12-20 17:18 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx

[AMD Official Use Only - Internal Distribution Only]

With this improvement, it is still possible that two reset be scheduled. There is a period of time after HWS hang and before kfd pre-reset is called, during which, if a thread already passed the is_hws_hang check but was scheduled out, then it will also schedule another reset. The whole sequence is:

Thread 1: call unmap_queues_cpsch, pass the is_hws_hang, scheduled out before sending unmap command to HWS
Thread 2: send unmap to HWS ->hang, schedule a reset
Thread1: before the reset worker thread is run (resetting is still false), thread1 continues, another reset is scheduled.


Regards,
Oak

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Felix Kuehling
Sent: Friday, December 20, 2019 3:30 AM
To: amd-gfx@lists.freedesktop.org
Subject: [PATCH 3/4] drm/amdkfd: Improve HWS hang detection and handling

Move HWS hang detection into unmap_queues_cpsch to catch hangs in all cases. If this happens during a reset, don't schedule another reset because the reset already in progress is expected to take care of it.

Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c       |  3 +++
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 27 ++++++++++++++-----  .../drm/amd/amdkfd/kfd_device_queue_manager.h |  2 ++
 3 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index c6b6901bbda3..2a9e40131735 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -728,6 +728,9 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)  {
 	if (!kfd->init_complete)
 		return 0;
+
+	kfd->dqm->ops.pre_reset(kfd->dqm);
+
 	kgd2kfd_suspend(kfd);
 
 	kfd_signal_reset_event(kfd);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 558c0ad81848..a7e9ec1b3ce3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -952,6 +952,13 @@ static int stop_nocpsch(struct device_queue_manager *dqm)
 	return 0;
 }
 
+static void pre_reset(struct device_queue_manager *dqm) {
+	dqm_lock(dqm);
+	dqm->is_resetting = true;
+	dqm_unlock(dqm);
+}
+
 static int allocate_sdma_queue(struct device_queue_manager *dqm,
 				struct queue *q)
 {
@@ -1099,6 +1106,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
 	dqm_lock(dqm);
 	/* clear hang status when driver try to start the hw scheduler */
 	dqm->is_hws_hang = false;
+	dqm->is_resetting = false;
 	dqm->sched_running = true;
 	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
 	dqm_unlock(dqm);
@@ -1351,8 +1359,17 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 	/* should be timed out */
 	retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED,
 				queue_preemption_timeout_ms);
-	if (retval)
+	if (retval) {
+		pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
+		dqm->is_hws_hang = true;
+		/* It's possible we're detecting a HWS hang in the
+		 * middle of a GPU reset. No need to schedule another
+		 * reset in this case.
+		 */
+		if (!dqm->is_resetting)
+			schedule_work(&dqm->hw_exception_work);
 		return retval;
+	}
 
 	pm_release_ib(&dqm->packets);
 	dqm->active_runlist = false;
@@ -1370,12 +1387,8 @@ static int execute_queues_cpsch(struct device_queue_manager *dqm,
 	if (dqm->is_hws_hang)
 		return -EIO;
 	retval = unmap_queues_cpsch(dqm, filter, filter_param);
-	if (retval) {
-		pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
-		dqm->is_hws_hang = true;
-		schedule_work(&dqm->hw_exception_work);
+	if (retval)
 		return retval;
-	}
 
 	return map_queues_cpsch(dqm);
 }
@@ -1769,6 +1782,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
 		dqm->ops.initialize = initialize_cpsch;
 		dqm->ops.start = start_cpsch;
 		dqm->ops.stop = stop_cpsch;
+		dqm->ops.pre_reset = pre_reset;
 		dqm->ops.destroy_queue = destroy_queue_cpsch;
 		dqm->ops.update_queue = update_queue;
 		dqm->ops.register_process = register_process; @@ -1787,6 +1801,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
 		/* initialize dqm for no cp scheduling */
 		dqm->ops.start = start_nocpsch;
 		dqm->ops.stop = stop_nocpsch;
+		dqm->ops.pre_reset = pre_reset;
 		dqm->ops.create_queue = create_queue_nocpsch;
 		dqm->ops.destroy_queue = destroy_queue_nocpsch;
 		dqm->ops.update_queue = update_queue; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 8991120c4fa2..871d3b628d2d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -104,6 +104,7 @@ struct device_queue_manager_ops {
 	int	(*initialize)(struct device_queue_manager *dqm);
 	int	(*start)(struct device_queue_manager *dqm);
 	int	(*stop)(struct device_queue_manager *dqm);
+	void	(*pre_reset)(struct device_queue_manager *dqm);
 	void	(*uninitialize)(struct device_queue_manager *dqm);
 	int	(*create_kernel_queue)(struct device_queue_manager *dqm,
 					struct kernel_queue *kq,
@@ -198,6 +199,7 @@ struct device_queue_manager {
 
 	/* hw exception  */
 	bool			is_hws_hang;
+	bool			is_resetting;
 	struct work_struct	hw_exception_work;
 	struct kfd_mem_obj	hiq_sdma_mqd;
 	bool			sched_running;
--
2.24.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7Coak.zeng%40amd.com%7Ca59ace5396cb46ed384708d78526df99%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637124274434007022&amp;sdata=g%2B57MBWpTbFzbchp6%2Bi2dfmUzYXlsf77InUj3R1XaaY%3D&amp;reserved=0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* RE: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch
  2019-12-20  8:30 ` [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch Felix Kuehling
  2019-12-20 10:03   ` Deng, Emily
  2019-12-20 15:56   ` shaoyunl
@ 2019-12-20 17:22   ` Zeng, Oak
  2019-12-20 17:27     ` Felix Kuehling
  2 siblings, 1 reply; 17+ messages in thread
From: Zeng, Oak @ 2019-12-20 17:22 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx

[AMD Official Use Only - Internal Distribution Only]



Regards,
Oak

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Felix Kuehling
Sent: Friday, December 20, 2019 3:30 AM
To: amd-gfx@lists.freedesktop.org
Subject: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch

Don't use the HWS if it's known to be hanging. In a reset also don't try to destroy the HIQ because that may hang on SRIOV if the KIQ is unresponsive.

Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c    | 12 ++++++++----
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c        |  8 ++++----
 drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c      |  4 ++--
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h                |  4 ++--
 .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c   |  2 +-
 5 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index a7e9ec1b3ce3..d7eb6ac37f62 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -946,7 +946,7 @@ static int start_nocpsch(struct device_queue_manager *dqm)  static int stop_nocpsch(struct device_queue_manager *dqm)  {
 	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
-		pm_uninit(&dqm->packets);
+		pm_uninit(&dqm->packets, false);
 	dqm->sched_running = false;
 
 	return 0;
@@ -1114,20 +1114,24 @@ static int start_cpsch(struct device_queue_manager *dqm)
 	return 0;
 fail_allocate_vidmem:
 fail_set_sched_resources:
-	pm_uninit(&dqm->packets);
+	pm_uninit(&dqm->packets, false);
 fail_packet_manager_init:
 	return retval;
 }
 
 static int stop_cpsch(struct device_queue_manager *dqm)  {
+	bool hanging;
+
 	dqm_lock(dqm);
-	unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
+	if (!dqm->is_hws_hang)
+		unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
+	hanging = dqm->is_hws_hang || dqm->is_resetting;
[Oak] I don't think dqm->is_resetting is necessary. If is_resetting is true, is_hws_hang is always true. Those two flags are always the same except a period during which hws hang is detected but kfd_pre_reset is not called. In this period, hang is true but resetting is false, so "||resetting" doesn't help. 

Also see my comment on the 3rd patch.

	dqm->sched_running = false;
 	dqm_unlock(dqm);
 
 	kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
-	pm_uninit(&dqm->packets);
+	pm_uninit(&dqm->packets, hanging);
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
index 2d56dc534459..bae706462f96 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
@@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue *kq, struct kfd_dev *dev,  }
 
 /* Uninitialize a kernel queue and free all its memory usages. */ -static void kq_uninitialize(struct kernel_queue *kq)
+static void kq_uninitialize(struct kernel_queue *kq, bool hanging)
 {
-	if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
+	if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !hanging)
 		kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
 					kq->queue->mqd,
 					KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
@@ -337,9 +337,9 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
 	return NULL;
 }
 
-void kernel_queue_uninit(struct kernel_queue *kq)
+void kernel_queue_uninit(struct kernel_queue *kq, bool hanging)
 {
-	kq_uninitialize(kq);
+	kq_uninitialize(kq, hanging);
 	kfree(kq);
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
index 6cabed06ef5d..dc406e6dee23 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
@@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
 	return 0;
 }
 
-void pm_uninit(struct packet_manager *pm)
+void pm_uninit(struct packet_manager *pm, bool hanging)
 {
 	mutex_destroy(&pm->lock);
-	kernel_queue_uninit(pm->priv_queue);
+	kernel_queue_uninit(pm->priv_queue, hanging);
 }
 
 int pm_send_set_resources(struct packet_manager *pm, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 087e96838997..8ac680dc90f1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -883,7 +883,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev);  void device_queue_manager_uninit(struct device_queue_manager *dqm);  struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
 					enum kfd_queue_type type);
-void kernel_queue_uninit(struct kernel_queue *kq);
+void kernel_queue_uninit(struct kernel_queue *kq, bool hanging);
 int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid);
 
 /* Process Queue Manager */
@@ -974,7 +974,7 @@ extern const struct packet_manager_funcs kfd_vi_pm_funcs;  extern const struct packet_manager_funcs kfd_v9_pm_funcs;
 
 int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm); -void pm_uninit(struct packet_manager *pm);
+void pm_uninit(struct packet_manager *pm, bool hanging);
 int pm_send_set_resources(struct packet_manager *pm,
 				struct scheduling_resources *res);
 int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index d3eacf72e8db..8fa856e6a03f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -374,7 +374,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
 		/* destroy kernel queue (DIQ) */
 		dqm = pqn->kq->dev->dqm;
 		dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
-		kernel_queue_uninit(pqn->kq);
+		kernel_queue_uninit(pqn->kq, false);
 	}
 
 	if (pqn->q) {
--
2.24.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7Coak.zeng%40amd.com%7C7602eef96b0545baac8608d78526dde0%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637124274407587881&amp;sdata=BlPgWgLi%2FrtkcPQLu%2FbK0dOrvg6qm4IsGVfuoUo%2B%2B1g%3D&amp;reserved=0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch
  2019-12-20 17:22   ` Zeng, Oak
@ 2019-12-20 17:27     ` Felix Kuehling
  2019-12-20 17:29       ` Zeng, Oak
  0 siblings, 1 reply; 17+ messages in thread
From: Felix Kuehling @ 2019-12-20 17:27 UTC (permalink / raw)
  To: Zeng, Oak, amd-gfx

On 2019-12-20 12:22, Zeng, Oak wrote:
> [AMD Official Use Only - Internal Distribution Only]
>
>
>
> Regards,
> Oak
>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Felix Kuehling
> Sent: Friday, December 20, 2019 3:30 AM
> To: amd-gfx@lists.freedesktop.org
> Subject: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch
>
> Don't use the HWS if it's known to be hanging. In a reset also don't try to destroy the HIQ because that may hang on SRIOV if the KIQ is unresponsive.
>
> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
> ---
>   .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c    | 12 ++++++++----
>   drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c        |  8 ++++----
>   drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c      |  4 ++--
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h                |  4 ++--
>   .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c   |  2 +-
>   5 files changed, 17 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index a7e9ec1b3ce3..d7eb6ac37f62 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -946,7 +946,7 @@ static int start_nocpsch(struct device_queue_manager *dqm)  static int stop_nocpsch(struct device_queue_manager *dqm)  {
>   	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
> -		pm_uninit(&dqm->packets);
> +		pm_uninit(&dqm->packets, false);
>   	dqm->sched_running = false;
>   
>   	return 0;
> @@ -1114,20 +1114,24 @@ static int start_cpsch(struct device_queue_manager *dqm)
>   	return 0;
>   fail_allocate_vidmem:
>   fail_set_sched_resources:
> -	pm_uninit(&dqm->packets);
> +	pm_uninit(&dqm->packets, false);
>   fail_packet_manager_init:
>   	return retval;
>   }
>   
>   static int stop_cpsch(struct device_queue_manager *dqm)  {
> +	bool hanging;
> +
>   	dqm_lock(dqm);
> -	unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
> +	if (!dqm->is_hws_hang)
> +		unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
> +	hanging = dqm->is_hws_hang || dqm->is_resetting;
> [Oak] I don't think dqm->is_resetting is necessary. If is_resetting is true, is_hws_hang is always true. Those two flags are always the same except a period during which hws hang is detected but kfd_pre_reset is not called. In this period, hang is true but resetting is false, so "||resetting" doesn't help.

This is not necessarily true. A GPU reset can be caused by amdgpu for 
example when the graphics engine is hanging. In that case HWS isn't 
necessarily hanging. I added "|| resetting" here to avoid touching 
hardware in an unknown state in pm_uninit=>kq_uninitialize in this case.

Regards,
   Felix

>
> Also see my comment on the 3rd patch.
>
> 	dqm->sched_running = false;
>   	dqm_unlock(dqm);
>   
>   	kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
> -	pm_uninit(&dqm->packets);
> +	pm_uninit(&dqm->packets, hanging);
>   
>   	return 0;
>   }
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> index 2d56dc534459..bae706462f96 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> @@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue *kq, struct kfd_dev *dev,  }
>   
>   /* Uninitialize a kernel queue and free all its memory usages. */ -static void kq_uninitialize(struct kernel_queue *kq)
> +static void kq_uninitialize(struct kernel_queue *kq, bool hanging)
>   {
> -	if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
> +	if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !hanging)
>   		kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
>   					kq->queue->mqd,
>   					KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
> @@ -337,9 +337,9 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
>   	return NULL;
>   }
>   
> -void kernel_queue_uninit(struct kernel_queue *kq)
> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging)
>   {
> -	kq_uninitialize(kq);
> +	kq_uninitialize(kq, hanging);
>   	kfree(kq);
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> index 6cabed06ef5d..dc406e6dee23 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> @@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
>   	return 0;
>   }
>   
> -void pm_uninit(struct packet_manager *pm)
> +void pm_uninit(struct packet_manager *pm, bool hanging)
>   {
>   	mutex_destroy(&pm->lock);
> -	kernel_queue_uninit(pm->priv_queue);
> +	kernel_queue_uninit(pm->priv_queue, hanging);
>   }
>   
>   int pm_send_set_resources(struct packet_manager *pm, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 087e96838997..8ac680dc90f1 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -883,7 +883,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev);  void device_queue_manager_uninit(struct device_queue_manager *dqm);  struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
>   					enum kfd_queue_type type);
> -void kernel_queue_uninit(struct kernel_queue *kq);
> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging);
>   int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid);
>   
>   /* Process Queue Manager */
> @@ -974,7 +974,7 @@ extern const struct packet_manager_funcs kfd_vi_pm_funcs;  extern const struct packet_manager_funcs kfd_v9_pm_funcs;
>   
>   int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm); -void pm_uninit(struct packet_manager *pm);
> +void pm_uninit(struct packet_manager *pm, bool hanging);
>   int pm_send_set_resources(struct packet_manager *pm,
>   				struct scheduling_resources *res);
>   int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index d3eacf72e8db..8fa856e6a03f 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -374,7 +374,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
>   		/* destroy kernel queue (DIQ) */
>   		dqm = pqn->kq->dev->dqm;
>   		dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
> -		kernel_queue_uninit(pqn->kq);
> +		kernel_queue_uninit(pqn->kq, false);
>   	}
>   
>   	if (pqn->q) {
> --
> 2.24.1
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7Coak.zeng%40amd.com%7C7602eef96b0545baac8608d78526dde0%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637124274407587881&amp;sdata=BlPgWgLi%2FrtkcPQLu%2FbK0dOrvg6qm4IsGVfuoUo%2B%2B1g%3D&amp;reserved=0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* RE: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch
  2019-12-20 17:27     ` Felix Kuehling
@ 2019-12-20 17:29       ` Zeng, Oak
  0 siblings, 0 replies; 17+ messages in thread
From: Zeng, Oak @ 2019-12-20 17:29 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx

[AMD Official Use Only - Internal Distribution Only]

I see. Thank you Felix for the explanation.

Regards,
Oak

-----Original Message-----
From: Kuehling, Felix <Felix.Kuehling@amd.com> 
Sent: Friday, December 20, 2019 12:28 PM
To: Zeng, Oak <Oak.Zeng@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch

On 2019-12-20 12:22, Zeng, Oak wrote:
> [AMD Official Use Only - Internal Distribution Only]
>
>
>
> Regards,
> Oak
>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of 
> Felix Kuehling
> Sent: Friday, December 20, 2019 3:30 AM
> To: amd-gfx@lists.freedesktop.org
> Subject: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch
>
> Don't use the HWS if it's known to be hanging. In a reset also don't try to destroy the HIQ because that may hang on SRIOV if the KIQ is unresponsive.
>
> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
> ---
>   .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c    | 12 ++++++++----
>   drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c        |  8 ++++----
>   drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c      |  4 ++--
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h                |  4 ++--
>   .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c   |  2 +-
>   5 files changed, 17 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index a7e9ec1b3ce3..d7eb6ac37f62 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -946,7 +946,7 @@ static int start_nocpsch(struct device_queue_manager *dqm)  static int stop_nocpsch(struct device_queue_manager *dqm)  {
>   	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
> -		pm_uninit(&dqm->packets);
> +		pm_uninit(&dqm->packets, false);
>   	dqm->sched_running = false;
>   
>   	return 0;
> @@ -1114,20 +1114,24 @@ static int start_cpsch(struct device_queue_manager *dqm)
>   	return 0;
>   fail_allocate_vidmem:
>   fail_set_sched_resources:
> -	pm_uninit(&dqm->packets);
> +	pm_uninit(&dqm->packets, false);
>   fail_packet_manager_init:
>   	return retval;
>   }
>   
>   static int stop_cpsch(struct device_queue_manager *dqm)  {
> +	bool hanging;
> +
>   	dqm_lock(dqm);
> -	unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
> +	if (!dqm->is_hws_hang)
> +		unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
> +	hanging = dqm->is_hws_hang || dqm->is_resetting;
> [Oak] I don't think dqm->is_resetting is necessary. If is_resetting is true, is_hws_hang is always true. Those two flags are always the same except a period during which hws hang is detected but kfd_pre_reset is not called. In this period, hang is true but resetting is false, so "||resetting" doesn't help.

This is not necessarily true. A GPU reset can be caused by amdgpu for example when the graphics engine is hanging. In that case HWS isn't necessarily hanging. I added "|| resetting" here to avoid touching hardware in an unknown state in pm_uninit=>kq_uninitialize in this case.

Regards,
   Felix

>
> Also see my comment on the 3rd patch.
>
> 	dqm->sched_running = false;
>   	dqm_unlock(dqm);
>   
>   	kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
> -	pm_uninit(&dqm->packets);
> +	pm_uninit(&dqm->packets, hanging);
>   
>   	return 0;
>   }
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> index 2d56dc534459..bae706462f96 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> @@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue *kq, 
> struct kfd_dev *dev,  }
>   
>   /* Uninitialize a kernel queue and free all its memory usages. */ 
> -static void kq_uninitialize(struct kernel_queue *kq)
> +static void kq_uninitialize(struct kernel_queue *kq, bool hanging)
>   {
> -	if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
> +	if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !hanging)
>   		kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
>   					kq->queue->mqd,
>   					KFD_PREEMPT_TYPE_WAVEFRONT_RESET, @@ -337,9 +337,9 @@ struct 
> kernel_queue *kernel_queue_init(struct kfd_dev *dev,
>   	return NULL;
>   }
>   
> -void kernel_queue_uninit(struct kernel_queue *kq)
> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging)
>   {
> -	kq_uninitialize(kq);
> +	kq_uninitialize(kq, hanging);
>   	kfree(kq);
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> index 6cabed06ef5d..dc406e6dee23 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> @@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
>   	return 0;
>   }
>   
> -void pm_uninit(struct packet_manager *pm)
> +void pm_uninit(struct packet_manager *pm, bool hanging)
>   {
>   	mutex_destroy(&pm->lock);
> -	kernel_queue_uninit(pm->priv_queue);
> +	kernel_queue_uninit(pm->priv_queue, hanging);
>   }
>   
>   int pm_send_set_resources(struct packet_manager *pm, diff --git 
> a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 087e96838997..8ac680dc90f1 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -883,7 +883,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev);  void device_queue_manager_uninit(struct device_queue_manager *dqm);  struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
>   					enum kfd_queue_type type);
> -void kernel_queue_uninit(struct kernel_queue *kq);
> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging);
>   int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned 
> int pasid);
>   
>   /* Process Queue Manager */
> @@ -974,7 +974,7 @@ extern const struct packet_manager_funcs 
> kfd_vi_pm_funcs;  extern const struct packet_manager_funcs 
> kfd_v9_pm_funcs;
>   
>   int pm_init(struct packet_manager *pm, struct device_queue_manager 
> *dqm); -void pm_uninit(struct packet_manager *pm);
> +void pm_uninit(struct packet_manager *pm, bool hanging);
>   int pm_send_set_resources(struct packet_manager *pm,
>   				struct scheduling_resources *res);
>   int pm_send_runlist(struct packet_manager *pm, struct list_head 
> *dqm_queues); diff --git 
> a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index d3eacf72e8db..8fa856e6a03f 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -374,7 +374,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
>   		/* destroy kernel queue (DIQ) */
>   		dqm = pqn->kq->dev->dqm;
>   		dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
> -		kernel_queue_uninit(pqn->kq);
> +		kernel_queue_uninit(pqn->kq, false);
>   	}
>   
>   	if (pqn->q) {
> --
> 2.24.1
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flist
> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7Coa
> k.zeng%40amd.com%7C7602eef96b0545baac8608d78526dde0%7C3dd8961fe4884e60
> 8e11a82d994e183d%7C0%7C0%7C637124274407587881&amp;sdata=BlPgWgLi%2Frtk
> cPQLu%2FbK0dOrvg6qm4IsGVfuoUo%2B%2B1g%3D&amp;reserved=0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 3/4] drm/amdkfd: Improve HWS hang detection and handling
  2019-12-20 17:18   ` Zeng, Oak
@ 2019-12-20 17:49     ` Felix Kuehling
  0 siblings, 0 replies; 17+ messages in thread
From: Felix Kuehling @ 2019-12-20 17:49 UTC (permalink / raw)
  To: Zeng, Oak, amd-gfx

On 2019-12-20 12:18, Zeng, Oak wrote:
> [AMD Official Use Only - Internal Distribution Only]
>
> With this improvement, it is still possible that two reset be scheduled. There is a period of time after HWS hang and before kfd pre-reset is called, during which, if a thread already passed the is_hws_hang check but was scheduled out, then it will also schedule another reset. The whole sequence is:
>
> Thread 1: call unmap_queues_cpsch, pass the is_hws_hang, scheduled out before sending unmap command to HWS
> Thread 2: send unmap to HWS ->hang, schedule a reset
> Thread1: before the reset worker thread is run (resetting is still false), thread1 continues, another reset is scheduled.

Rescheduling the reset worker "before the reset worker thread is run" 
results in the reset worker only running once. The work item can't be on 
the queue twice at the same time. The more interesting case is if the 
reset worker is already running but hasn't called 
amdgpu_amdkfd_pre_reset yet. In that case we may end up scheduling a 
second reset. I can't think of a good way to prevent this race.

It gets more confusing when you consider that GPU resets can be 
triggered outside of KFD. So a reset can start outside a KFD reset 
worker thread and KFD can schedule another reset. I think the only place 
to really prevent this type of race would be in 
amdgpu_device_should_recover_gpu with some kind of reset decision flag 
protected by a lock.

I could also try to get rid of the worker thread for GPU resets in KFD. 
I think we created the worker to avoid locking issues, but there may be 
better ways to do this.

Regards,
   Felix

>
>
> Regards,
> Oak
>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Felix Kuehling
> Sent: Friday, December 20, 2019 3:30 AM
> To: amd-gfx@lists.freedesktop.org
> Subject: [PATCH 3/4] drm/amdkfd: Improve HWS hang detection and handling
>
> Move HWS hang detection into unmap_queues_cpsch to catch hangs in all cases. If this happens during a reset, don't schedule another reset because the reset already in progress is expected to take care of it.
>
> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_device.c       |  3 +++
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 27 ++++++++++++++-----  .../drm/amd/amdkfd/kfd_device_queue_manager.h |  2 ++
>   3 files changed, 26 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index c6b6901bbda3..2a9e40131735 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -728,6 +728,9 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)  {
>   	if (!kfd->init_complete)
>   		return 0;
> +
> +	kfd->dqm->ops.pre_reset(kfd->dqm);
> +
>   	kgd2kfd_suspend(kfd);
>   
>   	kfd_signal_reset_event(kfd);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 558c0ad81848..a7e9ec1b3ce3 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -952,6 +952,13 @@ static int stop_nocpsch(struct device_queue_manager *dqm)
>   	return 0;
>   }
>   
> +static void pre_reset(struct device_queue_manager *dqm) {
> +	dqm_lock(dqm);
> +	dqm->is_resetting = true;
> +	dqm_unlock(dqm);
> +}
> +
>   static int allocate_sdma_queue(struct device_queue_manager *dqm,
>   				struct queue *q)
>   {
> @@ -1099,6 +1106,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
>   	dqm_lock(dqm);
>   	/* clear hang status when driver try to start the hw scheduler */
>   	dqm->is_hws_hang = false;
> +	dqm->is_resetting = false;
>   	dqm->sched_running = true;
>   	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
>   	dqm_unlock(dqm);
> @@ -1351,8 +1359,17 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
>   	/* should be timed out */
>   	retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED,
>   				queue_preemption_timeout_ms);
> -	if (retval)
> +	if (retval) {
> +		pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
> +		dqm->is_hws_hang = true;
> +		/* It's possible we're detecting a HWS hang in the
> +		 * middle of a GPU reset. No need to schedule another
> +		 * reset in this case.
> +		 */
> +		if (!dqm->is_resetting)
> +			schedule_work(&dqm->hw_exception_work);
>   		return retval;
> +	}
>   
>   	pm_release_ib(&dqm->packets);
>   	dqm->active_runlist = false;
> @@ -1370,12 +1387,8 @@ static int execute_queues_cpsch(struct device_queue_manager *dqm,
>   	if (dqm->is_hws_hang)
>   		return -EIO;
>   	retval = unmap_queues_cpsch(dqm, filter, filter_param);
> -	if (retval) {
> -		pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
> -		dqm->is_hws_hang = true;
> -		schedule_work(&dqm->hw_exception_work);
> +	if (retval)
>   		return retval;
> -	}
>   
>   	return map_queues_cpsch(dqm);
>   }
> @@ -1769,6 +1782,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
>   		dqm->ops.initialize = initialize_cpsch;
>   		dqm->ops.start = start_cpsch;
>   		dqm->ops.stop = stop_cpsch;
> +		dqm->ops.pre_reset = pre_reset;
>   		dqm->ops.destroy_queue = destroy_queue_cpsch;
>   		dqm->ops.update_queue = update_queue;
>   		dqm->ops.register_process = register_process; @@ -1787,6 +1801,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
>   		/* initialize dqm for no cp scheduling */
>   		dqm->ops.start = start_nocpsch;
>   		dqm->ops.stop = stop_nocpsch;
> +		dqm->ops.pre_reset = pre_reset;
>   		dqm->ops.create_queue = create_queue_nocpsch;
>   		dqm->ops.destroy_queue = destroy_queue_nocpsch;
>   		dqm->ops.update_queue = update_queue; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> index 8991120c4fa2..871d3b628d2d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> @@ -104,6 +104,7 @@ struct device_queue_manager_ops {
>   	int	(*initialize)(struct device_queue_manager *dqm);
>   	int	(*start)(struct device_queue_manager *dqm);
>   	int	(*stop)(struct device_queue_manager *dqm);
> +	void	(*pre_reset)(struct device_queue_manager *dqm);
>   	void	(*uninitialize)(struct device_queue_manager *dqm);
>   	int	(*create_kernel_queue)(struct device_queue_manager *dqm,
>   					struct kernel_queue *kq,
> @@ -198,6 +199,7 @@ struct device_queue_manager {
>   
>   	/* hw exception  */
>   	bool			is_hws_hang;
> +	bool			is_resetting;
>   	struct work_struct	hw_exception_work;
>   	struct kfd_mem_obj	hiq_sdma_mqd;
>   	bool			sched_running;
> --
> 2.24.1
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7Coak.zeng%40amd.com%7Ca59ace5396cb46ed384708d78526df99%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637124274434007022&amp;sdata=g%2B57MBWpTbFzbchp6%2Bi2dfmUzYXlsf77InUj3R1XaaY%3D&amp;reserved=0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch
  2019-12-20 16:33     ` Felix Kuehling
@ 2019-12-20 19:31       ` shaoyunl
  2019-12-20 19:46         ` Felix Kuehling
  0 siblings, 1 reply; 17+ messages in thread
From: shaoyunl @ 2019-12-20 19:31 UTC (permalink / raw)
  To: Felix Kuehling, amd-gfx

Can we use the dqm_lock when we try to get the dqm->is_hws_hang and  
dqm->is_resetting inside function kq_uninitialize ?

I think more closer we check the status  to hqd_destroy it will be  more 
accurate . It does look better with this logic if the status are changed 
after dqm unmap_queue call and  before we call hqd_destroy .

Another comment in line.

Regards

shaoyun.liu




On 2019-12-20 11:33 a.m., Felix Kuehling wrote:
> dqm->is_hws_hang is protected by the DQM lock. kq_uninitialize runs 
> outside that lock protection. Therefore I opted to pass in the hanging 
> flag as a parameter. It also keeps the logic that decides all of that 
> inside the device queue manager, which I think is cleaner.
>
> I was trying to clean this up further by moving the pm_init/pm_uninit 
> out of the start_cpsch/stop_cpsch sequence, but gave up on that idea 
> when I found out that I can't create the kernel queue in the DQM 
> initialize function because dev->dqm isn't initialized at that time yet.
>
> Regards,
>   Felix
>
> On 2019-12-20 10:56, shaoyunl wrote:
>> Looks like patch 2 is not related to this serial , but anyway .
>>
>> Patch 1,2,3 are reviewed by shaoyunl <shaoyun.liu@amd.com>
>>
>> For patch 4 ,  is it possible we directly check dqm->is_hws_hang || 
>> dqm->is_resetting  inside function kq_uninitialize.  so we don't need 
>> other interface change .
>>
>> I think even Inside that kq_uninitialize function , we still can get 
>> dqm as  kq->dev->dqm .
>>
>>
>> shaoyun.liu
>>
>>
>> On 2019-12-20 3:30 a.m., Felix Kuehling wrote:
>>> Don't use the HWS if it's known to be hanging. In a reset also
>>> don't try to destroy the HIQ because that may hang on SRIOV if the
>>> KIQ is unresponsive.
>>>
>>> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>> ---
>>>   .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c    | 12 
>>> ++++++++----
>>>   drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c        |  8 ++++----
>>>   drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c      |  4 ++--
>>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h                |  4 ++--
>>>   .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c   |  2 +-
>>>   5 files changed, 17 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> index a7e9ec1b3ce3..d7eb6ac37f62 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> @@ -946,7 +946,7 @@ static int start_nocpsch(struct 
>>> device_queue_manager *dqm)
>>>   static int stop_nocpsch(struct device_queue_manager *dqm)
>>>   {
>>>       if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>>> -        pm_uninit(&dqm->packets);
>>> +        pm_uninit(&dqm->packets, false);
>>>       dqm->sched_running = false;
>>>         return 0;
>>> @@ -1114,20 +1114,24 @@ static int start_cpsch(struct 
>>> device_queue_manager *dqm)
>>>       return 0;
>>>   fail_allocate_vidmem:
>>>   fail_set_sched_resources:
>>> -    pm_uninit(&dqm->packets);
>>> +    pm_uninit(&dqm->packets, false);
>>>   fail_packet_manager_init:
>>>       return retval;
>>>   }
>>>     static int stop_cpsch(struct device_queue_manager *dqm)
>>>   {
>>> +    bool hanging;
>>> +kq_uninitialize(
>>>
>>>       dqm_lock(dqm);
>>> -    unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>>> +    if (!dqm->is_hws_hang)
[shaoyunl]  should we check is_resetting here as well? So we ignore 
the unmap call even if HWS has still not detected the hang but we know we 
are currently in the resetting procedure
>>> +        unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 
>>> 0);
>>> +    hanging = dqm->is_hws_hang || dqm->is_resetting;
>>>       dqm->sched_running = false;
>>>       dqm_unlock(dqm);
>>>         kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
>>> -    pm_uninit(&dqm->packets);
>>> +    pm_uninit(&dqm->packets, hanging);
>>>         return 0;
>>>   }
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>>> index 2d56dc534459..bae706462f96 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>>> @@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue 
>>> *kq, struct kfd_dev *dev,
>>>   }
>>>     /* Uninitialize a kernel queue and free all its memory usages. */
>>> -static void kq_uninitialize(struct kernel_queue *kq)
>>> +static void kq_uninitialize(struct kernel_queue *kq, bool hanging)
>>>   {
>>> -    if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
>>> +    if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !hanging)
>>>           kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
>>>                       kq->queue->mqd,
>>>                       KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
>>> @@ -337,9 +337,9 @@ struct kernel_queue *kernel_queue_init(struct 
>>> kfd_dev *dev,
>>>       return NULL;
>>>   }
>>>   -void kernel_queue_uninit(struct kernel_queue *kq)
>>> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging)
>>>   {
>>> -    kq_uninitialize(kq);
>>> +    kq_uninitialize(kq, hanging);
>>>       kfree(kq);
>>>   }
>>>   diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>>> index 6cabed06ef5d..dc406e6dee23 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>>> @@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm, struct 
>>> device_queue_manager *dqm)
>>>       return 0;
>>>   }
>>>   -void pm_uninit(struct packet_manager *pm)
>>> +void pm_uninit(struct packet_manager *pm, bool hanging)
>>>   {
>>>       mutex_destroy(&pm->lock);
>>> -    kernel_queue_uninit(pm->priv_queue);
>>> +    kernel_queue_uninit(pm->priv_queue, hanging);
>>>   }
>>>     int pm_send_set_resources(struct packet_manager *pm,
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h I
>>> index 087e96838997..8ac680dc90f1 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> @@ -883,7 +883,7 @@ struct device_queue_manager 
>>> *device_queue_manager_init(struct kfd_dev *dev);
>>>   void device_queue_manager_uninit(struct device_queue_manager *dqm);
>>>   struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
>>>                       enum kfd_queue_type type);
>>> -void kernel_queue_uninit(struct kernel_queue *kq);
>>> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging);
>>>   int kfd_process_vm_fault(struct device_queue_manager *dqm, 
>>> unsigned int pasid);
>>>     /* Process Queue Manager */
>>> @@ -974,7 +974,7 @@ extern const struct packet_manager_funcs 
>>> kfd_vi_pm_funcs;
>>>   extern const struct packet_manager_funcs kfd_v9_pm_funcs;
>>>     int pm_init(struct packet_manager *pm, struct 
>>> device_queue_manager *dqm);
>>> -void pm_uninit(struct packet_manager *pm);
>>> +void pm_uninit(struct packet_manager *pm, bool hanging);
>>>   int pm_send_set_resources(struct packet_manager *pm,
>>>                   struct scheduling_resources *res);
>>>   int pm_send_runlist(struct packet_manager *pm, struct list_head 
>>> *dqm_queues);
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> index d3eacf72e8db..8fa856e6a03f 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> @@ -374,7 +374,7 @@ int pqm_destroy_queue(struct 
>>> process_queue_manager *pqm, unsigned int qid)
>>>           /* destroy kernel queue (DIQ) */
>>>           dqm = pqn->kq->dev->dqm;
>>>           dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
>>> -        kernel_queue_uninit(pqn->kq);
>>> +        kernel_queue_uninit(pqn->kq, false);
>>>       }
>>>         if (pqn->q) {
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7Cfelix.kuehling%40amd.com%7C6df6fef2bf6a4208704e08d785652f27%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637124542109969638&amp;sdata=V0mHmgjUSP%2BauYL3r6PGU7aqFTQz8NkKMcuA5vXSkUQ%3D&amp;reserved=0 
>>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch
  2019-12-20 19:31       ` shaoyunl
@ 2019-12-20 19:46         ` Felix Kuehling
  2019-12-20 22:09           ` shaoyunl
  0 siblings, 1 reply; 17+ messages in thread
From: Felix Kuehling @ 2019-12-20 19:46 UTC (permalink / raw)
  To: shaoyunl, amd-gfx

On 2019-12-20 14:31, shaoyunl wrote:
> Can we use the  dqm_lock when we try to get the dqm->is_hw_hang and 
> dqm->is_resetting inside function kq_uninitialize ?

Spreading the DQM lock around is probably not a good idea. Then I'd 
rather do more refactoring to move hqd_load and hqd_destroy out of the 
kq_init/kq_uninit functions.


>
> I think more closer we check the status  to hqd_destroy it will be  
> more accurate . It does look better with this logic if the status are 
> changed after dqm unmap_queue call and  before we call hqd_destroy .
>
> Another comment in line.
>
> Regards
>
> shaoyun.liu
>
>
>
>
> On 2019-12-20 11:33 a.m., Felix Kuehling wrote:
>> dqm->is_hws_hang is protected by the DQM lock. kq_uninitialize runs 
>> outside that lock protection. Therefore I opted to pass in the 
>> hanging flag as a parameter. It also keeps the logic that decides all 
>> of that inside the device queue manager, which I think is cleaner.
>>
>> I was trying to clean this up further by moving the pm_init/pm_uninit 
>> out of the start_cpsch/stop_cpsch sequence, but gave up on that idea 
>> when I found out that I can't create the kernel queue in the DQM 
>> initialize function because dev->dqm isn't initialized at that time yet.
>>
>> Regards,
>>   Felix
>>
>> On 2019-12-20 10:56, shaoyunl wrote:
>>> Looks like patch 2 is not related to this serial , but anyway .
>>>
>>> Patch 1,2,3 are reviewed by shaoyunl <shaoyun.liu@amd.com>
>>>
>>> For patch 4 ,  is it possible we directly check dqm->is_hws_hang || 
>>> dqm->is_resetting  inside function kq_uninitialize.  so we don't 
>>> need other interface change .
>>>
>>> I think even Inside that kq_uninitialize function , we still can get 
>>> dqm as  kq->dev->dqm .
>>>
>>>
>>> shaoyun.liu
>>>
>>>
>>> On 2019-12-20 3:30 a.m., Felix Kuehling wrote:
>>>> Don't use the HWS if it's known to be hanging. In a reset also
>>>> don't try to destroy the HIQ because that may hang on SRIOV if the
>>>> KIQ is unresponsive.
>>>>
>>>> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>>> ---
>>>>   .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c    | 12 
>>>> ++++++++----
>>>>   drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c        |  8 ++++----
>>>>   drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c      |  4 ++--
>>>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h                |  4 ++--
>>>>   .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c   |  2 +-
>>>>   5 files changed, 17 insertions(+), 13 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>> index a7e9ec1b3ce3..d7eb6ac37f62 100644
>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>> @@ -946,7 +946,7 @@ static int start_nocpsch(struct 
>>>> device_queue_manager *dqm)
>>>>   static int stop_nocpsch(struct device_queue_manager *dqm)
>>>>   {
>>>>       if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>>>> -        pm_uninit(&dqm->packets);
>>>> +        pm_uninit(&dqm->packets, false);
>>>>       dqm->sched_running = false;
>>>>         return 0;
>>>> @@ -1114,20 +1114,24 @@ static int start_cpsch(struct 
>>>> device_queue_manager *dqm)
>>>>       return 0;
>>>>   fail_allocate_vidmem:
>>>>   fail_set_sched_resources:
>>>> -    pm_uninit(&dqm->packets);
>>>> +    pm_uninit(&dqm->packets, false);
>>>>   fail_packet_manager_init:
>>>>       return retval;
>>>>   }
>>>>     static int stop_cpsch(struct device_queue_manager *dqm)
>>>>   {
>>>> +    bool hanging;
>>>> +kq_uninitialize(
>>>>
>>>>       dqm_lock(dqm);
>>>> -    unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>>>> +    if (!dqm->is_hws_hang)
> [shaoyunl]  should we check is_resetting here as well? So we ignore 
> the unmap call even if HWS has still not detected the hang but we know we 
> are currently in the resetting procedure

GPU reset can be done when the HWS is not hanging. In that case 
unmapping queues is perfectly safe. In the worst case it'll time out and 
dqm->is_hws_hang will be set as a result. I'm planning to add more 
checks later so that we can optionally wait in unmap_queues until a 
reset is done. We'll need that to do preemptions reliably while a GPU 
reset is in progress. So I need to either unmap the queues or be sure 
that HWS is hanging.

With yours and Oak's comments I realize, this is far from complete and 
more work is needed. But I still think this is an improvement.

Regards,
   Felix


>>>> +        unmap_queues_cpsch(dqm, 
>>>> KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>>>> +    hanging = dqm->is_hws_hang || dqm->is_resetting;
>>>>       dqm->sched_running = false;
>>>>       dqm_unlock(dqm);
>>>>         kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
>>>> -    pm_uninit(&dqm->packets);
>>>> +    pm_uninit(&dqm->packets, hanging);
>>>>         return 0;
>>>>   }
>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c 
>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>>>> index 2d56dc534459..bae706462f96 100644
>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>>>> @@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue 
>>>> *kq, struct kfd_dev *dev,
>>>>   }
>>>>     /* Uninitialize a kernel queue and free all its memory usages. */
>>>> -static void kq_uninitialize(struct kernel_queue *kq)
>>>> +static void kq_uninitialize(struct kernel_queue *kq, bool hanging)
>>>>   {
>>>> -    if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
>>>> +    if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !hanging)
>>>>           kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
>>>>                       kq->queue->mqd,
>>>>                       KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
>>>> @@ -337,9 +337,9 @@ struct kernel_queue *kernel_queue_init(struct 
>>>> kfd_dev *dev,
>>>>       return NULL;
>>>>   }
>>>>   -void kernel_queue_uninit(struct kernel_queue *kq)
>>>> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging)
>>>>   {
>>>> -    kq_uninitialize(kq);
>>>> +    kq_uninitialize(kq, hanging);
>>>>       kfree(kq);
>>>>   }
>>>>   diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c 
>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>>>> index 6cabed06ef5d..dc406e6dee23 100644
>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>>>> @@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm, struct 
>>>> device_queue_manager *dqm)
>>>>       return 0;
>>>>   }
>>>>   -void pm_uninit(struct packet_manager *pm)
>>>> +void pm_uninit(struct packet_manager *pm, bool hanging)
>>>>   {
>>>>       mutex_destroy(&pm->lock);
>>>> -    kernel_queue_uninit(pm->priv_queue);
>>>> +    kernel_queue_uninit(pm->priv_queue, hanging);
>>>>   }
>>>>     int pm_send_set_resources(struct packet_manager *pm,
>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h I
>>>> index 087e96838997..8ac680dc90f1 100644
>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>> @@ -883,7 +883,7 @@ struct device_queue_manager 
>>>> *device_queue_manager_init(struct kfd_dev *dev);
>>>>   void device_queue_manager_uninit(struct device_queue_manager *dqm);
>>>>   struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
>>>>                       enum kfd_queue_type type);
>>>> -void kernel_queue_uninit(struct kernel_queue *kq);
>>>> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging);
>>>>   int kfd_process_vm_fault(struct device_queue_manager *dqm, 
>>>> unsigned int pasid);
>>>>     /* Process Queue Manager */
>>>> @@ -974,7 +974,7 @@ extern const struct packet_manager_funcs 
>>>> kfd_vi_pm_funcs;
>>>>   extern const struct packet_manager_funcs kfd_v9_pm_funcs;
>>>>     int pm_init(struct packet_manager *pm, struct 
>>>> device_queue_manager *dqm);
>>>> -void pm_uninit(struct packet_manager *pm);
>>>> +void pm_uninit(struct packet_manager *pm, bool hanging);
>>>>   int pm_send_set_resources(struct packet_manager *pm,
>>>>                   struct scheduling_resources *res);
>>>>   int pm_send_runlist(struct packet_manager *pm, struct list_head 
>>>> *dqm_queues);
>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c 
>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>> index d3eacf72e8db..8fa856e6a03f 100644
>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>> @@ -374,7 +374,7 @@ int pqm_destroy_queue(struct 
>>>> process_queue_manager *pqm, unsigned int qid)
>>>>           /* destroy kernel queue (DIQ) */
>>>>           dqm = pqn->kq->dev->dqm;
>>>>           dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
>>>> -        kernel_queue_uninit(pqn->kq);
>>>> +        kernel_queue_uninit(pqn->kq, false);
>>>>       }
>>>>         if (pqn->q) {
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7Cfelix.kuehling%40amd.com%7C6df6fef2bf6a4208704e08d785652f27%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637124542109969638&amp;sdata=V0mHmgjUSP%2BauYL3r6PGU7aqFTQz8NkKMcuA5vXSkUQ%3D&amp;reserved=0 
>>>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch
  2019-12-20 19:46         ` Felix Kuehling
@ 2019-12-20 22:09           ` shaoyunl
  0 siblings, 0 replies; 17+ messages in thread
From: shaoyunl @ 2019-12-20 22:09 UTC (permalink / raw)
  To: Felix Kuehling, amd-gfx

I agree this patch is a big improvement , I think we need this patch so 
SRIOV can put the  amdkfd_pre_reset in right place as bare metal mode . 
The further improvement can be in separate change .

This series is reviewed by shaoyun.liu <shaoyun.liu@amd.com>


Regards

shaoyun.liu


On 2019-12-20 2:46 p.m., Felix Kuehling wrote:
> On 2019-12-20 14:31, shaoyunl wrote:
>> Can we use the  dqm_lock when we try to get the dqm->is_hw_hang and 
>> dqm->is_resetting inside function kq_uninitialize ?
>
> Spreading the DQM lock around is probably not a good idea. Then I'd 
> rather do more refactoring to move hqd_load and hqd_destroy out of the 
> kq_init/kq_uninit functions.
>
>
>>
>> I think more closer we check the status  to hqd_destroy it will be  
>> more accurate . It does look better with this logic if the status are 
>> changed after dqm unmap_queue call and  before we call hqd_destroy .
>>
>> Another comment in line.
>>
>> Regards
>>
>> shaoyun.liu
>>
>>
>>
>>
>> On 2019-12-20 11:33 a.m., Felix Kuehling wrote:
>>> dqm->is_hws_hang is protected by the DQM lock. kq_uninitialize runs 
>>> outside that lock protection. Therefore I opted to pass in the 
>>> hanging flag as a parameter. It also keeps the logic that decides 
>>> all of that inside the device queue manager, which I think is cleaner.
>>>
>>> I was trying to clean this up further by moving the 
>>> pm_init/pm_uninit out of the start_cpsch/stop_cpsch sequence, but 
>>> gave up on that idea when I found out that I can't create the kernel 
>>> queue in the DQM initialize function because dev->dqm isn't 
>>> initialized at that time yet.
>>>
>>> Regards,
>>>   Felix
>>>
>>> On 2019-12-20 10:56, shaoyunl wrote:
>>>> Looks like patch 2 is not related to this series, but anyway.
>>>>
>>>> Patch 1,2,3 are reviewed by shaoyunl <shaoyun.liu@amd.com>
>>>>
>>>> For patch 4 ,  is it possible we directly check dqm->is_hws_hang || 
>>>> dqm->is_resetting  inside function kq_uninitialize.  so we don't 
>>>> need other interface change .
>>>>
>>>> I think even Inside that kq_uninitialize function , we still can 
>>>> get dqm as  kq->dev->dqm .
>>>>
>>>>
>>>> shaoyun.liu
>>>>
>>>>
>>>> On 2019-12-20 3:30 a.m., Felix Kuehling wrote:
>>>>> Don't use the HWS if it's known to be hanging. In a reset also
>>>>> don't try to destroy the HIQ because that may hang on SRIOV if the
>>>>> KIQ is unresponsive.
>>>>>
>>>>> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>>>> ---
>>>>>   .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c    | 12 
>>>>> ++++++++----
>>>>>   drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c        | 8 ++++----
>>>>>   drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c      | 4 ++--
>>>>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h                | 4 ++--
>>>>>   .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c   | 2 +-
>>>>>   5 files changed, 17 insertions(+), 13 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>> index a7e9ec1b3ce3..d7eb6ac37f62 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>> @@ -946,7 +946,7 @@ static int start_nocpsch(struct 
>>>>> device_queue_manager *dqm)
>>>>>   static int stop_nocpsch(struct device_queue_manager *dqm)
>>>>>   {
>>>>>       if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>>>>> -        pm_uninit(&dqm->packets);
>>>>> +        pm_uninit(&dqm->packets, false);
>>>>>       dqm->sched_running = false;
>>>>>         return 0;
>>>>> @@ -1114,20 +1114,24 @@ static int start_cpsch(struct 
>>>>> device_queue_manager *dqm)
>>>>>       return 0;
>>>>>   fail_allocate_vidmem:
>>>>>   fail_set_sched_resources:
>>>>> -    pm_uninit(&dqm->packets);
>>>>> +    pm_uninit(&dqm->packets, false);
>>>>>   fail_packet_manager_init:
>>>>>       return retval;
>>>>>   }
>>>>>     static int stop_cpsch(struct device_queue_manager *dqm)
>>>>>   {
>>>>> +    bool hanging;
>>>>> +
>>>>>
>>>>>       dqm_lock(dqm);
>>>>> -    unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>>>>> +    if (!dqm->is_hws_hang)
>> [shaoyunl]  Should we check is_resetting here as well, so we skip
>> the unmap call even when HWS has not yet detected the hang but we know
>> we are currently in the reset procedure?
>
> GPU reset can be done when the HWS is not hanging. In that case 
> unmapping queues is perfectly safe. In the worst case it'll time out 
> and dqm->is_hws_hang will be set as a result. I'm planning to add more 
> checks later so that we can optionally wait in unmap_queues until a 
> reset is done. We'll need that to do preemptions reliably while a GPU 
> reset is in progress. So I need to either unmap the queues or be sure 
> that HWS is hanging.
>
> With yours and Oak's comments I realize, this is far from complete and 
> more work is needed. But I still think this is an improvement.
>
> Regards,
>   Felix
>
>
>>>>> +        unmap_queues_cpsch(dqm, 
>>>>> KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>>>>> +    hanging = dqm->is_hws_hang || dqm->is_resetting;
>>>>>       dqm->sched_running = false;
>>>>>       dqm_unlock(dqm);
>>>>>         kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
>>>>> -    pm_uninit(&dqm->packets);
>>>>> +    pm_uninit(&dqm->packets, hanging);
>>>>>         return 0;
>>>>>   }
>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c 
>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>>>>> index 2d56dc534459..bae706462f96 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>>>>> @@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue 
>>>>> *kq, struct kfd_dev *dev,
>>>>>   }
>>>>>     /* Uninitialize a kernel queue and free all its memory usages. */
>>>>> -static void kq_uninitialize(struct kernel_queue *kq)
>>>>> +static void kq_uninitialize(struct kernel_queue *kq, bool hanging)
>>>>>   {
>>>>> -    if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
>>>>> +    if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && 
>>>>> !hanging)
>>>>>           kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
>>>>>                       kq->queue->mqd,
>>>>>                       KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
>>>>> @@ -337,9 +337,9 @@ struct kernel_queue *kernel_queue_init(struct 
>>>>> kfd_dev *dev,
>>>>>       return NULL;
>>>>>   }
>>>>>   -void kernel_queue_uninit(struct kernel_queue *kq)
>>>>> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging)
>>>>>   {
>>>>> -    kq_uninitialize(kq);
>>>>> +    kq_uninitialize(kq, hanging);
>>>>>       kfree(kq);
>>>>>   }
>>>>>   diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c 
>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>>>>> index 6cabed06ef5d..dc406e6dee23 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>>>>> @@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm, 
>>>>> struct device_queue_manager *dqm)
>>>>>       return 0;
>>>>>   }
>>>>>   -void pm_uninit(struct packet_manager *pm)
>>>>> +void pm_uninit(struct packet_manager *pm, bool hanging)
>>>>>   {
>>>>>       mutex_destroy(&pm->lock);
>>>>> -    kernel_queue_uninit(pm->priv_queue);
>>>>> +    kernel_queue_uninit(pm->priv_queue, hanging);
>>>>>   }
>>>>>     int pm_send_set_resources(struct packet_manager *pm,
>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>> index 087e96838997..8ac680dc90f1 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>> @@ -883,7 +883,7 @@ struct device_queue_manager 
>>>>> *device_queue_manager_init(struct kfd_dev *dev);
>>>>>   void device_queue_manager_uninit(struct device_queue_manager *dqm);
>>>>>   struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
>>>>>                       enum kfd_queue_type type);
>>>>> -void kernel_queue_uninit(struct kernel_queue *kq);
>>>>> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging);
>>>>>   int kfd_process_vm_fault(struct device_queue_manager *dqm, 
>>>>> unsigned int pasid);
>>>>>     /* Process Queue Manager */
>>>>> @@ -974,7 +974,7 @@ extern const struct packet_manager_funcs 
>>>>> kfd_vi_pm_funcs;
>>>>>   extern const struct packet_manager_funcs kfd_v9_pm_funcs;
>>>>>     int pm_init(struct packet_manager *pm, struct 
>>>>> device_queue_manager *dqm);
>>>>> -void pm_uninit(struct packet_manager *pm);
>>>>> +void pm_uninit(struct packet_manager *pm, bool hanging);
>>>>>   int pm_send_set_resources(struct packet_manager *pm,
>>>>>                   struct scheduling_resources *res);
>>>>>   int pm_send_runlist(struct packet_manager *pm, struct list_head 
>>>>> *dqm_queues);
>>>>> diff --git 
>>>>> a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c 
>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>>> index d3eacf72e8db..8fa856e6a03f 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>>> @@ -374,7 +374,7 @@ int pqm_destroy_queue(struct 
>>>>> process_queue_manager *pqm, unsigned int qid)
>>>>>           /* destroy kernel queue (DIQ) */
>>>>>           dqm = pqn->kq->dev->dqm;
>>>>>           dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
>>>>> -        kernel_queue_uninit(pqn->kq);
>>>>> +        kernel_queue_uninit(pqn->kq, false);
>>>>>       }
>>>>>         if (pqn->q) {
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7Cfelix.kuehling%40amd.com%7C6df6fef2bf6a4208704e08d785652f27%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637124542109969638&amp;sdata=V0mHmgjUSP%2BauYL3r6PGU7aqFTQz8NkKMcuA5vXSkUQ%3D&amp;reserved=0 
>>>>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* RE: [PATCH 1/4] drm/amdkfd: Fix permissions of hang_hws
  2019-12-20  8:29 [PATCH 1/4] drm/amdkfd: Fix permissions of hang_hws Felix Kuehling
                   ` (2 preceding siblings ...)
  2019-12-20  8:30 ` [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch Felix Kuehling
@ 2020-01-02 13:39 ` Russell, Kent
  3 siblings, 0 replies; 17+ messages in thread
From: Russell, Kent @ 2020-01-02 13:39 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx

[AMD Official Use Only - Internal Distribution Only]

Reviewed-by: Kent Russell <kent.russell@amd.com>

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Felix Kuehling
Sent: Friday, December 20, 2019 3:30 AM
To: amd-gfx@lists.freedesktop.org
Subject: [PATCH 1/4] drm/amdkfd: Fix permissions of hang_hws

Reading from /sys/kernel/debug/kfd/hang_hws would cause a kernel oops because we didn't implement a read callback. Set the permission to write-only to prevent that.

Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
index 15c523027285..511712c2e382 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
@@ -93,7 +93,7 @@ void kfd_debugfs_init(void)
 			    kfd_debugfs_hqds_by_device, &kfd_debugfs_fops);
 	debugfs_create_file("rls", S_IFREG | 0444, debugfs_root,
 			    kfd_debugfs_rls_by_device, &kfd_debugfs_fops);
-	debugfs_create_file("hang_hws", S_IFREG | 0644, debugfs_root,
+	debugfs_create_file("hang_hws", S_IFREG | 0200, debugfs_root,
 			    NULL, &kfd_debugfs_hang_hws_fops);  }
 
--
2.24.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7Ckent.russell%40amd.com%7C7dc16d27ccf646dd676608d78526df14%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637124274407198344&amp;sdata=joT5cPhL9glNqV85jTIqvygQ%2B6CSOprnadM8LzaRBQs%3D&amp;reserved=0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* RE: [PATCH 2/4] drm/amdkfd: Remove unused variable
  2019-12-20  8:29 ` [PATCH 2/4] drm/amdkfd: Remove unused variable Felix Kuehling
@ 2020-01-02 13:40   ` Russell, Kent
  0 siblings, 0 replies; 17+ messages in thread
From: Russell, Kent @ 2020-01-02 13:40 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx

[AMD Official Use Only - Internal Distribution Only]

Reviewed-by: Kent Russell <kent.russell@amd.com>

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Felix Kuehling
Sent: Friday, December 20, 2019 3:30 AM
To: amd-gfx@lists.freedesktop.org
Subject: [PATCH 2/4] drm/amdkfd: Remove unused variable

dqm->pipeline_mem wasn't used anywhere.

Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 1 -  drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | 1 -
 2 files changed, 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index f7f6df40875e..558c0ad81848 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -930,7 +930,6 @@ static void uninitialize(struct device_queue_manager *dqm)
 	for (i = 0 ; i < KFD_MQD_TYPE_MAX ; i++)
 		kfree(dqm->mqd_mgrs[i]);
 	mutex_destroy(&dqm->lock_hidden);
-	kfd_gtt_sa_free(dqm->dev, dqm->pipeline_mem);
 }
 
 static int start_nocpsch(struct device_queue_manager *dqm) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index a8c37e6da027..8991120c4fa2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -190,7 +190,6 @@ struct device_queue_manager {
 	/* the pasid mapping for each kfd vmid */
 	uint16_t		vmid_pasid[VMID_NUM];
 	uint64_t		pipelines_addr;
-	struct kfd_mem_obj	*pipeline_mem;
 	uint64_t		fence_gpu_addr;
 	unsigned int		*fence_addr;
 	struct kfd_mem_obj	*fence_mem;
--
2.24.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7Ckent.russell%40amd.com%7C1c4037940973425f605808d78526de75%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637124274380882459&amp;sdata=0SF%2F%2FKC%2BEOK%2FzFNDwQBltE5%2F9euhi7IrbKNPp8dlDrQ%3D&amp;reserved=0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2020-01-02 13:54 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-12-20  8:29 [PATCH 1/4] drm/amdkfd: Fix permissions of hang_hws Felix Kuehling
2019-12-20  8:29 ` [PATCH 2/4] drm/amdkfd: Remove unused variable Felix Kuehling
2020-01-02 13:40   ` Russell, Kent
2019-12-20  8:30 ` [PATCH 3/4] drm/amdkfd: Improve HWS hang detection and handling Felix Kuehling
2019-12-20 17:18   ` Zeng, Oak
2019-12-20 17:49     ` Felix Kuehling
2019-12-20  8:30 ` [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch Felix Kuehling
2019-12-20 10:03   ` Deng, Emily
2019-12-20 15:56   ` shaoyunl
2019-12-20 16:33     ` Felix Kuehling
2019-12-20 19:31       ` shaoyunl
2019-12-20 19:46         ` Felix Kuehling
2019-12-20 22:09           ` shaoyunl
2019-12-20 17:22   ` Zeng, Oak
2019-12-20 17:27     ` Felix Kuehling
2019-12-20 17:29       ` Zeng, Oak
2020-01-02 13:39 ` [PATCH 1/4] drm/amdkfd: Fix permissions of hang_hws Russell, Kent

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.