* [PATCH 1/4] drm/amdkfd: Fix permissions of hang_hws
@ 2019-12-20 8:29 Felix Kuehling
2019-12-20 8:29 ` [PATCH 2/4] drm/amdkfd: Remove unused variable Felix Kuehling
` (3 more replies)
0 siblings, 4 replies; 17+ messages in thread
From: Felix Kuehling @ 2019-12-20 8:29 UTC (permalink / raw)
To: amd-gfx
Reading from /sys/kernel/debug/kfd/hang_hws would cause a kernel
oops because we didn't implement a read callback. Set the permission
to write-only to prevent that.
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
index 15c523027285..511712c2e382 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
@@ -93,7 +93,7 @@ void kfd_debugfs_init(void)
kfd_debugfs_hqds_by_device, &kfd_debugfs_fops);
debugfs_create_file("rls", S_IFREG | 0444, debugfs_root,
kfd_debugfs_rls_by_device, &kfd_debugfs_fops);
- debugfs_create_file("hang_hws", S_IFREG | 0644, debugfs_root,
+ debugfs_create_file("hang_hws", S_IFREG | 0200, debugfs_root,
NULL, &kfd_debugfs_hang_hws_fops);
}
--
2.24.1
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply related [flat|nested] 17+ messages in thread
* [PATCH 2/4] drm/amdkfd: Remove unused variable
2019-12-20 8:29 [PATCH 1/4] drm/amdkfd: Fix permissions of hang_hws Felix Kuehling
@ 2019-12-20 8:29 ` Felix Kuehling
2020-01-02 13:40 ` Russell, Kent
2019-12-20 8:30 ` [PATCH 3/4] drm/amdkfd: Improve HWS hang detection and handling Felix Kuehling
` (2 subsequent siblings)
3 siblings, 1 reply; 17+ messages in thread
From: Felix Kuehling @ 2019-12-20 8:29 UTC (permalink / raw)
To: amd-gfx
dqm->pipeline_mem wasn't used anywhere.
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 1 -
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | 1 -
2 files changed, 2 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index f7f6df40875e..558c0ad81848 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -930,7 +930,6 @@ static void uninitialize(struct device_queue_manager *dqm)
for (i = 0 ; i < KFD_MQD_TYPE_MAX ; i++)
kfree(dqm->mqd_mgrs[i]);
mutex_destroy(&dqm->lock_hidden);
- kfd_gtt_sa_free(dqm->dev, dqm->pipeline_mem);
}
static int start_nocpsch(struct device_queue_manager *dqm)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index a8c37e6da027..8991120c4fa2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -190,7 +190,6 @@ struct device_queue_manager {
/* the pasid mapping for each kfd vmid */
uint16_t vmid_pasid[VMID_NUM];
uint64_t pipelines_addr;
- struct kfd_mem_obj *pipeline_mem;
uint64_t fence_gpu_addr;
unsigned int *fence_addr;
struct kfd_mem_obj *fence_mem;
--
2.24.1
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply related [flat|nested] 17+ messages in thread
* [PATCH 3/4] drm/amdkfd: Improve HWS hang detection and handling
2019-12-20 8:29 [PATCH 1/4] drm/amdkfd: Fix permissions of hang_hws Felix Kuehling
2019-12-20 8:29 ` [PATCH 2/4] drm/amdkfd: Remove unused variable Felix Kuehling
@ 2019-12-20 8:30 ` Felix Kuehling
2019-12-20 17:18 ` Zeng, Oak
2019-12-20 8:30 ` [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch Felix Kuehling
2020-01-02 13:39 ` [PATCH 1/4] drm/amdkfd: Fix permissions of hang_hws Russell, Kent
3 siblings, 1 reply; 17+ messages in thread
From: Felix Kuehling @ 2019-12-20 8:30 UTC (permalink / raw)
To: amd-gfx
Move HWS hang detection into unmap_queues_cpsch to catch hangs in all
cases. If this happens during a reset, don't schedule another reset
because the reset already in progress is expected to take care of it.
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
drivers/gpu/drm/amd/amdkfd/kfd_device.c | 3 +++
.../drm/amd/amdkfd/kfd_device_queue_manager.c | 27 ++++++++++++++-----
.../drm/amd/amdkfd/kfd_device_queue_manager.h | 2 ++
3 files changed, 26 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index c6b6901bbda3..2a9e40131735 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -728,6 +728,9 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
{
if (!kfd->init_complete)
return 0;
+
+ kfd->dqm->ops.pre_reset(kfd->dqm);
+
kgd2kfd_suspend(kfd);
kfd_signal_reset_event(kfd);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 558c0ad81848..a7e9ec1b3ce3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -952,6 +952,13 @@ static int stop_nocpsch(struct device_queue_manager *dqm)
return 0;
}
+static void pre_reset(struct device_queue_manager *dqm)
+{
+ dqm_lock(dqm);
+ dqm->is_resetting = true;
+ dqm_unlock(dqm);
+}
+
static int allocate_sdma_queue(struct device_queue_manager *dqm,
struct queue *q)
{
@@ -1099,6 +1106,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
dqm_lock(dqm);
/* clear hang status when driver try to start the hw scheduler */
dqm->is_hws_hang = false;
+ dqm->is_resetting = false;
dqm->sched_running = true;
execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
dqm_unlock(dqm);
@@ -1351,8 +1359,17 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
/* should be timed out */
retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED,
queue_preemption_timeout_ms);
- if (retval)
+ if (retval) {
+ pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
+ dqm->is_hws_hang = true;
+ /* It's possible we're detecting a HWS hang in the
+ * middle of a GPU reset. No need to schedule another
+ * reset in this case.
+ */
+ if (!dqm->is_resetting)
+ schedule_work(&dqm->hw_exception_work);
return retval;
+ }
pm_release_ib(&dqm->packets);
dqm->active_runlist = false;
@@ -1370,12 +1387,8 @@ static int execute_queues_cpsch(struct device_queue_manager *dqm,
if (dqm->is_hws_hang)
return -EIO;
retval = unmap_queues_cpsch(dqm, filter, filter_param);
- if (retval) {
- pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
- dqm->is_hws_hang = true;
- schedule_work(&dqm->hw_exception_work);
+ if (retval)
return retval;
- }
return map_queues_cpsch(dqm);
}
@@ -1769,6 +1782,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
dqm->ops.initialize = initialize_cpsch;
dqm->ops.start = start_cpsch;
dqm->ops.stop = stop_cpsch;
+ dqm->ops.pre_reset = pre_reset;
dqm->ops.destroy_queue = destroy_queue_cpsch;
dqm->ops.update_queue = update_queue;
dqm->ops.register_process = register_process;
@@ -1787,6 +1801,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
/* initialize dqm for no cp scheduling */
dqm->ops.start = start_nocpsch;
dqm->ops.stop = stop_nocpsch;
+ dqm->ops.pre_reset = pre_reset;
dqm->ops.create_queue = create_queue_nocpsch;
dqm->ops.destroy_queue = destroy_queue_nocpsch;
dqm->ops.update_queue = update_queue;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 8991120c4fa2..871d3b628d2d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -104,6 +104,7 @@ struct device_queue_manager_ops {
int (*initialize)(struct device_queue_manager *dqm);
int (*start)(struct device_queue_manager *dqm);
int (*stop)(struct device_queue_manager *dqm);
+ void (*pre_reset)(struct device_queue_manager *dqm);
void (*uninitialize)(struct device_queue_manager *dqm);
int (*create_kernel_queue)(struct device_queue_manager *dqm,
struct kernel_queue *kq,
@@ -198,6 +199,7 @@ struct device_queue_manager {
/* hw exception */
bool is_hws_hang;
+ bool is_resetting;
struct work_struct hw_exception_work;
struct kfd_mem_obj hiq_sdma_mqd;
bool sched_running;
--
2.24.1
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply related [flat|nested] 17+ messages in thread
* [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch
2019-12-20 8:29 [PATCH 1/4] drm/amdkfd: Fix permissions of hang_hws Felix Kuehling
2019-12-20 8:29 ` [PATCH 2/4] drm/amdkfd: Remove unused variable Felix Kuehling
2019-12-20 8:30 ` [PATCH 3/4] drm/amdkfd: Improve HWS hang detection and handling Felix Kuehling
@ 2019-12-20 8:30 ` Felix Kuehling
2019-12-20 10:03 ` Deng, Emily
` (2 more replies)
2020-01-02 13:39 ` [PATCH 1/4] drm/amdkfd: Fix permissions of hang_hws Russell, Kent
3 siblings, 3 replies; 17+ messages in thread
From: Felix Kuehling @ 2019-12-20 8:30 UTC (permalink / raw)
To: amd-gfx
Don't use the HWS if it's known to be hanging. In a reset also
don't try to destroy the HIQ because that may hang on SRIOV if the
KIQ is unresponsive.
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
.../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 12 ++++++++----
drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 8 ++++----
drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 4 ++--
drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 4 ++--
.../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 2 +-
5 files changed, 17 insertions(+), 13 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index a7e9ec1b3ce3..d7eb6ac37f62 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -946,7 +946,7 @@ static int start_nocpsch(struct device_queue_manager *dqm)
static int stop_nocpsch(struct device_queue_manager *dqm)
{
if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
- pm_uninit(&dqm->packets);
+ pm_uninit(&dqm->packets, false);
dqm->sched_running = false;
return 0;
@@ -1114,20 +1114,24 @@ static int start_cpsch(struct device_queue_manager *dqm)
return 0;
fail_allocate_vidmem:
fail_set_sched_resources:
- pm_uninit(&dqm->packets);
+ pm_uninit(&dqm->packets, false);
fail_packet_manager_init:
return retval;
}
static int stop_cpsch(struct device_queue_manager *dqm)
{
+ bool hanging;
+
dqm_lock(dqm);
- unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
+ if (!dqm->is_hws_hang)
+ unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
+ hanging = dqm->is_hws_hang || dqm->is_resetting;
dqm->sched_running = false;
dqm_unlock(dqm);
kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
- pm_uninit(&dqm->packets);
+ pm_uninit(&dqm->packets, hanging);
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
index 2d56dc534459..bae706462f96 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
@@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue *kq, struct kfd_dev *dev,
}
/* Uninitialize a kernel queue and free all its memory usages. */
-static void kq_uninitialize(struct kernel_queue *kq)
+static void kq_uninitialize(struct kernel_queue *kq, bool hanging)
{
- if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
+ if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !hanging)
kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
kq->queue->mqd,
KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
@@ -337,9 +337,9 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
return NULL;
}
-void kernel_queue_uninit(struct kernel_queue *kq)
+void kernel_queue_uninit(struct kernel_queue *kq, bool hanging)
{
- kq_uninitialize(kq);
+ kq_uninitialize(kq, hanging);
kfree(kq);
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
index 6cabed06ef5d..dc406e6dee23 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
@@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
return 0;
}
-void pm_uninit(struct packet_manager *pm)
+void pm_uninit(struct packet_manager *pm, bool hanging)
{
mutex_destroy(&pm->lock);
- kernel_queue_uninit(pm->priv_queue);
+ kernel_queue_uninit(pm->priv_queue, hanging);
}
int pm_send_set_resources(struct packet_manager *pm,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 087e96838997..8ac680dc90f1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -883,7 +883,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev);
void device_queue_manager_uninit(struct device_queue_manager *dqm);
struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
enum kfd_queue_type type);
-void kernel_queue_uninit(struct kernel_queue *kq);
+void kernel_queue_uninit(struct kernel_queue *kq, bool hanging);
int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid);
/* Process Queue Manager */
@@ -974,7 +974,7 @@ extern const struct packet_manager_funcs kfd_vi_pm_funcs;
extern const struct packet_manager_funcs kfd_v9_pm_funcs;
int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm);
-void pm_uninit(struct packet_manager *pm);
+void pm_uninit(struct packet_manager *pm, bool hanging);
int pm_send_set_resources(struct packet_manager *pm,
struct scheduling_resources *res);
int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index d3eacf72e8db..8fa856e6a03f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -374,7 +374,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
/* destroy kernel queue (DIQ) */
dqm = pqn->kq->dev->dqm;
dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
- kernel_queue_uninit(pqn->kq);
+ kernel_queue_uninit(pqn->kq, false);
}
if (pqn->q) {
--
2.24.1
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply related [flat|nested] 17+ messages in thread
* RE: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch
2019-12-20 8:30 ` [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch Felix Kuehling
@ 2019-12-20 10:03 ` Deng, Emily
2019-12-20 15:56 ` shaoyunl
2019-12-20 17:22 ` Zeng, Oak
2 siblings, 0 replies; 17+ messages in thread
From: Deng, Emily @ 2019-12-20 10:03 UTC (permalink / raw)
To: Kuehling, Felix, amd-gfx
[AMD Official Use Only - Internal Distribution Only]
Series Tested-by: Emily Deng <Emily.Deng@amd.com> on sriov environment with vega10 about TDR-1, TDR-2 and TDR-3 test cases.
Best wishes
Emily Deng
>-----Original Message-----
>From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Felix
>Kuehling
>Sent: Friday, December 20, 2019 4:30 PM
>To: amd-gfx@lists.freedesktop.org
>Subject: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch
>
>Don't use the HWS if it's known to be hanging. In a reset also don't try to
>destroy the HIQ because that may hang on SRIOV if the KIQ is unresponsive.
>
>Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
>---
> .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 12 ++++++++----
> drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 8 ++++----
> drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 4 ++--
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 4 ++--
> .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 2 +-
> 5 files changed, 17 insertions(+), 13 deletions(-)
>
>diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>index a7e9ec1b3ce3..d7eb6ac37f62 100644
>--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>@@ -946,7 +946,7 @@ static int start_nocpsch(struct device_queue_manager
>*dqm) static int stop_nocpsch(struct device_queue_manager *dqm) {
> if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>- pm_uninit(&dqm->packets);
>+ pm_uninit(&dqm->packets, false);
> dqm->sched_running = false;
>
> return 0;
>@@ -1114,20 +1114,24 @@ static int start_cpsch(struct
>device_queue_manager *dqm)
> return 0;
> fail_allocate_vidmem:
> fail_set_sched_resources:
>- pm_uninit(&dqm->packets);
>+ pm_uninit(&dqm->packets, false);
> fail_packet_manager_init:
> return retval;
> }
>
> static int stop_cpsch(struct device_queue_manager *dqm) {
>+ bool hanging;
>+
> dqm_lock(dqm);
>- unmap_queues_cpsch(dqm,
>KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>+ if (!dqm->is_hws_hang)
>+ unmap_queues_cpsch(dqm,
>KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>+ hanging = dqm->is_hws_hang || dqm->is_resetting;
> dqm->sched_running = false;
> dqm_unlock(dqm);
>
> kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
>- pm_uninit(&dqm->packets);
>+ pm_uninit(&dqm->packets, hanging);
>
> return 0;
> }
>diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>index 2d56dc534459..bae706462f96 100644
>--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>@@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue *kq, struct
>kfd_dev *dev, }
>
> /* Uninitialize a kernel queue and free all its memory usages. */ -static void
>kq_uninitialize(struct kernel_queue *kq)
>+static void kq_uninitialize(struct kernel_queue *kq, bool hanging)
> {
>- if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
>+ if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !hanging)
> kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
> kq->queue->mqd,
>
> KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
>@@ -337,9 +337,9 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev
>*dev,
> return NULL;
> }
>
>-void kernel_queue_uninit(struct kernel_queue *kq)
>+void kernel_queue_uninit(struct kernel_queue *kq, bool hanging)
> {
>- kq_uninitialize(kq);
>+ kq_uninitialize(kq, hanging);
> kfree(kq);
> }
>
>diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>index 6cabed06ef5d..dc406e6dee23 100644
>--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>@@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm, struct
>device_queue_manager *dqm)
> return 0;
> }
>
>-void pm_uninit(struct packet_manager *pm)
>+void pm_uninit(struct packet_manager *pm, bool hanging)
> {
> mutex_destroy(&pm->lock);
>- kernel_queue_uninit(pm->priv_queue);
>+ kernel_queue_uninit(pm->priv_queue, hanging);
> }
>
> int pm_send_set_resources(struct packet_manager *pm, diff --git
>a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>index 087e96838997..8ac680dc90f1 100644
>--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>@@ -883,7 +883,7 @@ struct device_queue_manager
>*device_queue_manager_init(struct kfd_dev *dev); void
>device_queue_manager_uninit(struct device_queue_manager *dqm); struct
>kernel_queue *kernel_queue_init(struct kfd_dev *dev,
> enum kfd_queue_type type);
>-void kernel_queue_uninit(struct kernel_queue *kq);
>+void kernel_queue_uninit(struct kernel_queue *kq, bool hanging);
> int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int
>pasid);
>
> /* Process Queue Manager */
>@@ -974,7 +974,7 @@ extern const struct packet_manager_funcs
>kfd_vi_pm_funcs; extern const struct packet_manager_funcs kfd_v9_pm_funcs;
>
> int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm);
>-void pm_uninit(struct packet_manager *pm);
>+void pm_uninit(struct packet_manager *pm, bool hanging);
> int pm_send_set_resources(struct packet_manager *pm,
> struct scheduling_resources *res);
> int pm_send_runlist(struct packet_manager *pm, struct list_head
>*dqm_queues); diff --git
>a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>index d3eacf72e8db..8fa856e6a03f 100644
>--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>@@ -374,7 +374,7 @@ int pqm_destroy_queue(struct
>process_queue_manager *pqm, unsigned int qid)
> /* destroy kernel queue (DIQ) */
> dqm = pqn->kq->dev->dqm;
> dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
>- kernel_queue_uninit(pqn->kq);
>+ kernel_queue_uninit(pqn->kq, false);
> }
>
> if (pqn->q) {
>--
>2.24.1
>
>_______________________________________________
>amd-gfx mailing list
>amd-gfx@lists.freedesktop.org
>https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.fre
>edesktop.org%2Fmailman%2Flistinfo%2Famd-
>gfx&data=02%7C01%7CEmily.Deng%40amd.com%7C3c77bba4d40d4bc6b
>e8508d78526dd45%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C
>637124274794842900&sdata=vHNAs2FTkSpHYZ2TTux%2F66attN4lf5qSiP
>jnlBOM5y0%3D&reserved=0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch
2019-12-20 8:30 ` [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch Felix Kuehling
2019-12-20 10:03 ` Deng, Emily
@ 2019-12-20 15:56 ` shaoyunl
2019-12-20 16:33 ` Felix Kuehling
2019-12-20 17:22 ` Zeng, Oak
2 siblings, 1 reply; 17+ messages in thread
From: shaoyunl @ 2019-12-20 15:56 UTC (permalink / raw)
To: amd-gfx
Looks like patch 2 is not related to this series, but anyway.
Patch 1,2,3 are reviewed by shaoyunl <shaoyun.liu@amd.com>
For patch 4 , is it possible we directly check dqm->is_hws_hang ||
dqm->is_resetting inside function kq_uninitialize. so we don't need
other interface change .
I think even Inside that kq_uninitialize function , we still can get dqm
as kq->dev->dqm .
shaoyun.liu
On 2019-12-20 3:30 a.m., Felix Kuehling wrote:
> Don't use the HWS if it's known to be hanging. In a reset also
> don't try to destroy the HIQ because that may hang on SRIOV if the
> KIQ is unresponsive.
>
> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
> ---
> .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 12 ++++++++----
> drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 8 ++++----
> drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 4 ++--
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 4 ++--
> .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 2 +-
> 5 files changed, 17 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index a7e9ec1b3ce3..d7eb6ac37f62 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -946,7 +946,7 @@ static int start_nocpsch(struct device_queue_manager *dqm)
> static int stop_nocpsch(struct device_queue_manager *dqm)
> {
> if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
> - pm_uninit(&dqm->packets);
> + pm_uninit(&dqm->packets, false);
> dqm->sched_running = false;
>
> return 0;
> @@ -1114,20 +1114,24 @@ static int start_cpsch(struct device_queue_manager *dqm)
> return 0;
> fail_allocate_vidmem:
> fail_set_sched_resources:
> - pm_uninit(&dqm->packets);
> + pm_uninit(&dqm->packets, false);
> fail_packet_manager_init:
> return retval;
> }
>
> static int stop_cpsch(struct device_queue_manager *dqm)
> {
> + bool hanging;
> +
>
> dqm_lock(dqm);
> - unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
> + if (!dqm->is_hws_hang)
> + unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
> + hanging = dqm->is_hws_hang || dqm->is_resetting;
> dqm->sched_running = false;
> dqm_unlock(dqm);
>
> kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
> - pm_uninit(&dqm->packets);
> + pm_uninit(&dqm->packets, hanging);
>
> return 0;
> }
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> index 2d56dc534459..bae706462f96 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> @@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue *kq, struct kfd_dev *dev,
> }
>
> /* Uninitialize a kernel queue and free all its memory usages. */
> -static void kq_uninitialize(struct kernel_queue *kq)
> +static void kq_uninitialize(struct kernel_queue *kq, bool hanging)
> {
> - if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
> + if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !hanging)
> kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
> kq->queue->mqd,
> KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
> @@ -337,9 +337,9 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
> return NULL;
> }
>
> -void kernel_queue_uninit(struct kernel_queue *kq)
> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging)
> {
> - kq_uninitialize(kq);
> + kq_uninitialize(kq, hanging);
> kfree(kq);
> }
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> index 6cabed06ef5d..dc406e6dee23 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> @@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
> return 0;
> }
>
> -void pm_uninit(struct packet_manager *pm)
> +void pm_uninit(struct packet_manager *pm, bool hanging)
> {
> mutex_destroy(&pm->lock);
> - kernel_queue_uninit(pm->priv_queue);
> + kernel_queue_uninit(pm->priv_queue, hanging);
> }
>
> int pm_send_set_resources(struct packet_manager *pm,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 087e96838997..8ac680dc90f1 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -883,7 +883,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev);
> void device_queue_manager_uninit(struct device_queue_manager *dqm);
> struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
> enum kfd_queue_type type);
> -void kernel_queue_uninit(struct kernel_queue *kq);
> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging);
> int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid);
>
> /* Process Queue Manager */
> @@ -974,7 +974,7 @@ extern const struct packet_manager_funcs kfd_vi_pm_funcs;
> extern const struct packet_manager_funcs kfd_v9_pm_funcs;
>
> int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm);
> -void pm_uninit(struct packet_manager *pm);
> +void pm_uninit(struct packet_manager *pm, bool hanging);
> int pm_send_set_resources(struct packet_manager *pm,
> struct scheduling_resources *res);
> int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index d3eacf72e8db..8fa856e6a03f 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -374,7 +374,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
> /* destroy kernel queue (DIQ) */
> dqm = pqn->kq->dev->dqm;
> dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
> - kernel_queue_uninit(pqn->kq);
> + kernel_queue_uninit(pqn->kq, false);
> }
>
> if (pqn->q) {
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch
2019-12-20 15:56 ` shaoyunl
@ 2019-12-20 16:33 ` Felix Kuehling
2019-12-20 19:31 ` shaoyunl
0 siblings, 1 reply; 17+ messages in thread
From: Felix Kuehling @ 2019-12-20 16:33 UTC (permalink / raw)
To: shaoyunl, amd-gfx
dqm->is_hws_hang is protected by the DQM lock. kq_uninitialize runs
outside that lock protection. Therefore I opted to pass in the hanging
flag as a parameter. It also keeps the logic that decides all of that
inside the device queue manager, which I think is cleaner.
I was trying to clean this up further by moving the pm_init/pm_uninit
out of the start_cpsch/stop_cpsch sequence, but gave up on that idea
when I found out that I can't create the kernel queue in the DQM
initialize function because dev->dqm isn't initialized at that time yet.
Regards,
Felix
On 2019-12-20 10:56, shaoyunl wrote:
> Looks like patch 2 is not related to this series , but anyway .
>
> Patch 1,2,3 are reviewed by shaoyunl <shaoyun.liu@amd.com>
>
> For patch 4 , is it possible we directly check dqm->is_hws_hang ||
> dqm->is_resetting inside function kq_uninitialize. so we don't need
> other interface change .
>
> I think even Inside that kq_uninitialize function , we still can get
> dqm as kq->dev->dqm .
>
>
> shaoyun.liu
>
>
> On 2019-12-20 3:30 a.m., Felix Kuehling wrote:
>> Don't use the HWS if it's known to be hanging. In a reset also
>> don't try to destroy the HIQ because that may hang on SRIOV if the
>> KIQ is unresponsive.
>>
>> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
>> ---
>> .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 12 ++++++++----
>> drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 8 ++++----
>> drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 4 ++--
>> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 4 ++--
>> .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 2 +-
>> 5 files changed, 17 insertions(+), 13 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> index a7e9ec1b3ce3..d7eb6ac37f62 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> @@ -946,7 +946,7 @@ static int start_nocpsch(struct
>> device_queue_manager *dqm)
>> static int stop_nocpsch(struct device_queue_manager *dqm)
>> {
>> if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>> - pm_uninit(&dqm->packets);
>> + pm_uninit(&dqm->packets, false);
>> dqm->sched_running = false;
>> return 0;
>> @@ -1114,20 +1114,24 @@ static int start_cpsch(struct
>> device_queue_manager *dqm)
>> return 0;
>> fail_allocate_vidmem:
>> fail_set_sched_resources:
>> - pm_uninit(&dqm->packets);
>> + pm_uninit(&dqm->packets, false);
>> fail_packet_manager_init:
>> return retval;
>> }
>> static int stop_cpsch(struct device_queue_manager *dqm)
>> {
>> + bool hanging;
>> +
>>
>> dqm_lock(dqm);
>> - unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>> + if (!dqm->is_hws_hang)
>> + unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>> + hanging = dqm->is_hws_hang || dqm->is_resetting;
>> dqm->sched_running = false;
>> dqm_unlock(dqm);
>> kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
>> - pm_uninit(&dqm->packets);
>> + pm_uninit(&dqm->packets, hanging);
>> return 0;
>> }
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>> index 2d56dc534459..bae706462f96 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>> @@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue
>> *kq, struct kfd_dev *dev,
>> }
>> /* Uninitialize a kernel queue and free all its memory usages. */
>> -static void kq_uninitialize(struct kernel_queue *kq)
>> +static void kq_uninitialize(struct kernel_queue *kq, bool hanging)
>> {
>> - if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
>> + if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !hanging)
>> kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
>> kq->queue->mqd,
>> KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
>> @@ -337,9 +337,9 @@ struct kernel_queue *kernel_queue_init(struct
>> kfd_dev *dev,
>> return NULL;
>> }
>> -void kernel_queue_uninit(struct kernel_queue *kq)
>> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging)
>> {
>> - kq_uninitialize(kq);
>> + kq_uninitialize(kq, hanging);
>> kfree(kq);
>> }
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>> index 6cabed06ef5d..dc406e6dee23 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>> @@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm, struct
>> device_queue_manager *dqm)
>> return 0;
>> }
>> -void pm_uninit(struct packet_manager *pm)
>> +void pm_uninit(struct packet_manager *pm, bool hanging)
>> {
>> mutex_destroy(&pm->lock);
>> - kernel_queue_uninit(pm->priv_queue);
>> + kernel_queue_uninit(pm->priv_queue, hanging);
>> }
>> int pm_send_set_resources(struct packet_manager *pm,
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> index 087e96838997..8ac680dc90f1 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> @@ -883,7 +883,7 @@ struct device_queue_manager
>> *device_queue_manager_init(struct kfd_dev *dev);
>> void device_queue_manager_uninit(struct device_queue_manager *dqm);
>> struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
>> enum kfd_queue_type type);
>> -void kernel_queue_uninit(struct kernel_queue *kq);
>> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging);
>> int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned
>> int pasid);
>> /* Process Queue Manager */
>> @@ -974,7 +974,7 @@ extern const struct packet_manager_funcs
>> kfd_vi_pm_funcs;
>> extern const struct packet_manager_funcs kfd_v9_pm_funcs;
>> int pm_init(struct packet_manager *pm, struct
>> device_queue_manager *dqm);
>> -void pm_uninit(struct packet_manager *pm);
>> +void pm_uninit(struct packet_manager *pm, bool hanging);
>> int pm_send_set_resources(struct packet_manager *pm,
>> struct scheduling_resources *res);
>> int pm_send_runlist(struct packet_manager *pm, struct list_head
>> *dqm_queues);
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> index d3eacf72e8db..8fa856e6a03f 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> @@ -374,7 +374,7 @@ int pqm_destroy_queue(struct
>> process_queue_manager *pqm, unsigned int qid)
>> /* destroy kernel queue (DIQ) */
>> dqm = pqn->kq->dev->dqm;
>> dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
>> - kernel_queue_uninit(pqn->kq);
>> + kernel_queue_uninit(pqn->kq, false);
>> }
>> if (pqn->q) {
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7Cfelix.kuehling%40amd.com%7C6df6fef2bf6a4208704e08d785652f27%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637124542109969638&sdata=V0mHmgjUSP%2BauYL3r6PGU7aqFTQz8NkKMcuA5vXSkUQ%3D&reserved=0
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 17+ messages in thread
* RE: [PATCH 3/4] drm/amdkfd: Improve HWS hang detection and handling
2019-12-20 8:30 ` [PATCH 3/4] drm/amdkfd: Improve HWS hang detection and handling Felix Kuehling
@ 2019-12-20 17:18 ` Zeng, Oak
2019-12-20 17:49 ` Felix Kuehling
0 siblings, 1 reply; 17+ messages in thread
From: Zeng, Oak @ 2019-12-20 17:18 UTC (permalink / raw)
To: Kuehling, Felix, amd-gfx
[AMD Official Use Only - Internal Distribution Only]
With this improvement, it is still possible that two reset be scheduled. There is a period of time after HWS hang and before kfd pre-reset is called, during which, if a thread already passed the is_hws_hang check but was scheduled out, then it will also schedule another reset. The whole sequence is:
Thread 1: call unmap_queues_cpsch, pass the is_hws_hang, scheduled out before sending unmap command to HWS
Thread 2: send unmap to HWS ->hang, schedule a reset
Thread1: before the reset worker thread is run(resetting is still false), thread1 continues, another reset is scheduled.
Regards,
Oak
-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Felix Kuehling
Sent: Friday, December 20, 2019 3:30 AM
To: amd-gfx@lists.freedesktop.org
Subject: [PATCH 3/4] drm/amdkfd: Improve HWS hang detection and handling
Move HWS hang detection into unmap_queues_cpsch to catch hangs in all cases. If this happens during a reset, don't schedule another reset because the reset already in progress is expected to take care of it.
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
drivers/gpu/drm/amd/amdkfd/kfd_device.c | 3 +++
.../drm/amd/amdkfd/kfd_device_queue_manager.c | 27 ++++++++++++++----- .../drm/amd/amdkfd/kfd_device_queue_manager.h | 2 ++
3 files changed, 26 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index c6b6901bbda3..2a9e40131735 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -728,6 +728,9 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd) {
if (!kfd->init_complete)
return 0;
+
+ kfd->dqm->ops.pre_reset(kfd->dqm);
+
kgd2kfd_suspend(kfd);
kfd_signal_reset_event(kfd);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 558c0ad81848..a7e9ec1b3ce3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -952,6 +952,13 @@ static int stop_nocpsch(struct device_queue_manager *dqm)
return 0;
}
+static void pre_reset(struct device_queue_manager *dqm) {
+ dqm_lock(dqm);
+ dqm->is_resetting = true;
+ dqm_unlock(dqm);
+}
+
static int allocate_sdma_queue(struct device_queue_manager *dqm,
struct queue *q)
{
@@ -1099,6 +1106,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
dqm_lock(dqm);
/* clear hang status when driver try to start the hw scheduler */
dqm->is_hws_hang = false;
+ dqm->is_resetting = false;
dqm->sched_running = true;
execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
dqm_unlock(dqm);
@@ -1351,8 +1359,17 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
/* should be timed out */
retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED,
queue_preemption_timeout_ms);
- if (retval)
+ if (retval) {
+ pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
+ dqm->is_hws_hang = true;
+ /* It's possible we're detecting a HWS hang in the
+ * middle of a GPU reset. No need to schedule another
+ * reset in this case.
+ */
+ if (!dqm->is_resetting)
+ schedule_work(&dqm->hw_exception_work);
return retval;
+ }
pm_release_ib(&dqm->packets);
dqm->active_runlist = false;
@@ -1370,12 +1387,8 @@ static int execute_queues_cpsch(struct device_queue_manager *dqm,
if (dqm->is_hws_hang)
return -EIO;
retval = unmap_queues_cpsch(dqm, filter, filter_param);
- if (retval) {
- pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
- dqm->is_hws_hang = true;
- schedule_work(&dqm->hw_exception_work);
+ if (retval)
return retval;
- }
return map_queues_cpsch(dqm);
}
@@ -1769,6 +1782,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
dqm->ops.initialize = initialize_cpsch;
dqm->ops.start = start_cpsch;
dqm->ops.stop = stop_cpsch;
+ dqm->ops.pre_reset = pre_reset;
dqm->ops.destroy_queue = destroy_queue_cpsch;
dqm->ops.update_queue = update_queue;
dqm->ops.register_process = register_process; @@ -1787,6 +1801,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
/* initialize dqm for no cp scheduling */
dqm->ops.start = start_nocpsch;
dqm->ops.stop = stop_nocpsch;
+ dqm->ops.pre_reset = pre_reset;
dqm->ops.create_queue = create_queue_nocpsch;
dqm->ops.destroy_queue = destroy_queue_nocpsch;
dqm->ops.update_queue = update_queue; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 8991120c4fa2..871d3b628d2d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -104,6 +104,7 @@ struct device_queue_manager_ops {
int (*initialize)(struct device_queue_manager *dqm);
int (*start)(struct device_queue_manager *dqm);
int (*stop)(struct device_queue_manager *dqm);
+ void (*pre_reset)(struct device_queue_manager *dqm);
void (*uninitialize)(struct device_queue_manager *dqm);
int (*create_kernel_queue)(struct device_queue_manager *dqm,
struct kernel_queue *kq,
@@ -198,6 +199,7 @@ struct device_queue_manager {
/* hw exception */
bool is_hws_hang;
+ bool is_resetting;
struct work_struct hw_exception_work;
struct kfd_mem_obj hiq_sdma_mqd;
bool sched_running;
--
2.24.1
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7Coak.zeng%40amd.com%7Ca59ace5396cb46ed384708d78526df99%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637124274434007022&sdata=g%2B57MBWpTbFzbchp6%2Bi2dfmUzYXlsf77InUj3R1XaaY%3D&reserved=0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply related [flat|nested] 17+ messages in thread
* RE: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch
2019-12-20 8:30 ` [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch Felix Kuehling
2019-12-20 10:03 ` Deng, Emily
2019-12-20 15:56 ` shaoyunl
@ 2019-12-20 17:22 ` Zeng, Oak
2019-12-20 17:27 ` Felix Kuehling
2 siblings, 1 reply; 17+ messages in thread
From: Zeng, Oak @ 2019-12-20 17:22 UTC (permalink / raw)
To: Kuehling, Felix, amd-gfx
[AMD Official Use Only - Internal Distribution Only]
Regards,
Oak
-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Felix Kuehling
Sent: Friday, December 20, 2019 3:30 AM
To: amd-gfx@lists.freedesktop.org
Subject: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch
Don't use the HWS if it's known to be hanging. In a reset also don't try to destroy the HIQ because that may hang on SRIOV if the KIQ is unresponsive.
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
.../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 12 ++++++++----
drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 8 ++++----
drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 4 ++--
drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 4 ++--
.../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 2 +-
5 files changed, 17 insertions(+), 13 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index a7e9ec1b3ce3..d7eb6ac37f62 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -946,7 +946,7 @@ static int start_nocpsch(struct device_queue_manager *dqm) static int stop_nocpsch(struct device_queue_manager *dqm) {
if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
- pm_uninit(&dqm->packets);
+ pm_uninit(&dqm->packets, false);
dqm->sched_running = false;
return 0;
@@ -1114,20 +1114,24 @@ static int start_cpsch(struct device_queue_manager *dqm)
return 0;
fail_allocate_vidmem:
fail_set_sched_resources:
- pm_uninit(&dqm->packets);
+ pm_uninit(&dqm->packets, false);
fail_packet_manager_init:
return retval;
}
static int stop_cpsch(struct device_queue_manager *dqm) {
+ bool hanging;
+
dqm_lock(dqm);
- unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
+ if (!dqm->is_hws_hang)
+ unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
+ hanging = dqm->is_hws_hang || dqm->is_resetting;
[Oak] I don't think dqm->is_resetting is necessary. If is_resetting is true, is_hws_hang is always true. Those two flags are always the same except a period during which hws hang is detected but kfd_pre_reset is not called. In this period, hang is true but resetting is false, so "||resetting" doesn't help.
Also see my comment on the 3rd patch.
dqm->sched_running = false;
dqm_unlock(dqm);
kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
- pm_uninit(&dqm->packets);
+ pm_uninit(&dqm->packets, hanging);
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
index 2d56dc534459..bae706462f96 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
@@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue *kq, struct kfd_dev *dev, }
/* Uninitialize a kernel queue and free all its memory usages. */ -static void kq_uninitialize(struct kernel_queue *kq)
+static void kq_uninitialize(struct kernel_queue *kq, bool hanging)
{
- if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
+ if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !hanging)
kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
kq->queue->mqd,
KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
@@ -337,9 +337,9 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
return NULL;
}
-void kernel_queue_uninit(struct kernel_queue *kq)
+void kernel_queue_uninit(struct kernel_queue *kq, bool hanging)
{
- kq_uninitialize(kq);
+ kq_uninitialize(kq, hanging);
kfree(kq);
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
index 6cabed06ef5d..dc406e6dee23 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
@@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
return 0;
}
-void pm_uninit(struct packet_manager *pm)
+void pm_uninit(struct packet_manager *pm, bool hanging)
{
mutex_destroy(&pm->lock);
- kernel_queue_uninit(pm->priv_queue);
+ kernel_queue_uninit(pm->priv_queue, hanging);
}
int pm_send_set_resources(struct packet_manager *pm, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 087e96838997..8ac680dc90f1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -883,7 +883,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev); void device_queue_manager_uninit(struct device_queue_manager *dqm); struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
enum kfd_queue_type type);
-void kernel_queue_uninit(struct kernel_queue *kq);
+void kernel_queue_uninit(struct kernel_queue *kq, bool hanging);
int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid);
/* Process Queue Manager */
@@ -974,7 +974,7 @@ extern const struct packet_manager_funcs kfd_vi_pm_funcs; extern const struct packet_manager_funcs kfd_v9_pm_funcs;
int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm); -void pm_uninit(struct packet_manager *pm);
+void pm_uninit(struct packet_manager *pm, bool hanging);
int pm_send_set_resources(struct packet_manager *pm,
struct scheduling_resources *res);
int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index d3eacf72e8db..8fa856e6a03f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -374,7 +374,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
/* destroy kernel queue (DIQ) */
dqm = pqn->kq->dev->dqm;
dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
- kernel_queue_uninit(pqn->kq);
+ kernel_queue_uninit(pqn->kq, false);
}
if (pqn->q) {
--
2.24.1
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7Coak.zeng%40amd.com%7C7602eef96b0545baac8608d78526dde0%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637124274407587881&sdata=BlPgWgLi%2FrtkcPQLu%2FbK0dOrvg6qm4IsGVfuoUo%2B%2B1g%3D&reserved=0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply related [flat|nested] 17+ messages in thread
* Re: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch
2019-12-20 17:22 ` Zeng, Oak
@ 2019-12-20 17:27 ` Felix Kuehling
2019-12-20 17:29 ` Zeng, Oak
0 siblings, 1 reply; 17+ messages in thread
From: Felix Kuehling @ 2019-12-20 17:27 UTC (permalink / raw)
To: Zeng, Oak, amd-gfx
On 2019-12-20 12:22, Zeng, Oak wrote:
> [AMD Official Use Only - Internal Distribution Only]
>
>
>
> Regards,
> Oak
>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Felix Kuehling
> Sent: Friday, December 20, 2019 3:30 AM
> To: amd-gfx@lists.freedesktop.org
> Subject: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch
>
> Don't use the HWS if it's known to be hanging. In a reset also don't try to destroy the HIQ because that may hang on SRIOV if the KIQ is unresponsive.
>
> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
> ---
> .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 12 ++++++++----
> drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 8 ++++----
> drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 4 ++--
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 4 ++--
> .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 2 +-
> 5 files changed, 17 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index a7e9ec1b3ce3..d7eb6ac37f62 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -946,7 +946,7 @@ static int start_nocpsch(struct device_queue_manager *dqm) static int stop_nocpsch(struct device_queue_manager *dqm) {
> if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
> - pm_uninit(&dqm->packets);
> + pm_uninit(&dqm->packets, false);
> dqm->sched_running = false;
>
> return 0;
> @@ -1114,20 +1114,24 @@ static int start_cpsch(struct device_queue_manager *dqm)
> return 0;
> fail_allocate_vidmem:
> fail_set_sched_resources:
> - pm_uninit(&dqm->packets);
> + pm_uninit(&dqm->packets, false);
> fail_packet_manager_init:
> return retval;
> }
>
> static int stop_cpsch(struct device_queue_manager *dqm) {
> + bool hanging;
> +
> dqm_lock(dqm);
> - unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
> + if (!dqm->is_hws_hang)
> + unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
> + hanging = dqm->is_hws_hang || dqm->is_resetting;
> [Oak] I don't think dqm->is_resetting is necessary. If is_resetting is true, is_hws_hang is always true. Those two flags are always the same except a period during which hws hang is detected but kfd_pre_reset is not called. In this period, hang is true but resetting is false, so "||resetting" doesn't help.
This is not necessarily true. A GPU reset can be caused by amdgpu for
example when the graphics engine is hanging. In that case HWS isn't
necessarily hanging. I added "|| resetting" here to avoid touching
hardware in an unknown state in pm_uninit=>kq_uninitialize in this case.
Regards,
Felix
>
> Also see my comment on the 3rd patch.
>
> dqm->sched_running = false;
> dqm_unlock(dqm);
>
> kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
> - pm_uninit(&dqm->packets);
> + pm_uninit(&dqm->packets, hanging);
>
> return 0;
> }
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> index 2d56dc534459..bae706462f96 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> @@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue *kq, struct kfd_dev *dev, }
>
> /* Uninitialize a kernel queue and free all its memory usages. */ -static void kq_uninitialize(struct kernel_queue *kq)
> +static void kq_uninitialize(struct kernel_queue *kq, bool hanging)
> {
> - if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
> + if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !hanging)
> kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
> kq->queue->mqd,
> KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
> @@ -337,9 +337,9 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
> return NULL;
> }
>
> -void kernel_queue_uninit(struct kernel_queue *kq)
> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging)
> {
> - kq_uninitialize(kq);
> + kq_uninitialize(kq, hanging);
> kfree(kq);
> }
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> index 6cabed06ef5d..dc406e6dee23 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> @@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
> return 0;
> }
>
> -void pm_uninit(struct packet_manager *pm)
> +void pm_uninit(struct packet_manager *pm, bool hanging)
> {
> mutex_destroy(&pm->lock);
> - kernel_queue_uninit(pm->priv_queue);
> + kernel_queue_uninit(pm->priv_queue, hanging);
> }
>
> int pm_send_set_resources(struct packet_manager *pm, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 087e96838997..8ac680dc90f1 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -883,7 +883,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev); void device_queue_manager_uninit(struct device_queue_manager *dqm); struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
> enum kfd_queue_type type);
> -void kernel_queue_uninit(struct kernel_queue *kq);
> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging);
> int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid);
>
> /* Process Queue Manager */
> @@ -974,7 +974,7 @@ extern const struct packet_manager_funcs kfd_vi_pm_funcs; extern const struct packet_manager_funcs kfd_v9_pm_funcs;
>
> int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm); -void pm_uninit(struct packet_manager *pm);
> +void pm_uninit(struct packet_manager *pm, bool hanging);
> int pm_send_set_resources(struct packet_manager *pm,
> struct scheduling_resources *res);
> int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index d3eacf72e8db..8fa856e6a03f 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -374,7 +374,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
> /* destroy kernel queue (DIQ) */
> dqm = pqn->kq->dev->dqm;
> dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
> - kernel_queue_uninit(pqn->kq);
> + kernel_queue_uninit(pqn->kq, false);
> }
>
> if (pqn->q) {
> --
> 2.24.1
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7Coak.zeng%40amd.com%7C7602eef96b0545baac8608d78526dde0%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637124274407587881&sdata=BlPgWgLi%2FrtkcPQLu%2FbK0dOrvg6qm4IsGVfuoUo%2B%2B1g%3D&reserved=0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 17+ messages in thread
* RE: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch
2019-12-20 17:27 ` Felix Kuehling
@ 2019-12-20 17:29 ` Zeng, Oak
0 siblings, 0 replies; 17+ messages in thread
From: Zeng, Oak @ 2019-12-20 17:29 UTC (permalink / raw)
To: Kuehling, Felix, amd-gfx
[AMD Official Use Only - Internal Distribution Only]
I see. Thank you Felix for the explanation.
Regards,
Oak
-----Original Message-----
From: Kuehling, Felix <Felix.Kuehling@amd.com>
Sent: Friday, December 20, 2019 12:28 PM
To: Zeng, Oak <Oak.Zeng@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch
On 2019-12-20 12:22, Zeng, Oak wrote:
> [AMD Official Use Only - Internal Distribution Only]
>
>
>
> Regards,
> Oak
>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
> Felix Kuehling
> Sent: Friday, December 20, 2019 3:30 AM
> To: amd-gfx@lists.freedesktop.org
> Subject: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch
>
> Don't use the HWS if it's known to be hanging. In a reset also don't try to destroy the HIQ because that may hang on SRIOV if the KIQ is unresponsive.
>
> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
> ---
> .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 12 ++++++++----
> drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 8 ++++----
> drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 4 ++--
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 4 ++--
> .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 2 +-
> 5 files changed, 17 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index a7e9ec1b3ce3..d7eb6ac37f62 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -946,7 +946,7 @@ static int start_nocpsch(struct device_queue_manager *dqm) static int stop_nocpsch(struct device_queue_manager *dqm) {
> if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
> - pm_uninit(&dqm->packets);
> + pm_uninit(&dqm->packets, false);
> dqm->sched_running = false;
>
> return 0;
> @@ -1114,20 +1114,24 @@ static int start_cpsch(struct device_queue_manager *dqm)
> return 0;
> fail_allocate_vidmem:
> fail_set_sched_resources:
> - pm_uninit(&dqm->packets);
> + pm_uninit(&dqm->packets, false);
> fail_packet_manager_init:
> return retval;
> }
>
> static int stop_cpsch(struct device_queue_manager *dqm) {
> + bool hanging;
> +
> dqm_lock(dqm);
> - unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
> + if (!dqm->is_hws_hang)
> + unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
> + hanging = dqm->is_hws_hang || dqm->is_resetting;
> [Oak] I don't think dqm->is_resetting is necessary. If is_resetting is true, is_hws_hang is always true. Those two flags are always the same except a period during which hws hang is detected but kfd_pre_reset is not called. In this period, hang is true but resetting is false, so "||resetting" doesn't help.
This is not necessarily true. A GPU reset can be caused by amdgpu for example when the graphics engine is hanging. In that case HWS isn't necessarily hanging. I added "|| resetting" here to avoid touching hardware in an unknown state in pm_uninit=>kq_uninitialize in this case.
Regards,
Felix
>
> Also see my comment on the 3rd patch.
>
> dqm->sched_running = false;
> dqm_unlock(dqm);
>
> kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
> - pm_uninit(&dqm->packets);
> + pm_uninit(&dqm->packets, hanging);
>
> return 0;
> }
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> index 2d56dc534459..bae706462f96 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> @@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue *kq,
> struct kfd_dev *dev, }
>
> /* Uninitialize a kernel queue and free all its memory usages. */
> -static void kq_uninitialize(struct kernel_queue *kq)
> +static void kq_uninitialize(struct kernel_queue *kq, bool hanging)
> {
> - if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
> + if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !hanging)
> kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
> kq->queue->mqd,
> KFD_PREEMPT_TYPE_WAVEFRONT_RESET, @@ -337,9 +337,9 @@ struct
> kernel_queue *kernel_queue_init(struct kfd_dev *dev,
> return NULL;
> }
>
> -void kernel_queue_uninit(struct kernel_queue *kq)
> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging)
> {
> - kq_uninitialize(kq);
> + kq_uninitialize(kq, hanging);
> kfree(kq);
> }
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> index 6cabed06ef5d..dc406e6dee23 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> @@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
> return 0;
> }
>
> -void pm_uninit(struct packet_manager *pm)
> +void pm_uninit(struct packet_manager *pm, bool hanging)
> {
> mutex_destroy(&pm->lock);
> - kernel_queue_uninit(pm->priv_queue);
> + kernel_queue_uninit(pm->priv_queue, hanging);
> }
>
> int pm_send_set_resources(struct packet_manager *pm, diff --git
> a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 087e96838997..8ac680dc90f1 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -883,7 +883,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev); void device_queue_manager_uninit(struct device_queue_manager *dqm); struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
> enum kfd_queue_type type);
> -void kernel_queue_uninit(struct kernel_queue *kq);
> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging);
> int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned
> int pasid);
>
> /* Process Queue Manager */
> @@ -974,7 +974,7 @@ extern const struct packet_manager_funcs
> kfd_vi_pm_funcs; extern const struct packet_manager_funcs
> kfd_v9_pm_funcs;
>
> int pm_init(struct packet_manager *pm, struct device_queue_manager
> *dqm); -void pm_uninit(struct packet_manager *pm);
> +void pm_uninit(struct packet_manager *pm, bool hanging);
> int pm_send_set_resources(struct packet_manager *pm,
> struct scheduling_resources *res);
> int pm_send_runlist(struct packet_manager *pm, struct list_head
> *dqm_queues); diff --git
> a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index d3eacf72e8db..8fa856e6a03f 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -374,7 +374,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
> /* destroy kernel queue (DIQ) */
> dqm = pqn->kq->dev->dqm;
> dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
> - kernel_queue_uninit(pqn->kq);
> + kernel_queue_uninit(pqn->kq, false);
> }
>
> if (pqn->q) {
> --
> 2.24.1
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flist
> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7Coa
> k.zeng%40amd.com%7C7602eef96b0545baac8608d78526dde0%7C3dd8961fe4884e60
> 8e11a82d994e183d%7C0%7C0%7C637124274407587881&sdata=BlPgWgLi%2Frtk
> cPQLu%2FbK0dOrvg6qm4IsGVfuoUo%2B%2B1g%3D&reserved=0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH 3/4] drm/amdkfd: Improve HWS hang detection and handling
2019-12-20 17:18 ` Zeng, Oak
@ 2019-12-20 17:49 ` Felix Kuehling
0 siblings, 0 replies; 17+ messages in thread
From: Felix Kuehling @ 2019-12-20 17:49 UTC (permalink / raw)
To: Zeng, Oak, amd-gfx
On 2019-12-20 12:18, Zeng, Oak wrote:
> [AMD Official Use Only - Internal Distribution Only]
>
> With this improvement, it is still possible that two reset be scheduled. There is a period of time after HWS hang and before kfd pre-reset is called, during which, if a thread already passed the is_hws_hang check but was scheduled out, then it will also schedule another reset. The whole sequence is:
>
> Thread 1: call unmap_queues_cpsch, pass the is_hws_hang, scheduled out before sending unmap command to HWS
> Thread 2: send unmap to HWS ->hang, schedule a reset
Thread1: before the reset worker thread is run (resetting is still false), thread1 continues, another reset is scheduled.
Rescheduling the reset worker "before the reset worker thread is run"
results in the reset worker only running once. The work item can't be on
the queue twice at the same time. The more interesting case is if the
reset worker is already running but hasn't called
amdgpu_amdkfd_pre_reset yet. In that case we may end up scheduling a
second reset. I can't think of a good way to prevent this race.
It gets more confusing when you consider that GPU resets can be
triggered outside of KFD. So a reset can start outside a KFD reset
worker thread and KFD can schedule another reset. I think the only place
to really prevent this type of race would be in
amdgpu_device_should_recover_gpu with some kind of reset decision flag
protected by a lock.
I could also try to get rid of the worker thread for GPU resets in KFD.
I think we created the worker to avoid locking issues, but there may be
better ways to do this.
Regards,
Felix
>
>
> Regards,
> Oak
>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Felix Kuehling
> Sent: Friday, December 20, 2019 3:30 AM
> To: amd-gfx@lists.freedesktop.org
> Subject: [PATCH 3/4] drm/amdkfd: Improve HWS hang detection and handling
>
> Move HWS hang detection into unmap_queues_cpsch to catch hangs in all cases. If this happens during a reset, don't schedule another reset because the reset already in progress is expected to take care of it.
>
> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
> ---
> drivers/gpu/drm/amd/amdkfd/kfd_device.c | 3 +++
> .../drm/amd/amdkfd/kfd_device_queue_manager.c | 27 ++++++++++++++----- .../drm/amd/amdkfd/kfd_device_queue_manager.h | 2 ++
> 3 files changed, 26 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index c6b6901bbda3..2a9e40131735 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -728,6 +728,9 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd) {
> if (!kfd->init_complete)
> return 0;
> +
> + kfd->dqm->ops.pre_reset(kfd->dqm);
> +
> kgd2kfd_suspend(kfd);
>
> kfd_signal_reset_event(kfd);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 558c0ad81848..a7e9ec1b3ce3 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -952,6 +952,13 @@ static int stop_nocpsch(struct device_queue_manager *dqm)
> return 0;
> }
>
> +static void pre_reset(struct device_queue_manager *dqm) {
> + dqm_lock(dqm);
> + dqm->is_resetting = true;
> + dqm_unlock(dqm);
> +}
> +
> static int allocate_sdma_queue(struct device_queue_manager *dqm,
> struct queue *q)
> {
> @@ -1099,6 +1106,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
> dqm_lock(dqm);
> /* clear hang status when driver try to start the hw scheduler */
> dqm->is_hws_hang = false;
> + dqm->is_resetting = false;
> dqm->sched_running = true;
> execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
> dqm_unlock(dqm);
> @@ -1351,8 +1359,17 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
> /* should be timed out */
> retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED,
> queue_preemption_timeout_ms);
> - if (retval)
> + if (retval) {
> + pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
> + dqm->is_hws_hang = true;
> + /* It's possible we're detecting a HWS hang in the
> + * middle of a GPU reset. No need to schedule another
> + * reset in this case.
> + */
> + if (!dqm->is_resetting)
> + schedule_work(&dqm->hw_exception_work);
> return retval;
> + }
>
> pm_release_ib(&dqm->packets);
> dqm->active_runlist = false;
> @@ -1370,12 +1387,8 @@ static int execute_queues_cpsch(struct device_queue_manager *dqm,
> if (dqm->is_hws_hang)
> return -EIO;
> retval = unmap_queues_cpsch(dqm, filter, filter_param);
> - if (retval) {
> - pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
> - dqm->is_hws_hang = true;
> - schedule_work(&dqm->hw_exception_work);
> + if (retval)
> return retval;
> - }
>
> return map_queues_cpsch(dqm);
> }
> @@ -1769,6 +1782,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
> dqm->ops.initialize = initialize_cpsch;
> dqm->ops.start = start_cpsch;
> dqm->ops.stop = stop_cpsch;
> + dqm->ops.pre_reset = pre_reset;
> dqm->ops.destroy_queue = destroy_queue_cpsch;
> dqm->ops.update_queue = update_queue;
> dqm->ops.register_process = register_process; @@ -1787,6 +1801,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
> /* initialize dqm for no cp scheduling */
> dqm->ops.start = start_nocpsch;
> dqm->ops.stop = stop_nocpsch;
> + dqm->ops.pre_reset = pre_reset;
> dqm->ops.create_queue = create_queue_nocpsch;
> dqm->ops.destroy_queue = destroy_queue_nocpsch;
> dqm->ops.update_queue = update_queue; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> index 8991120c4fa2..871d3b628d2d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> @@ -104,6 +104,7 @@ struct device_queue_manager_ops {
> int (*initialize)(struct device_queue_manager *dqm);
> int (*start)(struct device_queue_manager *dqm);
> int (*stop)(struct device_queue_manager *dqm);
> + void (*pre_reset)(struct device_queue_manager *dqm);
> void (*uninitialize)(struct device_queue_manager *dqm);
> int (*create_kernel_queue)(struct device_queue_manager *dqm,
> struct kernel_queue *kq,
> @@ -198,6 +199,7 @@ struct device_queue_manager {
>
> /* hw exception */
> bool is_hws_hang;
> + bool is_resetting;
> struct work_struct hw_exception_work;
> struct kfd_mem_obj hiq_sdma_mqd;
> bool sched_running;
> --
> 2.24.1
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7Coak.zeng%40amd.com%7Ca59ace5396cb46ed384708d78526df99%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637124274434007022&sdata=g%2B57MBWpTbFzbchp6%2Bi2dfmUzYXlsf77InUj3R1XaaY%3D&reserved=0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch
2019-12-20 16:33 ` Felix Kuehling
@ 2019-12-20 19:31 ` shaoyunl
2019-12-20 19:46 ` Felix Kuehling
0 siblings, 1 reply; 17+ messages in thread
From: shaoyunl @ 2019-12-20 19:31 UTC (permalink / raw)
To: Felix Kuehling, amd-gfx
Can we use the dqm_lock when we try to get the dqm->is_hw_hang and
dqm->is_resetting inside function kq_uninitialize ?
I think more closer we check the status to hqd_destroy it will be more
accurate . It does look better with this logic if the status are changed
after dqm unmap_queue call and before we call hqd_destroy .
Another comment in line.
Regards
shaoyun.liu
On 2019-12-20 11:33 a.m., Felix Kuehling wrote:
> dqm->is_hws_hang is protected by the DQM lock. kq_uninitialize runs
> outside that lock protection. Therefore I opted to pass in the hanging
> flag as a parameter. It also keeps the logic that decides all of that
> inside the device queue manager, which I think is cleaner.
>
> I was trying to clean this up further by moving the pm_init/pm_uninit
> out of the start_cpsch/stop_cpsch sequence, but gave up on that idea
> when I found out that I can't create the kernel queue in the DQM
> initialize function because dev->dqm isn't initialized at that time yet.
>
> Regards,
> Felix
>
> On 2019-12-20 10:56, shaoyunl wrote:
>> Looks like patch 2 is not related to this serial , but anyway .
>>
>> Patch 1,2,3 are reviewed by shaoyunl <shaoyun.liu@amd.com>
>>
>> For patch 4 , is it possible we directly check dqm->is_hws_hang ||
>> dqm->is_resetting inside function kq_uninitialize. so we don't need
>> other interface change .
>>
>> I think even Inside that kq_uninitialize function , we still can get
>> dqm as kq->dev->dqm .
>>
>>
>> shaoyun.liu
>>
>>
>> On 2019-12-20 3:30 a.m., Felix Kuehling wrote:
>>> Don't use the HWS if it's known to be hanging. In a reset also
>>> don't try to destroy the HIQ because that may hang on SRIOV if the
>>> KIQ is unresponsive.
>>>
>>> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>> ---
>>> .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 12
>>> ++++++++----
>>> drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 8 ++++----
>>> drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 4 ++--
>>> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 4 ++--
>>> .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 2 +-
>>> 5 files changed, 17 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> index a7e9ec1b3ce3..d7eb6ac37f62 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> @@ -946,7 +946,7 @@ static int start_nocpsch(struct
>>> device_queue_manager *dqm)
>>> static int stop_nocpsch(struct device_queue_manager *dqm)
>>> {
>>> if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>>> - pm_uninit(&dqm->packets);
>>> + pm_uninit(&dqm->packets, false);
>>> dqm->sched_running = false;
>>> return 0;
>>> @@ -1114,20 +1114,24 @@ static int start_cpsch(struct
>>> device_queue_manager *dqm)
>>> return 0;
>>> fail_allocate_vidmem:
>>> fail_set_sched_resources:
>>> - pm_uninit(&dqm->packets);
>>> + pm_uninit(&dqm->packets, false);
>>> fail_packet_manager_init:
>>> return retval;
>>> }
>>> static int stop_cpsch(struct device_queue_manager *dqm)
>>> {
>>> + bool hanging;
>>> +
>>>
>>> dqm_lock(dqm);
>>> - unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>>> + if (!dqm->is_hws_hang)
[shaoyunl] should we check is_resetting here as well . so we ignore
the unmap call even HWS still not detect the hang but we know we
currently in resetting procedure
>>> + unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES,
>>> 0);
>>> + hanging = dqm->is_hws_hang || dqm->is_resetting;
>>> dqm->sched_running = false;
>>> dqm_unlock(dqm);
>>> kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
>>> - pm_uninit(&dqm->packets);
>>> + pm_uninit(&dqm->packets, hanging);
>>> return 0;
>>> }
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>>> index 2d56dc534459..bae706462f96 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>>> @@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue
>>> *kq, struct kfd_dev *dev,
>>> }
>>> /* Uninitialize a kernel queue and free all its memory usages. */
>>> -static void kq_uninitialize(struct kernel_queue *kq)
>>> +static void kq_uninitialize(struct kernel_queue *kq, bool hanging)
>>> {
>>> - if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
>>> + if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !hanging)
>>> kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
>>> kq->queue->mqd,
>>> KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
>>> @@ -337,9 +337,9 @@ struct kernel_queue *kernel_queue_init(struct
>>> kfd_dev *dev,
>>> return NULL;
>>> }
>>> -void kernel_queue_uninit(struct kernel_queue *kq)
>>> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging)
>>> {
>>> - kq_uninitialize(kq);
>>> + kq_uninitialize(kq, hanging);
>>> kfree(kq);
>>> }
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>>> index 6cabed06ef5d..dc406e6dee23 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>>> @@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm, struct
>>> device_queue_manager *dqm)
>>> return 0;
>>> }
>>> -void pm_uninit(struct packet_manager *pm)
>>> +void pm_uninit(struct packet_manager *pm, bool hanging)
>>> {
>>> mutex_destroy(&pm->lock);
>>> - kernel_queue_uninit(pm->priv_queue);
>>> + kernel_queue_uninit(pm->priv_queue, hanging);
>>> }
>>> int pm_send_set_resources(struct packet_manager *pm,
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> index 087e96838997..8ac680dc90f1 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> @@ -883,7 +883,7 @@ struct device_queue_manager
>>> *device_queue_manager_init(struct kfd_dev *dev);
>>> void device_queue_manager_uninit(struct device_queue_manager *dqm);
>>> struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
>>> enum kfd_queue_type type);
>>> -void kernel_queue_uninit(struct kernel_queue *kq);
>>> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging);
>>> int kfd_process_vm_fault(struct device_queue_manager *dqm,
>>> unsigned int pasid);
>>> /* Process Queue Manager */
>>> @@ -974,7 +974,7 @@ extern const struct packet_manager_funcs
>>> kfd_vi_pm_funcs;
>>> extern const struct packet_manager_funcs kfd_v9_pm_funcs;
>>> int pm_init(struct packet_manager *pm, struct
>>> device_queue_manager *dqm);
>>> -void pm_uninit(struct packet_manager *pm);
>>> +void pm_uninit(struct packet_manager *pm, bool hanging);
>>> int pm_send_set_resources(struct packet_manager *pm,
>>> struct scheduling_resources *res);
>>> int pm_send_runlist(struct packet_manager *pm, struct list_head
>>> *dqm_queues);
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> index d3eacf72e8db..8fa856e6a03f 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> @@ -374,7 +374,7 @@ int pqm_destroy_queue(struct
>>> process_queue_manager *pqm, unsigned int qid)
>>> /* destroy kernel queue (DIQ) */
>>> dqm = pqn->kq->dev->dqm;
>>> dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
>>> - kernel_queue_uninit(pqn->kq);
>>> + kernel_queue_uninit(pqn->kq, false);
>>> }
>>> if (pqn->q) {
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7Cfelix.kuehling%40amd.com%7C6df6fef2bf6a4208704e08d785652f27%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637124542109969638&sdata=V0mHmgjUSP%2BauYL3r6PGU7aqFTQz8NkKMcuA5vXSkUQ%3D&reserved=0
>>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch
2019-12-20 19:31 ` shaoyunl
@ 2019-12-20 19:46 ` Felix Kuehling
2019-12-20 22:09 ` shaoyunl
0 siblings, 1 reply; 17+ messages in thread
From: Felix Kuehling @ 2019-12-20 19:46 UTC (permalink / raw)
To: shaoyunl, amd-gfx
On 2019-12-20 14:31, shaoyunl wrote:
> Can we use the dqm_lock when we try to get the dqm->is_hw_hang and
> dqm->is_resetting inside function kq_uninitialize ?
Spreading the DQM lock around is probably not a good idea. Then I'd
rather do more refactoring to move hqd_load and hqd_destroy out of the
kq_init/kq_uninit functions.
>
> I think more closer we check the status to hqd_destroy it will be
> more accurate . It does look better with this logic if the status are
> changed after dqm unmap_queue call and before we call hqd_destroy .
>
> Another comment in line.
>
> Regards
>
> shaoyun.liu
>
>
>
>
> On 2019-12-20 11:33 a.m., Felix Kuehling wrote:
>> dqm->is_hws_hang is protected by the DQM lock. kq_uninitialize runs
>> outside that lock protection. Therefore I opted to pass in the
>> hanging flag as a parameter. It also keeps the logic that decides all
>> of that inside the device queue manager, which I think is cleaner.
>>
>> I was trying to clean this up further by moving the pm_init/pm_uninit
>> out of the start_cpsch/stop_cpsch sequence, but gave up on that idea
>> when I found out that I can't create the kernel queue in the DQM
>> initialize function because dev->dqm isn't initialized at that time yet.
>>
>> Regards,
>> Felix
>>
>> On 2019-12-20 10:56, shaoyunl wrote:
>>> Looks like patch 2 is not related to this serial , but anyway .
>>>
>>> Patch 1,2,3 are reviewed by shaoyunl <shaoyun.liu@amd.com>
>>>
>>> For patch 4 , is it possible we directly check dqm->is_hws_hang ||
>>> dqm->is_resetting inside function kq_uninitialize. so we don't
>>> need other interface change .
>>>
>>> I think even Inside that kq_uninitialize function , we still can get
>>> dqm as kq->dev->dqm .
>>>
>>>
>>> shaoyun.liu
>>>
>>>
>>> On 2019-12-20 3:30 a.m., Felix Kuehling wrote:
>>>> Don't use the HWS if it's known to be hanging. In a reset also
>>>> don't try to destroy the HIQ because that may hang on SRIOV if the
>>>> KIQ is unresponsive.
>>>>
>>>> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>>> ---
>>>> .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 12
>>>> ++++++++----
>>>> drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 8 ++++----
>>>> drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 4 ++--
>>>> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 4 ++--
>>>> .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 2 +-
>>>> 5 files changed, 17 insertions(+), 13 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>> index a7e9ec1b3ce3..d7eb6ac37f62 100644
>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>> @@ -946,7 +946,7 @@ static int start_nocpsch(struct
>>>> device_queue_manager *dqm)
>>>> static int stop_nocpsch(struct device_queue_manager *dqm)
>>>> {
>>>> if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>>>> - pm_uninit(&dqm->packets);
>>>> + pm_uninit(&dqm->packets, false);
>>>> dqm->sched_running = false;
>>>> return 0;
>>>> @@ -1114,20 +1114,24 @@ static int start_cpsch(struct
>>>> device_queue_manager *dqm)
>>>> return 0;
>>>> fail_allocate_vidmem:
>>>> fail_set_sched_resources:
>>>> - pm_uninit(&dqm->packets);
>>>> + pm_uninit(&dqm->packets, false);
>>>> fail_packet_manager_init:
>>>> return retval;
>>>> }
>>>> static int stop_cpsch(struct device_queue_manager *dqm)
>>>> {
>>>> + bool hanging;
>>>> +
>>>>
>>>> dqm_lock(dqm);
>>>> - unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>>>> + if (!dqm->is_hws_hang)
> [shaoyunl] should we check is_resetting here as well . so we ignore
> the unmap call even HWS still not detect the hang but we know we
currently in resetting procedure
GPU reset can be done when the HWS is not hanging. In that case
unmapping queues is perfectly safe. In the worst case it'll time out and
dqm->is_hws_hang will be set as a result. I'm planning to add more
checks later so that we can optionally wait in unmap_queues until a
reset is done. We'll need that to do preemptions reliably while a GPU
reset is in progress. So I need to either unmap the queues or be sure
that HWS is hanging.
With yours and Oak's comments I realize, this is far from complete and
more work is needed. But I still think this is an improvement.
Regards,
Felix
>>>> + unmap_queues_cpsch(dqm,
>>>> KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>>>> + hanging = dqm->is_hws_hang || dqm->is_resetting;
>>>> dqm->sched_running = false;
>>>> dqm_unlock(dqm);
>>>> kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
>>>> - pm_uninit(&dqm->packets);
>>>> + pm_uninit(&dqm->packets, hanging);
>>>> return 0;
>>>> }
>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>>>> index 2d56dc534459..bae706462f96 100644
>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>>>> @@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue
>>>> *kq, struct kfd_dev *dev,
>>>> }
>>>> /* Uninitialize a kernel queue and free all its memory usages. */
>>>> -static void kq_uninitialize(struct kernel_queue *kq)
>>>> +static void kq_uninitialize(struct kernel_queue *kq, bool hanging)
>>>> {
>>>> - if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
>>>> + if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !hanging)
>>>> kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
>>>> kq->queue->mqd,
>>>> KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
>>>> @@ -337,9 +337,9 @@ struct kernel_queue *kernel_queue_init(struct
>>>> kfd_dev *dev,
>>>> return NULL;
>>>> }
>>>> -void kernel_queue_uninit(struct kernel_queue *kq)
>>>> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging)
>>>> {
>>>> - kq_uninitialize(kq);
>>>> + kq_uninitialize(kq, hanging);
>>>> kfree(kq);
>>>> }
>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>>>> index 6cabed06ef5d..dc406e6dee23 100644
>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>>>> @@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm, struct
>>>> device_queue_manager *dqm)
>>>> return 0;
>>>> }
>>>> -void pm_uninit(struct packet_manager *pm)
>>>> +void pm_uninit(struct packet_manager *pm, bool hanging)
>>>> {
>>>> mutex_destroy(&pm->lock);
>>>> - kernel_queue_uninit(pm->priv_queue);
>>>> + kernel_queue_uninit(pm->priv_queue, hanging);
>>>> }
>>>> int pm_send_set_resources(struct packet_manager *pm,
>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>> index 087e96838997..8ac680dc90f1 100644
>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>> @@ -883,7 +883,7 @@ struct device_queue_manager
>>>> *device_queue_manager_init(struct kfd_dev *dev);
>>>> void device_queue_manager_uninit(struct device_queue_manager *dqm);
>>>> struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
>>>> enum kfd_queue_type type);
>>>> -void kernel_queue_uninit(struct kernel_queue *kq);
>>>> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging);
>>>> int kfd_process_vm_fault(struct device_queue_manager *dqm,
>>>> unsigned int pasid);
>>>> /* Process Queue Manager */
>>>> @@ -974,7 +974,7 @@ extern const struct packet_manager_funcs
>>>> kfd_vi_pm_funcs;
>>>> extern const struct packet_manager_funcs kfd_v9_pm_funcs;
>>>> int pm_init(struct packet_manager *pm, struct
>>>> device_queue_manager *dqm);
>>>> -void pm_uninit(struct packet_manager *pm);
>>>> +void pm_uninit(struct packet_manager *pm, bool hanging);
>>>> int pm_send_set_resources(struct packet_manager *pm,
>>>> struct scheduling_resources *res);
>>>> int pm_send_runlist(struct packet_manager *pm, struct list_head
>>>> *dqm_queues);
>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>> index d3eacf72e8db..8fa856e6a03f 100644
>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>> @@ -374,7 +374,7 @@ int pqm_destroy_queue(struct
>>>> process_queue_manager *pqm, unsigned int qid)
>>>> /* destroy kernel queue (DIQ) */
>>>> dqm = pqn->kq->dev->dqm;
>>>> dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
>>>> - kernel_queue_uninit(pqn->kq);
>>>> + kernel_queue_uninit(pqn->kq, false);
>>>> }
>>>> if (pqn->q) {
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7Cfelix.kuehling%40amd.com%7C6df6fef2bf6a4208704e08d785652f27%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637124542109969638&sdata=V0mHmgjUSP%2BauYL3r6PGU7aqFTQz8NkKMcuA5vXSkUQ%3D&reserved=0
>>>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch
2019-12-20 19:46 ` Felix Kuehling
@ 2019-12-20 22:09 ` shaoyunl
0 siblings, 0 replies; 17+ messages in thread
From: shaoyunl @ 2019-12-20 22:09 UTC (permalink / raw)
To: Felix Kuehling, amd-gfx
I agree this patch is a big improvement , I think we need this patch so
SRIOV can put the amdkfd_pre_reset in right place as bare metal mode .
The further improvement can be in separate change .
This series is reviewed by shaoyun.liu <shaoyun.liu@amd.com>
Regards
shaoyun.liu
On 2019-12-20 2:46 p.m., Felix Kuehling wrote:
> On 2019-12-20 14:31, shaoyunl wrote:
>> Can we use the dqm_lock when we try to get the dqm->is_hw_hang and
>> dqm->is_resetting inside function kq_uninitialize ?
>
> Spreading the DQM lock around is probably not a good idea. Then I'd
> rather do more refactoring to move hqd_load and hqd_destroy out of the
> kq_init/kq_uninit functions.
>
>
>>
>> I think more closer we check the status to hqd_destroy it will be
>> more accurate . It does look better with this logic if the status are
>> changed after dqm unmap_queue call and before we call hqd_destroy .
>>
>> Another comment in line.
>>
>> Regards
>>
>> shaoyun.liu
>>
>>
>>
>>
>> On 2019-12-20 11:33 a.m., Felix Kuehling wrote:
>>> dqm->is_hws_hang is protected by the DQM lock. kq_uninitialize runs
>>> outside that lock protection. Therefore I opted to pass in the
>>> hanging flag as a parameter. It also keeps the logic that decides
>>> all of that inside the device queue manager, which I think is cleaner.
>>>
>>> I was trying to clean this up further by moving the
>>> pm_init/pm_uninit out of the start_cpsch/stop_cpsch sequence, but
>>> gave up on that idea when I found out that I can't create the kernel
>>> queue in the DQM initialize function because dev->dqm isn't
>>> initialized at that time yet.
>>>
>>> Regards,
>>> Felix
>>>
>>> On 2019-12-20 10:56, shaoyunl wrote:
>>>> Looks like patch 2 is not related to this series, but anyway.
>>>>
>>>> Patch 1,2,3 are reviewed by shaoyunl <shaoyun.liu@amd.com>
>>>>
>>>> For patch 4 , is it possible we directly check dqm->is_hws_hang ||
>>>> dqm->is_resetting inside function kq_uninitialize. so we don't
>>>> need other interface change .
>>>>
>>>> I think even Inside that kq_uninitialize function , we still can
>>>> get dqm as kq->dev->dqm .
>>>>
>>>>
>>>> shaoyun.liu
>>>>
>>>>
>>>> On 2019-12-20 3:30 a.m., Felix Kuehling wrote:
>>>>> Don't use the HWS if it's known to be hanging. In a reset also
>>>>> don't try to destroy the HIQ because that may hang on SRIOV if the
>>>>> KIQ is unresponsive.
>>>>>
>>>>> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>>>> ---
>>>>> .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 12
>>>>> ++++++++----
>>>>> drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 8 ++++----
>>>>> drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 4 ++--
>>>>> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 4 ++--
>>>>> .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 2 +-
>>>>> 5 files changed, 17 insertions(+), 13 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>> index a7e9ec1b3ce3..d7eb6ac37f62 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>>> @@ -946,7 +946,7 @@ static int start_nocpsch(struct
>>>>> device_queue_manager *dqm)
>>>>> static int stop_nocpsch(struct device_queue_manager *dqm)
>>>>> {
>>>>> if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>>>>> - pm_uninit(&dqm->packets);
>>>>> + pm_uninit(&dqm->packets, false);
>>>>> dqm->sched_running = false;
>>>>> return 0;
>>>>> @@ -1114,20 +1114,24 @@ static int start_cpsch(struct
>>>>> device_queue_manager *dqm)
>>>>> return 0;
>>>>> fail_allocate_vidmem:
>>>>> fail_set_sched_resources:
>>>>> - pm_uninit(&dqm->packets);
>>>>> + pm_uninit(&dqm->packets, false);
>>>>> fail_packet_manager_init:
>>>>> return retval;
>>>>> }
>>>>> static int stop_cpsch(struct device_queue_manager *dqm)
>>>>> {
>>>>> + bool hanging;
>>>>> +
>>>>>
>>>>> dqm_lock(dqm);
>>>>> - unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>>>>> + if (!dqm->is_hws_hang)
>> [shaoyunl] should we check is_resetting here as well, so we ignore
>> the unmap call even if HWS has not yet detected the hang but we know we
>> are currently in the resetting procedure?
>
> GPU reset can be done when the HWS is not hanging. In that case
> unmapping queues is perfectly safe. In the worst case it'll time out
> and dqm->is_hws_hang will be set as a result. I'm planning to add more
> checks later so that we can optionally wait in unmap_queues until a
> reset is done. We'll need that to do preemptions reliably while a GPU
> reset is in progress. So I need to either unmap the queues or be sure
> that HWS is hanging.
>
> With yours and Oak's comments I realize, this is far from complete and
> more work is needed. But I still think this is an improvement.
>
> Regards,
> Felix
>
>
>>>>> + unmap_queues_cpsch(dqm,
>>>>> KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>>>>> + hanging = dqm->is_hws_hang || dqm->is_resetting;
>>>>> dqm->sched_running = false;
>>>>> dqm_unlock(dqm);
>>>>> kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
>>>>> - pm_uninit(&dqm->packets);
>>>>> + pm_uninit(&dqm->packets, hanging);
>>>>> return 0;
>>>>> }
>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>>>>> index 2d56dc534459..bae706462f96 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>>>>> @@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue
>>>>> *kq, struct kfd_dev *dev,
>>>>> }
>>>>> /* Uninitialize a kernel queue and free all its memory usages. */
>>>>> -static void kq_uninitialize(struct kernel_queue *kq)
>>>>> +static void kq_uninitialize(struct kernel_queue *kq, bool hanging)
>>>>> {
>>>>> - if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
>>>>> + if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ &&
>>>>> !hanging)
>>>>> kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
>>>>> kq->queue->mqd,
>>>>> KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
>>>>> @@ -337,9 +337,9 @@ struct kernel_queue *kernel_queue_init(struct
>>>>> kfd_dev *dev,
>>>>> return NULL;
>>>>> }
>>>>> -void kernel_queue_uninit(struct kernel_queue *kq)
>>>>> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging)
>>>>> {
>>>>> - kq_uninitialize(kq);
>>>>> + kq_uninitialize(kq, hanging);
>>>>> kfree(kq);
>>>>> }
>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>>>>> index 6cabed06ef5d..dc406e6dee23 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>>>>> @@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm,
>>>>> struct device_queue_manager *dqm)
>>>>> return 0;
>>>>> }
>>>>> -void pm_uninit(struct packet_manager *pm)
>>>>> +void pm_uninit(struct packet_manager *pm, bool hanging)
>>>>> {
>>>>> mutex_destroy(&pm->lock);
>>>>> - kernel_queue_uninit(pm->priv_queue);
>>>>> + kernel_queue_uninit(pm->priv_queue, hanging);
>>>>> }
>>>>> int pm_send_set_resources(struct packet_manager *pm,
>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>> index 087e96838997..8ac680dc90f1 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>> @@ -883,7 +883,7 @@ struct device_queue_manager
>>>>> *device_queue_manager_init(struct kfd_dev *dev);
>>>>> void device_queue_manager_uninit(struct device_queue_manager *dqm);
>>>>> struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
>>>>> enum kfd_queue_type type);
>>>>> -void kernel_queue_uninit(struct kernel_queue *kq);
>>>>> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging);
>>>>> int kfd_process_vm_fault(struct device_queue_manager *dqm,
>>>>> unsigned int pasid);
>>>>> /* Process Queue Manager */
>>>>> @@ -974,7 +974,7 @@ extern const struct packet_manager_funcs
>>>>> kfd_vi_pm_funcs;
>>>>> extern const struct packet_manager_funcs kfd_v9_pm_funcs;
>>>>> int pm_init(struct packet_manager *pm, struct
>>>>> device_queue_manager *dqm);
>>>>> -void pm_uninit(struct packet_manager *pm);
>>>>> +void pm_uninit(struct packet_manager *pm, bool hanging);
>>>>> int pm_send_set_resources(struct packet_manager *pm,
>>>>> struct scheduling_resources *res);
>>>>> int pm_send_runlist(struct packet_manager *pm, struct list_head
>>>>> *dqm_queues);
>>>>> diff --git
>>>>> a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>>> index d3eacf72e8db..8fa856e6a03f 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>>> @@ -374,7 +374,7 @@ int pqm_destroy_queue(struct
>>>>> process_queue_manager *pqm, unsigned int qid)
>>>>> /* destroy kernel queue (DIQ) */
>>>>> dqm = pqn->kq->dev->dqm;
>>>>> dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
>>>>> - kernel_queue_uninit(pqn->kq);
>>>>> + kernel_queue_uninit(pqn->kq, false);
>>>>> }
>>>>> if (pqn->q) {
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7Cfelix.kuehling%40amd.com%7C6df6fef2bf6a4208704e08d785652f27%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637124542109969638&sdata=V0mHmgjUSP%2BauYL3r6PGU7aqFTQz8NkKMcuA5vXSkUQ%3D&reserved=0
>>>>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 17+ messages in thread
* RE: [PATCH 1/4] drm/amdkfd: Fix permissions of hang_hws
2019-12-20 8:29 [PATCH 1/4] drm/amdkfd: Fix permissions of hang_hws Felix Kuehling
` (2 preceding siblings ...)
2019-12-20 8:30 ` [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch Felix Kuehling
@ 2020-01-02 13:39 ` Russell, Kent
3 siblings, 0 replies; 17+ messages in thread
From: Russell, Kent @ 2020-01-02 13:39 UTC (permalink / raw)
To: Kuehling, Felix, amd-gfx
[AMD Official Use Only - Internal Distribution Only]
Reviewed-by: Kent Russell <kent.russell@amd.com>
-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Felix Kuehling
Sent: Friday, December 20, 2019 3:30 AM
To: amd-gfx@lists.freedesktop.org
Subject: [PATCH 1/4] drm/amdkfd: Fix permissions of hang_hws
Reading from /sys/kernel/debug/kfd/hang_hws would cause a kernel oops because we didn't implement a read callback. Set the permission to write-only to prevent that.
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
index 15c523027285..511712c2e382 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
@@ -93,7 +93,7 @@ void kfd_debugfs_init(void)
kfd_debugfs_hqds_by_device, &kfd_debugfs_fops);
debugfs_create_file("rls", S_IFREG | 0444, debugfs_root,
kfd_debugfs_rls_by_device, &kfd_debugfs_fops);
- debugfs_create_file("hang_hws", S_IFREG | 0644, debugfs_root,
+ debugfs_create_file("hang_hws", S_IFREG | 0200, debugfs_root,
NULL, &kfd_debugfs_hang_hws_fops); }
--
2.24.1
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7Ckent.russell%40amd.com%7C7dc16d27ccf646dd676608d78526df14%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637124274407198344&sdata=joT5cPhL9glNqV85jTIqvygQ%2B6CSOprnadM8LzaRBQs%3D&reserved=0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply related [flat|nested] 17+ messages in thread
* RE: [PATCH 2/4] drm/amdkfd: Remove unused variable
2019-12-20 8:29 ` [PATCH 2/4] drm/amdkfd: Remove unused variable Felix Kuehling
@ 2020-01-02 13:40 ` Russell, Kent
0 siblings, 0 replies; 17+ messages in thread
From: Russell, Kent @ 2020-01-02 13:40 UTC (permalink / raw)
To: Kuehling, Felix, amd-gfx
[AMD Official Use Only - Internal Distribution Only]
Reviewed-by: Kent Russell <kent.russell@amd.com>
-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Felix Kuehling
Sent: Friday, December 20, 2019 3:30 AM
To: amd-gfx@lists.freedesktop.org
Subject: [PATCH 2/4] drm/amdkfd: Remove unused variable
dqm->pipeline_mem wasn't used anywhere.
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 1 - drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | 1 -
2 files changed, 2 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index f7f6df40875e..558c0ad81848 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -930,7 +930,6 @@ static void uninitialize(struct device_queue_manager *dqm)
for (i = 0 ; i < KFD_MQD_TYPE_MAX ; i++)
kfree(dqm->mqd_mgrs[i]);
mutex_destroy(&dqm->lock_hidden);
- kfd_gtt_sa_free(dqm->dev, dqm->pipeline_mem);
}
static int start_nocpsch(struct device_queue_manager *dqm) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index a8c37e6da027..8991120c4fa2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -190,7 +190,6 @@ struct device_queue_manager {
/* the pasid mapping for each kfd vmid */
uint16_t vmid_pasid[VMID_NUM];
uint64_t pipelines_addr;
- struct kfd_mem_obj *pipeline_mem;
uint64_t fence_gpu_addr;
unsigned int *fence_addr;
struct kfd_mem_obj *fence_mem;
--
2.24.1
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7Ckent.russell%40amd.com%7C1c4037940973425f605808d78526de75%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637124274380882459&sdata=0SF%2F%2FKC%2BEOK%2FzFNDwQBltE5%2F9euhi7IrbKNPp8dlDrQ%3D&reserved=0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply related [flat|nested] 17+ messages in thread
end of thread, other threads:[~2020-01-02 13:54 UTC | newest]
Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-12-20 8:29 [PATCH 1/4] drm/amdkfd: Fix permissions of hang_hws Felix Kuehling
2019-12-20 8:29 ` [PATCH 2/4] drm/amdkfd: Remove unused variable Felix Kuehling
2020-01-02 13:40 ` Russell, Kent
2019-12-20 8:30 ` [PATCH 3/4] drm/amdkfd: Improve HWS hang detection and handling Felix Kuehling
2019-12-20 17:18 ` Zeng, Oak
2019-12-20 17:49 ` Felix Kuehling
2019-12-20 8:30 ` [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch Felix Kuehling
2019-12-20 10:03 ` Deng, Emily
2019-12-20 15:56 ` shaoyunl
2019-12-20 16:33 ` Felix Kuehling
2019-12-20 19:31 ` shaoyunl
2019-12-20 19:46 ` Felix Kuehling
2019-12-20 22:09 ` shaoyunl
2019-12-20 17:22 ` Zeng, Oak
2019-12-20 17:27 ` Felix Kuehling
2019-12-20 17:29 ` Zeng, Oak
2020-01-02 13:39 ` [PATCH 1/4] drm/amdkfd: Fix permissions of hang_hws Russell, Kent
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.