* [PATCH v2 1/2] drm/amdkfd: Fix double free when destroying a queue fails
@ 2021-06-17 12:02 xinhui pan
  2021-06-17 12:02 ` [PATCH v2 2/2] drm/amdkfd: Walk through list with dqm lock held xinhui pan
  2021-06-17 12:11 ` [PATCH v2 1/2] drm/amdkfd: Fix double free when destroying a queue fails Pan, Xinhui
  0 siblings, 2 replies; 6+ messages in thread
From: xinhui pan @ 2021-06-17 12:02 UTC (permalink / raw)
  To: amd-gfx; +Cc: alexander.deucher, Felix.Kuehling, xinhui pan, christian.koenig

Handle queue destroy failure while the CP is hung.
Once the CP hangs, kfd triggers a GPU reset and sets related flags to
stop the driver from touching the queue. As we leave the queue as it
is, we need to keep its resources as they are too.

Regardless of whether user space tries to destroy the queue again, we
need to put the queue back on the list so that process termination can
do the cleanup work. Moreover, if user space does try to destroy the
queue again, we will not free its resources twice.

Kfd returns -EIO in this case, so let's handle it now.

Some error logs observed without this patch are pasted below.

amdgpu: Can't create new usermode queue because -1 queues were already
created

refcount_t: underflow; use-after-free.
Call Trace:
 kobject_put+0xe6/0x1b0
 kfd_procfs_del_queue+0x37/0x50 [amdgpu]
 pqm_destroy_queue+0x17a/0x390 [amdgpu]
 kfd_ioctl_destroy_queue+0x57/0xc0 [amdgpu]
 kfd_ioctl+0x463/0x690 [amdgpu]

BUG kmalloc-32 (Tainted: G        W        ): Object already free
INFO: Allocated in allocate_sdma_mqd+0x30/0xb0 [amdgpu] age=4796 cpu=2
pid=2511
 __slab_alloc+0x72/0x80
 kmem_cache_alloc_trace+0x81f/0x8c0
 allocate_sdma_mqd+0x30/0xb0 [amdgpu]
 create_queue_cpsch+0xbf/0x470 [amdgpu]
 pqm_create_queue+0x28d/0x6d0 [amdgpu]
 kfd_ioctl_create_queue+0x492/0xae0 [amdgpu]
INFO: Freed in free_mqd_hiq_sdma+0x20/0x60 [amdgpu] age=2537 cpu=7
pid=2511
 kfree+0x322/0x340
 free_mqd_hiq_sdma+0x20/0x60 [amdgpu]
 destroy_queue_cpsch+0x20c/0x330 [amdgpu]
 pqm_destroy_queue+0x1a3/0x390 [amdgpu]
 kfd_ioctl_destroy_queue+0x57/0xc0 [amdgpu]

Signed-off-by: xinhui pan <xinhui.pan@amd.com>
---
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c   | 13 +++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_process.c            |  4 +++-
 .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c  |  2 ++
 3 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index c069fa259b30..63a9a19a3987 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1530,6 +1530,11 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 				KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
 		if (retval == -ETIME)
 			qpd->reset_wavefronts = true;
+		/* In gpu reset? We leave the queue as it is, so do NOT
+		 * cleanup the resource.
+		 */
+		else if (retval == -EIO)
+			goto failed_execute_queue;
 		if (q->properties.is_gws) {
 			dqm->gws_queue_count--;
 			qpd->mapped_gws_queue = false;
@@ -1551,6 +1556,14 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 
 	return retval;
 
+failed_execute_queue:
+	/* Put queue back to the list, then we have chance to destroy it.
+	 * FIXME: we do NOT want the queue in the runlist again.
+	 */
+	list_add(&q->list, &qpd->queues_list);
+	qpd->queue_count++;
+	if (q->properties.is_active)
+		increment_queue_count(dqm, q->properties.type);
 failed_try_destroy_debugged_queue:
 
 	dqm_unlock(dqm);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 09b98a83f670..984197e5929f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -607,11 +607,13 @@ static int kfd_procfs_add_sysfs_files(struct kfd_process *p)
 
 void kfd_procfs_del_queue(struct queue *q)
 {
-	if (!q)
+	if (!q || !kobject_get_unless_zero(&q->kobj))
 		return;
 
 	kobject_del(&q->kobj);
 	kobject_put(&q->kobj);
+	/* paired with the get above */
+	kobject_put(&q->kobj);
 }
 
 int kfd_process_create_wq(void)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index 95a6c36cea4c..0588e552b8ec 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -373,6 +373,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
 		dqm = pqn->kq->dev->dqm;
 		dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
 		kernel_queue_uninit(pqn->kq, false);
+		pqn->kq = NULL;
 	}
 
 	if (pqn->q) {
@@ -396,6 +397,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
 		kfree(pqn->q->properties.cu_mask);
 		pqn->q->properties.cu_mask = NULL;
 		uninit_queue(pqn->q);
+		pqn->q = NULL;
 	}
 
 	list_del(&pqn->process_queue_list);
-- 
2.25.1
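
The kfd_process.c and kfd_process_queue_manager.c hunks above share one
idea: make teardown idempotent, so that a repeated destroy (for example,
user space retrying after the first attempt returned -EIO) cannot free
the same object twice. Below is a minimal standalone sketch of the two
techniques, using hypothetical example_* names rather than the actual
driver code:

#include <linux/kobject.h>
#include <linux/slab.h>

/* Technique 1: NULL the owning pointer right after freeing, as the
 * pqm_destroy_queue() hunks do with pqn->kq and pqn->q. A second call
 * then sees NULL, and kfree(NULL) is a no-op.
 */
struct example_owner {
	void *resource;
};

static void example_destroy(struct example_owner *o)
{
	kfree(o->resource);
	o->resource = NULL;	/* makes a repeated destroy harmless */
}

/* Technique 2: guard kobject removal with kobject_get_unless_zero(),
 * as the kfd_procfs_del_queue() hunk does. If the refcount has
 * already dropped to zero, the get fails and we bail out instead of
 * underflowing it with an extra put -- the "refcount_t: underflow"
 * splat quoted in the commit message.
 */
static void example_kobj_del(struct kobject *kobj)
{
	if (!kobj || !kobject_get_unless_zero(kobj))
		return;

	kobject_del(kobj);
	kobject_put(kobj);	/* drop the object's original reference */
	kobject_put(kobj);	/* drop the temporary reference from above */
}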


* [PATCH v2 2/2] drm/amdkfd: Walk through list with dqm lock held
  2021-06-17 12:02 [PATCH v2 1/2] drm/amdkfd: Fix double free when destroying a queue fails xinhui pan
@ 2021-06-17 12:02 ` xinhui pan
  2021-06-17 21:52   ` Felix Kuehling
  2021-06-17 12:11 ` [PATCH v2 1/2] drm/amdkfd: Fix double free when destroying a queue fails Pan, Xinhui
  1 sibling, 1 reply; 6+ messages in thread
From: xinhui pan @ 2021-06-17 12:02 UTC (permalink / raw)
  To: amd-gfx; +Cc: alexander.deucher, Felix.Kuehling, xinhui pan, christian.koenig

Hold the dqm lock while walking the queue list, to avoid any list
corruption.

Signed-off-by: xinhui pan <xinhui.pan@amd.com>
---
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 22 ++++++++++---------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 63a9a19a3987..d62374746c93 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1722,7 +1722,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
 		struct qcm_process_device *qpd)
 {
 	int retval;
-	struct queue *q, *next;
+	struct queue *q;
 	struct kernel_queue *kq, *kq_next;
 	struct mqd_manager *mqd_mgr;
 	struct device_process_node *cur, *next_dpn;
@@ -1779,24 +1779,26 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
 		qpd->reset_wavefronts = false;
 	}
 
-	dqm_unlock(dqm);
-
-	/* Outside the DQM lock because under the DQM lock we can't do
-	 * reclaim or take other locks that others hold while reclaiming.
-	 */
-	if (found)
-		kfd_dec_compute_active(dqm->dev);
-
 	/* Lastly, free mqd resources.
 	 * Do free_mqd() after dqm_unlock to avoid circular locking.
 	 */
-	list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
+	while (!list_empty(&qpd->queues_list)) {
+		q = list_first_entry(&qpd->queues_list, struct queue, list);
 		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
 				q->properties.type)];
 		list_del(&q->list);
 		qpd->queue_count--;
+		dqm_unlock(dqm);
 		mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
+		dqm_lock(dqm);
 	}
+	dqm_unlock(dqm);
+
+	/* Outside the DQM lock because under the DQM lock we can't do
+	 * reclaim or take other locks that others hold while reclaiming.
+	 */
+	if (found)
+		kfd_dec_compute_active(dqm->dev);
 
 	return retval;
 }
-- 
2.25.1
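
The reworked loop above is a standard kernel pattern for lists whose
per-element cleanup must run without the lock held (here, free_mqd()
can trigger reclaim, and reclaim can take locks that others hold while
waiting on the dqm lock). list_for_each_entry_safe() only tolerates
the iterator itself deleting the current entry; once the lock is
dropped mid-loop, its cached next pointer can be freed behind our
back. A generic sketch of the safe form, with hypothetical item and
heavy_cleanup() names:

#include <linux/list.h>
#include <linux/mutex.h>

struct item {
	struct list_head list;
};

static void drain_list(struct list_head *head, struct mutex *lock,
		       void (*heavy_cleanup)(struct item *))
{
	struct item *it;

	mutex_lock(lock);
	while (!list_empty(head)) {
		/* Re-read the head on each iteration; never trust a
		 * next pointer cached before the lock was dropped.
		 */
		it = list_first_entry(head, struct item, list);
		list_del(&it->list);		/* detach while locked */

		mutex_unlock(lock);
		heavy_cleanup(it);		/* may sleep or reclaim */
		mutex_lock(lock);
	}
	mutex_unlock(lock);
}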


* Re: [PATCH v2 1/2] drm/amdkfd: Fix double free when destroying a queue fails
  2021-06-17 12:02 [PATCH v2 1/2] drm/amdkfd: Fix double free when destroying a queue fails xinhui pan
  2021-06-17 12:02 ` [PATCH v2 2/2] drm/amdkfd: Walk through list with dqm lock held xinhui pan
@ 2021-06-17 12:11 ` Pan, Xinhui
  2021-06-17 12:41   ` Pan, Xinhui
  1 sibling, 1 reply; 6+ messages in thread
From: Pan, Xinhui @ 2021-06-17 12:11 UTC (permalink / raw)
  To: Pan, Xinhui
  Cc: Deucher, Alexander, Kuehling, Felix, Pan, Xinhui, Koenig,
	Christian, amd-gfx

Felix,
what I am thinking of, like below, looks simpler. :)

@@ -1501,6 +1501,11 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
        /* remove queue from list to prevent rescheduling after preemption */
        dqm_lock(dqm);
 
+       if (dqm->is_hws_hang) {
+               retval = -EIO;
+               goto failed_try_destroy_debugged_queue;
+       }
+
        if (qpd->is_debug) {
                /*
                 * error, currently we do not allow to destroy a queue

> On 17 Jun 2021, at 20:02, Pan, Xinhui <Xinhui.Pan@amd.com> wrote:
> 
> Handle queue destroy failure while the CP is hung.
> Once the CP hangs, kfd triggers a GPU reset and sets related flags to
> stop the driver from touching the queue. As we leave the queue as it
> is, we need to keep its resources as they are too.
> 
> Regardless of whether user space tries to destroy the queue again, we
> need to put the queue back on the list so that process termination can
> do the cleanup work. Moreover, if user space does try to destroy the
> queue again, we will not free its resources twice.
> 
> Kfd returns -EIO in this case, so let's handle it now.
> 
> Some error logs observed without this patch are pasted below.
> 
> amdgpu: Can't create new usermode queue because -1 queues were already
> created
> 
> refcount_t: underflow; use-after-free.
> Call Trace:
> kobject_put+0xe6/0x1b0
> kfd_procfs_del_queue+0x37/0x50 [amdgpu]
> pqm_destroy_queue+0x17a/0x390 [amdgpu]
> kfd_ioctl_destroy_queue+0x57/0xc0 [amdgpu]
> kfd_ioctl+0x463/0x690 [amdgpu]
> 
> BUG kmalloc-32 (Tainted: G        W        ): Object already free
> INFO: Allocated in allocate_sdma_mqd+0x30/0xb0 [amdgpu] age=4796 cpu=2
> pid=2511
> __slab_alloc+0x72/0x80
> kmem_cache_alloc_trace+0x81f/0x8c0
> allocate_sdma_mqd+0x30/0xb0 [amdgpu]
> create_queue_cpsch+0xbf/0x470 [amdgpu]
> pqm_create_queue+0x28d/0x6d0 [amdgpu]
> kfd_ioctl_create_queue+0x492/0xae0 [amdgpu]
> INFO: Freed in free_mqd_hiq_sdma+0x20/0x60 [amdgpu] age=2537 cpu=7
> pid=2511
> kfree+0x322/0x340
> free_mqd_hiq_sdma+0x20/0x60 [amdgpu]
> destroy_queue_cpsch+0x20c/0x330 [amdgpu]
> pqm_destroy_queue+0x1a3/0x390 [amdgpu]
> kfd_ioctl_destroy_queue+0x57/0xc0 [amdgpu]
> 
> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
> ---
> .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c   | 13 +++++++++++++
> drivers/gpu/drm/amd/amdkfd/kfd_process.c            |  4 +++-
> .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c  |  2 ++
> 3 files changed, 18 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index c069fa259b30..63a9a19a3987 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -1530,6 +1530,11 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
> 				KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
> 		if (retval == -ETIME)
> 			qpd->reset_wavefronts = true;
> +		/* In gpu reset? We leave the queue as it is, so do NOT
> +		 * cleanup the resource.
> +		 */
> +		else if (retval == -EIO)
> +			goto failed_execute_queue;
> 		if (q->properties.is_gws) {
> 			dqm->gws_queue_count--;
> 			qpd->mapped_gws_queue = false;
> @@ -1551,6 +1556,14 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
> 
> 	return retval;
> 
> +failed_execute_queue:
> +	/* Put queue back to the list, then we have chance to destroy it.
> +	 * FIXME: we do NOT want the queue in the runlist again.
> +	 */
> +	list_add(&q->list, &qpd->queues_list);
> +	qpd->queue_count++;
> +	if (q->properties.is_active)
> +		increment_queue_count(dqm, q->properties.type);
> failed_try_destroy_debugged_queue:
> 
> 	dqm_unlock(dqm);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 09b98a83f670..984197e5929f 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -607,11 +607,13 @@ static int kfd_procfs_add_sysfs_files(struct kfd_process *p)
> 
> void kfd_procfs_del_queue(struct queue *q)
> {
> -	if (!q)
> +	if (!q || !kobject_get_unless_zero(&q->kobj))
> 		return;
> 
> 	kobject_del(&q->kobj);
> 	kobject_put(&q->kobj);
> +	/* paired with the get above */
> +	kobject_put(&q->kobj);
> }
> 
> int kfd_process_create_wq(void)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index 95a6c36cea4c..0588e552b8ec 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -373,6 +373,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
> 		dqm = pqn->kq->dev->dqm;
> 		dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
> 		kernel_queue_uninit(pqn->kq, false);
> +		pqn->kq = NULL;
> 	}
> 
> 	if (pqn->q) {
> @@ -396,6 +397,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
> 		kfree(pqn->q->properties.cu_mask);
> 		pqn->q->properties.cu_mask = NULL;
> 		uninit_queue(pqn->q);
> +		pqn->q = NULL;
> 	}
> 
> 	list_del(&pqn->process_queue_list);
> -- 
> 2.25.1
> 



* Re: [PATCH v2 1/2] drm/amdkfd: Fix double free when destroying a queue fails
  2021-06-17 12:11 ` [PATCH v2 1/2] drm/amdkfd: Fix double free when destroying a queue fails Pan, Xinhui
@ 2021-06-17 12:41   ` Pan, Xinhui
  2021-06-17 21:52     ` Felix Kuehling
  0 siblings, 1 reply; 6+ messages in thread
From: Pan, Xinhui @ 2021-06-17 12:41 UTC (permalink / raw)
  To: Pan, Xinhui
  Cc: Deucher, Alexander, Kuehling, Felix, Pan, Xinhui, Koenig,
	Christian, amd-gfx

Felix,
What I am wondering is: if the CP hangs, could we assume all usermode queues have stopped?
If so, we can do the cleanup work regardless of the retval of execute_queues_cpsch().

> On 17 Jun 2021, at 20:11, Pan, Xinhui <Xinhui.Pan@amd.com> wrote:
> 
> Felix,
> what I am thinking of, like below, looks simpler. :)
> 
> @@ -1501,6 +1501,11 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>        /* remove queue from list to prevent rescheduling after preemption */
>        dqm_lock(dqm);
> 
> +       if (dqm->is_hws_hang) {
> +               retval = -EIO;
> +               goto failed_try_destroy_debugged_queue;
> +       }
> +
>        if (qpd->is_debug) {
>                /*
>                 * error, currently we do not allow to destroy a queue
> 
>> On 17 Jun 2021, at 20:02, Pan, Xinhui <Xinhui.Pan@amd.com> wrote:
>> 
>> Handle queue destroy failure while the CP is hung.
>> Once the CP hangs, kfd triggers a GPU reset and sets related flags to
>> stop the driver from touching the queue. As we leave the queue as it
>> is, we need to keep its resources as they are too.
>> 
>> Regardless of whether user space tries to destroy the queue again, we
>> need to put the queue back on the list so that process termination can
>> do the cleanup work. Moreover, if user space does try to destroy the
>> queue again, we will not free its resources twice.
>> 
>> Kfd returns -EIO in this case, so let's handle it now.
>> 
>> Some error logs observed without this patch are pasted below.
>> 
>> amdgpu: Can't create new usermode queue because -1 queues were already
>> created
>> 
>> refcount_t: underflow; use-after-free.
>> Call Trace:
>> kobject_put+0xe6/0x1b0
>> kfd_procfs_del_queue+0x37/0x50 [amdgpu]
>> pqm_destroy_queue+0x17a/0x390 [amdgpu]
>> kfd_ioctl_destroy_queue+0x57/0xc0 [amdgpu]
>> kfd_ioctl+0x463/0x690 [amdgpu]
>> 
>> BUG kmalloc-32 (Tainted: G        W        ): Object already free
>> INFO: Allocated in allocate_sdma_mqd+0x30/0xb0 [amdgpu] age=4796 cpu=2
>> pid=2511
>> __slab_alloc+0x72/0x80
>> kmem_cache_alloc_trace+0x81f/0x8c0
>> allocate_sdma_mqd+0x30/0xb0 [amdgpu]
>> create_queue_cpsch+0xbf/0x470 [amdgpu]
>> pqm_create_queue+0x28d/0x6d0 [amdgpu]
>> kfd_ioctl_create_queue+0x492/0xae0 [amdgpu]
>> INFO: Freed in free_mqd_hiq_sdma+0x20/0x60 [amdgpu] age=2537 cpu=7
>> pid=2511
>> kfree+0x322/0x340
>> free_mqd_hiq_sdma+0x20/0x60 [amdgpu]
>> destroy_queue_cpsch+0x20c/0x330 [amdgpu]
>> pqm_destroy_queue+0x1a3/0x390 [amdgpu]
>> kfd_ioctl_destroy_queue+0x57/0xc0 [amdgpu]
>> 
>> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
>> ---
>> .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c   | 13 +++++++++++++
>> drivers/gpu/drm/amd/amdkfd/kfd_process.c            |  4 +++-
>> .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c  |  2 ++
>> 3 files changed, 18 insertions(+), 1 deletion(-)
>> 
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> index c069fa259b30..63a9a19a3987 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> @@ -1530,6 +1530,11 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>> 				KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
>> 		if (retval == -ETIME)
>> 			qpd->reset_wavefronts = true;
>> +		/* In gpu reset? We leave the queue as it is, so do NOT
>> +		 * cleanup the resource.
>> +		 */
>> +		else if (retval == -EIO)
>> +			goto failed_execute_queue;
>> 		if (q->properties.is_gws) {
>> 			dqm->gws_queue_count--;
>> 			qpd->mapped_gws_queue = false;
>> @@ -1551,6 +1556,14 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>> 
>> 	return retval;
>> 
>> +failed_execute_queue:
>> +	/* Put queue back to the list, then we have chance to destroy it.
>> +	 * FIXME: we do NOT want the queue in the runlist again.
>> +	 */
>> +	list_add(&q->list, &qpd->queues_list);
>> +	qpd->queue_count++;
>> +	if (q->properties.is_active)
>> +		increment_queue_count(dqm, q->properties.type);
>> failed_try_destroy_debugged_queue:
>> 
>> 	dqm_unlock(dqm);
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> index 09b98a83f670..984197e5929f 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> @@ -607,11 +607,13 @@ static int kfd_procfs_add_sysfs_files(struct kfd_process *p)
>> 
>> void kfd_procfs_del_queue(struct queue *q)
>> {
>> -	if (!q)
>> +	if (!q || !kobject_get_unless_zero(&q->kobj))
>> 		return;
>> 
>> 	kobject_del(&q->kobj);
>> 	kobject_put(&q->kobj);
>> +	/* paired with the get above */
>> +	kobject_put(&q->kobj);
>> }
>> 
>> int kfd_process_create_wq(void)
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> index 95a6c36cea4c..0588e552b8ec 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> @@ -373,6 +373,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
>> 		dqm = pqn->kq->dev->dqm;
>> 		dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
>> 		kernel_queue_uninit(pqn->kq, false);
>> +		pqn->kq = NULL;
>> 	}
>> 
>> 	if (pqn->q) {
>> @@ -396,6 +397,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
>> 		kfree(pqn->q->properties.cu_mask);
>> 		pqn->q->properties.cu_mask = NULL;
>> 		uninit_queue(pqn->q);
>> +		pqn->q = NULL;
>> 	}
>> 
>> 	list_del(&pqn->process_queue_list);
>> -- 
>> 2.25.1
>> 
> 



* Re: [PATCH v2 1/2] drm/amdkfd: Fix double free when destroying a queue fails
  2021-06-17 12:41   ` Pan, Xinhui
@ 2021-06-17 21:52     ` Felix Kuehling
  0 siblings, 0 replies; 6+ messages in thread
From: Felix Kuehling @ 2021-06-17 21:52 UTC (permalink / raw)
  To: Pan, Xinhui; +Cc: Deucher, Alexander, Koenig, Christian, amd-gfx

On 2021-06-17 at 8:41 a.m., Pan, Xinhui wrote:
> Felix,
> What I am wondering is: if the CP hangs, could we assume all usermode queues have stopped?
> If so, we can do the cleanup work regardless of the retval of execute_queues_cpsch().

Right. That's what we currently do with ETIME, which happens when the
hang is first detected. I don't know why we need to treat the case
differently when we're already in a reset.

Regards,
  Felix
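
Felix's point, condensed into a sketch of the error handling in
destroy_queue_cpsch() as it stands with patch 1 applied (simplified
from the hunks in this thread, not the full function):

	retval = execute_queues_cpsch(dqm,
			KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
	if (retval == -ETIME)
		/* hang first detected: note it, keep freeing below */
		qpd->reset_wavefronts = true;
	else if (retval == -EIO)
		/* already in reset: patch 1 restores the queue to the
		 * list and skips the free; the suggestion is that this
		 * could fall through and free, like the -ETIME case.
		 */
		goto failed_execute_queue;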


>
>> On 17 Jun 2021, at 20:11, Pan, Xinhui <Xinhui.Pan@amd.com> wrote:
>>
>> Felix,
>> what I am thinking of, like below, looks simpler. :)
>>
>> @@ -1501,6 +1501,11 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>>        /* remove queue from list to prevent rescheduling after preemption */
>>        dqm_lock(dqm);
>>
>> +       if (dqm->is_hws_hang) {
>> +               retval = -EIO;
>> +               goto failed_try_destroy_debugged_queue;
>> +       }
>> +
>>        if (qpd->is_debug) {
>>                /*
>>                 * error, currently we do not allow to destroy a queue
>>
>>> On 17 Jun 2021, at 20:02, Pan, Xinhui <Xinhui.Pan@amd.com> wrote:
>>>
>>> Handle queue destroy failure while the CP is hung.
>>> Once the CP hangs, kfd triggers a GPU reset and sets related flags to
>>> stop the driver from touching the queue. As we leave the queue as it
>>> is, we need to keep its resources as they are too.
>>>
>>> Regardless of whether user space tries to destroy the queue again, we
>>> need to put the queue back on the list so that process termination can
>>> do the cleanup work. Moreover, if user space does try to destroy the
>>> queue again, we will not free its resources twice.
>>>
>>> Kfd returns -EIO in this case, so let's handle it now.
>>>
>>> Some error logs observed without this patch are pasted below.
>>>
>>> amdgpu: Can't create new usermode queue because -1 queues were already
>>> created
>>>
>>> refcount_t: underflow; use-after-free.
>>> Call Trace:
>>> kobject_put+0xe6/0x1b0
>>> kfd_procfs_del_queue+0x37/0x50 [amdgpu]
>>> pqm_destroy_queue+0x17a/0x390 [amdgpu]
>>> kfd_ioctl_destroy_queue+0x57/0xc0 [amdgpu]
>>> kfd_ioctl+0x463/0x690 [amdgpu]
>>>
>>> BUG kmalloc-32 (Tainted: G        W        ): Object already free
>>> INFO: Allocated in allocate_sdma_mqd+0x30/0xb0 [amdgpu] age=4796 cpu=2
>>> pid=2511
>>> __slab_alloc+0x72/0x80
>>> kmem_cache_alloc_trace+0x81f/0x8c0
>>> allocate_sdma_mqd+0x30/0xb0 [amdgpu]
>>> create_queue_cpsch+0xbf/0x470 [amdgpu]
>>> pqm_create_queue+0x28d/0x6d0 [amdgpu]
>>> kfd_ioctl_create_queue+0x492/0xae0 [amdgpu]
>>> INFO: Freed in free_mqd_hiq_sdma+0x20/0x60 [amdgpu] age=2537 cpu=7
>>> pid=2511
>>> kfree+0x322/0x340
>>> free_mqd_hiq_sdma+0x20/0x60 [amdgpu]
>>> destroy_queue_cpsch+0x20c/0x330 [amdgpu]
>>> pqm_destroy_queue+0x1a3/0x390 [amdgpu]
>>> kfd_ioctl_destroy_queue+0x57/0xc0 [amdgpu]
>>>
>>> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
>>> ---
>>> .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c   | 13 +++++++++++++
>>> drivers/gpu/drm/amd/amdkfd/kfd_process.c            |  4 +++-
>>> .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c  |  2 ++
>>> 3 files changed, 18 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> index c069fa259b30..63a9a19a3987 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> @@ -1530,6 +1530,11 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>>> 				KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
>>> 		if (retval == -ETIME)
>>> 			qpd->reset_wavefronts = true;
>>> +		/* In gpu reset? We leave the queue as it is, so do NOT
>>> +		 * cleanup the resource.
>>> +		 */
>>> +		else if (retval == -EIO)
>>> +			goto failed_execute_queue;
>>> 		if (q->properties.is_gws) {
>>> 			dqm->gws_queue_count--;
>>> 			qpd->mapped_gws_queue = false;
>>> @@ -1551,6 +1556,14 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>>>
>>> 	return retval;
>>>
>>> +failed_execute_queue:
>>> +	/* Put queue back to the list, then we have chance to destroy it.
>>> +	 * FIXME: we do NOT want the queue in the runlist again.
>>> +	 */
>>> +	list_add(&q->list, &qpd->queues_list);
>>> +	qpd->queue_count++;
>>> +	if (q->properties.is_active)
>>> +		increment_queue_count(dqm, q->properties.type);
>>> failed_try_destroy_debugged_queue:
>>>
>>> 	dqm_unlock(dqm);
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> index 09b98a83f670..984197e5929f 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> @@ -607,11 +607,13 @@ static int kfd_procfs_add_sysfs_files(struct kfd_process *p)
>>>
>>> void kfd_procfs_del_queue(struct queue *q)
>>> {
>>> -	if (!q)
>>> +	if (!q || !kobject_get_unless_zero(&q->kobj))
>>> 		return;
>>>
>>> 	kobject_del(&q->kobj);
>>> 	kobject_put(&q->kobj);
>>> +	/* paired with the get above */
>>> +	kobject_put(&q->kobj);
>>> }
>>>
>>> int kfd_process_create_wq(void)
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> index 95a6c36cea4c..0588e552b8ec 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> @@ -373,6 +373,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
>>> 		dqm = pqn->kq->dev->dqm;
>>> 		dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
>>> 		kernel_queue_uninit(pqn->kq, false);
>>> +		pqn->kq = NULL;
>>> 	}
>>>
>>> 	if (pqn->q) {
>>> @@ -396,6 +397,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
>>> 		kfree(pqn->q->properties.cu_mask);
>>> 		pqn->q->properties.cu_mask = NULL;
>>> 		uninit_queue(pqn->q);
>>> +		pqn->q = NULL;
>>> 	}
>>>
>>> 	list_del(&pqn->process_queue_list);
>>> -- 
>>> 2.25.1
>>>

* Re: [PATCH v2 2/2] drm/amdkfd: Walk through list with dqm lock held
  2021-06-17 12:02 ` [PATCH v2 2/2] drm/amdkfd: Walk through list with dqm lock held xinhui pan
@ 2021-06-17 21:52   ` Felix Kuehling
  0 siblings, 0 replies; 6+ messages in thread
From: Felix Kuehling @ 2021-06-17 21:52 UTC (permalink / raw)
  To: xinhui pan, amd-gfx; +Cc: alexander.deucher, christian.koenig

On 2021-06-17 at 8:02 a.m., xinhui pan wrote:
> Hold the dqm lock while walking the queue list, to avoid any list
> corruption.
>
> Signed-off-by: xinhui pan <xinhui.pan@amd.com>

This patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>


> ---
>  .../drm/amd/amdkfd/kfd_device_queue_manager.c | 22 ++++++++++---------
>  1 file changed, 12 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 63a9a19a3987..d62374746c93 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -1722,7 +1722,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
>  		struct qcm_process_device *qpd)
>  {
>  	int retval;
> -	struct queue *q, *next;
> +	struct queue *q;
>  	struct kernel_queue *kq, *kq_next;
>  	struct mqd_manager *mqd_mgr;
>  	struct device_process_node *cur, *next_dpn;
> @@ -1779,24 +1779,26 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
>  		qpd->reset_wavefronts = false;
>  	}
>  
> -	dqm_unlock(dqm);
> -
> -	/* Outside the DQM lock because under the DQM lock we can't do
> -	 * reclaim or take other locks that others hold while reclaiming.
> -	 */
> -	if (found)
> -		kfd_dec_compute_active(dqm->dev);
> -
>  	/* Lastly, free mqd resources.
>  	 * Do free_mqd() after dqm_unlock to avoid circular locking.
>  	 */
> -	list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
> +	while (!list_empty(&qpd->queues_list)) {
> +		q = list_first_entry(&qpd->queues_list, struct queue, list);
>  		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
>  				q->properties.type)];
>  		list_del(&q->list);
>  		qpd->queue_count--;
> +		dqm_unlock(dqm);
>  		mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
> +		dqm_lock(dqm);
>  	}
> +	dqm_unlock(dqm);
> +
> +	/* Outside the DQM lock because under the DQM lock we can't do
> +	 * reclaim or take other locks that others hold while reclaiming.
> +	 */
> +	if (found)
> +		kfd_dec_compute_active(dqm->dev);
>  
>  	return retval;
>  }
