All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3] drm/amdkfd: Fix some double free when destroy queue fails
@ 2021-06-18  6:02 xinhui pan
  2021-06-18 21:02 ` Felix Kuehling
  0 siblings, 1 reply; 2+ messages in thread
From: xinhui pan @ 2021-06-18  6:02 UTC (permalink / raw)
  To: amd-gfx; +Cc: alexander.deucher, Felix.Kuehling, xinhui pan, christian.koenig

Handle queue destroy failure while the CP hangs.
Once the CP hangs, kfd triggers a GPU reset and sets related flags to
stop the driver from touching the queue. But since all usermode queues
have stopped, we actually somehow succeed in removing the queue from the
runlist. So let's do the cleanup work as normal in the -EIO/-ETIME case.

For other fatal error cases, say -ENOMEM, we have to skip the cleanup
this time as the queue might still be running. Regardless of whether
user-space tries to destroy the queue again or not, we need to put the
queue back on the list so that process termination will do the cleanup
work. What's more, if userspace tries to destroy the queue again, we
would not free its resources twice.

Paste some error log below without this patch.

amdgpu: Can't create new usermode queue because -1 queues were already
created

refcount_t: underflow; use-after-free.
Call Trace:
 kobject_put+0xe6/0x1b0
 kfd_procfs_del_queue+0x37/0x50 [amdgpu]
 pqm_destroy_queue+0x17a/0x390 [amdgpu]
 kfd_ioctl_destroy_queue+0x57/0xc0 [amdgpu]
 kfd_ioctl+0x463/0x690 [amdgpu]

BUG kmalloc-32 (Tainted: G        W        ): Object already free
INFO: Allocated in allocate_sdma_mqd+0x30/0xb0 [amdgpu] age=4796 cpu=2
 __slab_alloc+0x72/0x80
 kmem_cache_alloc_trace+0x81f/0x8c0
 allocate_sdma_mqd+0x30/0xb0 [amdgpu]
 create_queue_cpsch+0xbf/0x470 [amdgpu]
 pqm_create_queue+0x28d/0x6d0 [amdgpu]
 kfd_ioctl_create_queue+0x492/0xae0 [amdgpu]
INFO: Freed in free_mqd_hiq_sdma+0x20/0x60 [amdgpu] age=2537 cpu=7
 kfree+0x322/0x340
 free_mqd_hiq_sdma+0x20/0x60 [amdgpu]
 destroy_queue_cpsch+0x20c/0x330 [amdgpu]
 pqm_destroy_queue+0x1a3/0x390 [amdgpu]
 kfd_ioctl_destroy_queue+0x57/0xc0 [amdgpu]

Signed-off-by: xinhui pan <xinhui.pan@amd.com>
---
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 15 ++++++++++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c          |  4 +++-
 .../drm/amd/amdkfd/kfd_process_queue_manager.c    |  4 +++-
 3 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 16a1713808c2..f38f011e6f97 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1528,8 +1528,13 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 		decrement_queue_count(dqm, q->properties.type);
 		retval = execute_queues_cpsch(dqm,
 				KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
-		if (retval == -ETIME)
+		/* CP hang and all usermode queues have stopped.
+		 * We are safe to do the cleanup work.
+		 */
+		if (retval == -ETIME || retval == -EIO)
 			qpd->reset_wavefronts = true;
+		else if (retval)
+			goto failed_execute_queue;
 		if (q->properties.is_gws) {
 			dqm->gws_queue_count--;
 			qpd->mapped_gws_queue = false;
@@ -1551,6 +1556,14 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 
 	return retval;
 
+failed_execute_queue:
+	/* Put queue back to the list, then we have chance to destroy it.
+	 * FIXME: we do NOT want the queue in the runlist again.
+	 */
+	list_add(&q->list, &qpd->queues_list);
+	qpd->queue_count++;
+	if (q->properties.is_active)
+		increment_queue_count(dqm, q->properties.type);
 failed_try_destroy_debugged_queue:
 
 	dqm_unlock(dqm);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 09b98a83f670..984197e5929f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -607,11 +607,13 @@ static int kfd_procfs_add_sysfs_files(struct kfd_process *p)
 
 void kfd_procfs_del_queue(struct queue *q)
 {
-	if (!q)
+	if (!q || !kobject_get_unless_zero(&q->kobj))
 		return;
 
 	kobject_del(&q->kobj);
 	kobject_put(&q->kobj);
+	/* paired with the get above */
+	kobject_put(&q->kobj);
 }
 
 int kfd_process_create_wq(void)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index 95a6c36cea4c..c796c7601365 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -373,6 +373,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
 		dqm = pqn->kq->dev->dqm;
 		dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
 		kernel_queue_uninit(pqn->kq, false);
+		pqn->kq = NULL;
 	}
 
 	if (pqn->q) {
@@ -383,7 +384,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
 			pr_err("Pasid 0x%x destroy queue %d failed, ret %d\n",
 				pqm->process->pasid,
 				pqn->q->properties.queue_id, retval);
-			if (retval != -ETIME)
+			if (retval != -ETIME && retval != -EIO)
 				goto err_destroy_queue;
 		}
 
@@ -396,6 +397,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
 		kfree(pqn->q->properties.cu_mask);
 		pqn->q->properties.cu_mask = NULL;
 		uninit_queue(pqn->q);
+		pqn->q = NULL;
 	}
 
 	list_del(&pqn->process_queue_list);
-- 
2.25.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: [PATCH v3] drm/amdkfd: Fix some double free when destroy queue fails
  2021-06-18  6:02 [PATCH v3] drm/amdkfd: Fix some double free when destroy queue fails xinhui pan
@ 2021-06-18 21:02 ` Felix Kuehling
  0 siblings, 0 replies; 2+ messages in thread
From: Felix Kuehling @ 2021-06-18 21:02 UTC (permalink / raw)
  To: xinhui pan, amd-gfx; +Cc: alexander.deucher, christian.koenig

Am 2021-06-18 um 2:02 a.m. schrieb xinhui pan:
> Handle queue destroy failure while CP hang.
> Once CP got hang, kfd trigger GPU reset and set related flags to stop
> driver touching the queue. But all usermode queues have stopped, we
> actually somehow succeed to remove queue from runlist. So lets do
> cleanup work as normal in -EIO/-ETIME case.
>
> For other fatal error cases, say ENOMEM, we have to skip cleanup this
> time as queue might still be running. Regardless user-space tries to
> destroy the queue again or not. We need put queue back to the list so
> process termination would do the cleanup work. What's more, if userspace
> tries to destroy the queue again, we would not free its resource twice.
>
> Paste some error log below without this patch.
>
> amdgpu: Can't create new usermode queue because -1 queues were already
> created
>
> refcount_t: underflow; use-after-free.
> Call Trace:
>  kobject_put+0xe6/0x1b0
>  kfd_procfs_del_queue+0x37/0x50 [amdgpu]
>  pqm_destroy_queue+0x17a/0x390 [amdgpu]
>  kfd_ioctl_destroy_queue+0x57/0xc0 [amdgpu]
>  kfd_ioctl+0x463/0x690 [amdgpu]
>
> BUG kmalloc-32 (Tainted: G        W        ): Object already free
> INFO: Allocated in allocate_sdma_mqd+0x30/0xb0 [amdgpu] age=4796 cpu=2
>  __slab_alloc+0x72/0x80
>  kmem_cache_alloc_trace+0x81f/0x8c0
>  allocate_sdma_mqd+0x30/0xb0 [amdgpu]
>  create_queue_cpsch+0xbf/0x470 [amdgpu]
>  pqm_create_queue+0x28d/0x6d0 [amdgpu]
>  kfd_ioctl_create_queue+0x492/0xae0 [amdgpu]
> INFO: Freed in free_mqd_hiq_sdma+0x20/0x60 [amdgpu] age=2537 cpu=7
>  kfree+0x322/0x340
>  free_mqd_hiq_sdma+0x20/0x60 [amdgpu]
>  destroy_queue_cpsch+0x20c/0x330 [amdgpu]
>  pqm_destroy_queue+0x1a3/0x390 [amdgpu]
>  kfd_ioctl_destroy_queue+0x57/0xc0 [amdgpu]
>
> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
> ---
>  .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 15 ++++++++++++++-
>  drivers/gpu/drm/amd/amdkfd/kfd_process.c          |  4 +++-
>  .../drm/amd/amdkfd/kfd_process_queue_manager.c    |  4 +++-
>  3 files changed, 20 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 16a1713808c2..f38f011e6f97 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -1528,8 +1528,13 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>  		decrement_queue_count(dqm, q->properties.type);
>  		retval = execute_queues_cpsch(dqm,
>  				KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
> -		if (retval == -ETIME)
> +		/* CP hang and all usermode queues have stopped.
> +		 * We are safe to do the cleanup work.
> +		 */
> +		if (retval == -ETIME || retval == -EIO)
>  			qpd->reset_wavefronts = true;
> +		else if (retval)
> +			goto failed_execute_queue;
>  		if (q->properties.is_gws) {
>  			dqm->gws_queue_count--;
>  			qpd->mapped_gws_queue = false;
> @@ -1551,6 +1556,14 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>  
>  	return retval;
>  
> +failed_execute_queue:
> +	/* Put queue back to the list, then we have chance to destroy it.
> +	 * FIXME: we do NOT want the queue in the runlist again.
> +	 */
> +	list_add(&q->list, &qpd->queues_list);
> +	qpd->queue_count++;
> +	if (q->properties.is_active)
> +		increment_queue_count(dqm, q->properties.type);

You should be able to keep the queue off the runlist by disabling it.
You can do that e.g. by setting q->properties.queue_address = NULL and
q->properties.is_active = false. You'd have to update
dqm->gws_queue_count and qpd->mapped_gws_queue manually, or move that
change before the execute_queues_cpsch call above.

Regards,
  Felix


>  failed_try_destroy_debugged_queue:
>  
>  	dqm_unlock(dqm);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 09b98a83f670..984197e5929f 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -607,11 +607,13 @@ static int kfd_procfs_add_sysfs_files(struct kfd_process *p)
>  
>  void kfd_procfs_del_queue(struct queue *q)
>  {
> -	if (!q)
> +	if (!q || !kobject_get_unless_zero(&q->kobj))
>  		return;
>  
>  	kobject_del(&q->kobj);
>  	kobject_put(&q->kobj);
> +	/* paired with the get above */
> +	kobject_put(&q->kobj);
>  }
>  
>  int kfd_process_create_wq(void)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index 95a6c36cea4c..c796c7601365 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -373,6 +373,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
>  		dqm = pqn->kq->dev->dqm;
>  		dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
>  		kernel_queue_uninit(pqn->kq, false);
> +		pqn->kq = NULL;
>  	}
>  
>  	if (pqn->q) {
> @@ -383,7 +384,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
>  			pr_err("Pasid 0x%x destroy queue %d failed, ret %d\n",
>  				pqm->process->pasid,
>  				pqn->q->properties.queue_id, retval);
> -			if (retval != -ETIME)
> +			if (retval != -ETIME && retval != -EIO)
>  				goto err_destroy_queue;
>  		}
>  
> @@ -396,6 +397,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
>  		kfree(pqn->q->properties.cu_mask);
>  		pqn->q->properties.cu_mask = NULL;
>  		uninit_queue(pqn->q);
> +		pqn->q = NULL;
>  	}
>  
>  	list_del(&pqn->process_queue_list);
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2021-06-18 21:02 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-06-18  6:02 [PATCH v3] drm/amdkfd: Fix some double free when destroy queue fails xinhui pan
2021-06-18 21:02 ` Felix Kuehling

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.