All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdkfd: fix a potential cu_mask memory leak
@ 2021-09-29  8:22 Lang Yu
  2021-09-29 15:24 ` Felix Kuehling
  0 siblings, 1 reply; 9+ messages in thread
From: Lang Yu @ 2021-09-29  8:22 UTC (permalink / raw)
  To: Felix Kuehling, amd-gfx; +Cc: Alex Deucher, Huang Rui, Lang Yu

If user doesn't explicitly call kfd_ioctl_destroy_queue
to destroy all created queues, when the kfd process is
destroyed, some queues' cu_mask memory are not freed.

To avoid forgetting to free them in some places,
free them immediately after use.

Signed-off-by: Lang Yu <lang.yu@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c               |  8 ++++----
 drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 10 ++++------
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 4de907f3e66a..5c0e6dcf692a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -451,8 +451,8 @@ static int kfd_ioctl_set_cu_mask(struct file *filp, struct kfd_process *p,
 	retval = copy_from_user(properties.cu_mask, cu_mask_ptr, cu_mask_size);
 	if (retval) {
 		pr_debug("Could not copy CU mask from userspace");
-		kfree(properties.cu_mask);
-		return -EFAULT;
+		retval = -EFAULT;
+		goto out;
 	}
 
 	mutex_lock(&p->mutex);
@@ -461,8 +461,8 @@ static int kfd_ioctl_set_cu_mask(struct file *filp, struct kfd_process *p,
 
 	mutex_unlock(&p->mutex);
 
-	if (retval)
-		kfree(properties.cu_mask);
+out:
+	kfree(properties.cu_mask);
 
 	return retval;
 }
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index 243dd1efcdbf..4c81d690f31a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -394,8 +394,6 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
 			pdd->qpd.num_gws = 0;
 		}
 
-		kfree(pqn->q->properties.cu_mask);
-		pqn->q->properties.cu_mask = NULL;
 		uninit_queue(pqn->q);
 	}
 
@@ -448,16 +446,16 @@ int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid,
 		return -EFAULT;
 	}
 
-	/* Free the old CU mask memory if it is already allocated, then
-	 * allocate memory for the new CU mask.
-	 */
-	kfree(pqn->q->properties.cu_mask);
+	WARN_ON_ONCE(pqn->q->properties.cu_mask);
 
 	pqn->q->properties.cu_mask_count = p->cu_mask_count;
 	pqn->q->properties.cu_mask = p->cu_mask;
 
 	retval = pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm,
 							pqn->q);
+
+	pqn->q->properties.cu_mask = NULL;
+
 	if (retval != 0)
 		return retval;
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/amdkfd: fix a potential cu_mask memory leak
  2021-09-29  8:22 [PATCH] drm/amdkfd: fix a potential cu_mask memory leak Lang Yu
@ 2021-09-29 15:24 ` Felix Kuehling
  2021-09-29 23:32   ` Yu, Lang
  0 siblings, 1 reply; 9+ messages in thread
From: Felix Kuehling @ 2021-09-29 15:24 UTC (permalink / raw)
  To: Lang Yu, amd-gfx; +Cc: Alex Deucher, Huang Rui

Am 2021-09-29 um 4:22 a.m. schrieb Lang Yu:
> If user doesn't explicitly call kfd_ioctl_destroy_queue
> to destroy all created queues, when the kfd process is
> destroyed, some queues' cu_mask memory are not freed.
>
> To avoid forgetting to free them in some places,
> free them immediately after use.
>
> Signed-off-by: Lang Yu <lang.yu@amd.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c               |  8 ++++----
>  drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 10 ++++------
>  2 files changed, 8 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 4de907f3e66a..5c0e6dcf692a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -451,8 +451,8 @@ static int kfd_ioctl_set_cu_mask(struct file *filp, struct kfd_process *p,
>  	retval = copy_from_user(properties.cu_mask, cu_mask_ptr, cu_mask_size);
>  	if (retval) {
>  		pr_debug("Could not copy CU mask from userspace");
> -		kfree(properties.cu_mask);
> -		return -EFAULT;
> +		retval = -EFAULT;
> +		goto out;
>  	}
>  
>  	mutex_lock(&p->mutex);
> @@ -461,8 +461,8 @@ static int kfd_ioctl_set_cu_mask(struct file *filp, struct kfd_process *p,
>  
>  	mutex_unlock(&p->mutex);
>  
> -	if (retval)
> -		kfree(properties.cu_mask);
> +out:
> +	kfree(properties.cu_mask);
>  
>  	return retval;
>  }
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index 243dd1efcdbf..4c81d690f31a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -394,8 +394,6 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
>  			pdd->qpd.num_gws = 0;
>  		}
>  
> -		kfree(pqn->q->properties.cu_mask);
> -		pqn->q->properties.cu_mask = NULL;
>  		uninit_queue(pqn->q);
>  	}
>  
> @@ -448,16 +446,16 @@ int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid,
>  		return -EFAULT;
>  	}
>  
> -	/* Free the old CU mask memory if it is already allocated, then
> -	 * allocate memory for the new CU mask.
> -	 */
> -	kfree(pqn->q->properties.cu_mask);
> +	WARN_ON_ONCE(pqn->q->properties.cu_mask);
>  
>  	pqn->q->properties.cu_mask_count = p->cu_mask_count;
>  	pqn->q->properties.cu_mask = p->cu_mask;
>  
>  	retval = pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm,
>  							pqn->q);
> +
> +	pqn->q->properties.cu_mask = NULL;
> +

This won't work correctly. We need to save the cu_mask for later.
Otherwise the next time dqm->ops.update_queue is called, for example in
pqm_update_queue or pqm_set_gws, it will wipe out the CU mask in the MQD.

Regards,
  Felix


>  	if (retval != 0)
>  		return retval;
>  

^ permalink raw reply	[flat|nested] 9+ messages in thread

* RE: [PATCH] drm/amdkfd: fix a potential cu_mask memory leak
  2021-09-29 15:24 ` Felix Kuehling
@ 2021-09-29 23:32   ` Yu, Lang
  2021-09-30  1:47     ` Felix Kuehling
  0 siblings, 1 reply; 9+ messages in thread
From: Yu, Lang @ 2021-09-29 23:32 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx; +Cc: Deucher, Alexander, Huang, Ray

[AMD Official Use Only]



>-----Original Message-----
>From: Kuehling, Felix <Felix.Kuehling@amd.com>
>Sent: Wednesday, September 29, 2021 11:25 PM
>To: Yu, Lang <Lang.Yu@amd.com>; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
><Ray.Huang@amd.com>
>Subject: Re: [PATCH] drm/amdkfd: fix a potential cu_mask memory leak
>
>Am 2021-09-29 um 4:22 a.m. schrieb Lang Yu:
>> If user doesn't explicitly call kfd_ioctl_destroy_queue to destroy all
>> created queues, when the kfd process is destroyed, some queues'
>> cu_mask memory are not freed.
>>
>> To avoid forgetting to free them in some places, free them immediately
>> after use.
>>
>> Signed-off-by: Lang Yu <lang.yu@amd.com>
>> ---
>>  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c               |  8 ++++----
>>  drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 10
>> ++++------
>>  2 files changed, 8 insertions(+), 10 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> index 4de907f3e66a..5c0e6dcf692a 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> @@ -451,8 +451,8 @@ static int kfd_ioctl_set_cu_mask(struct file *filp, struct
>kfd_process *p,
>>  	retval = copy_from_user(properties.cu_mask, cu_mask_ptr,
>cu_mask_size);
>>  	if (retval) {
>>  		pr_debug("Could not copy CU mask from userspace");
>> -		kfree(properties.cu_mask);
>> -		return -EFAULT;
>> +		retval = -EFAULT;
>> +		goto out;
>>  	}
>>
>>  	mutex_lock(&p->mutex);
>> @@ -461,8 +461,8 @@ static int kfd_ioctl_set_cu_mask(struct file
>> *filp, struct kfd_process *p,
>>
>>  	mutex_unlock(&p->mutex);
>>
>> -	if (retval)
>> -		kfree(properties.cu_mask);
>> +out:
>> +	kfree(properties.cu_mask);
>>
>>  	return retval;
>>  }
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> index 243dd1efcdbf..4c81d690f31a 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> @@ -394,8 +394,6 @@ int pqm_destroy_queue(struct
>process_queue_manager *pqm, unsigned int qid)
>>  			pdd->qpd.num_gws = 0;
>>  		}
>>
>> -		kfree(pqn->q->properties.cu_mask);
>> -		pqn->q->properties.cu_mask = NULL;
>>  		uninit_queue(pqn->q);
>>  	}
>>
>> @@ -448,16 +446,16 @@ int pqm_set_cu_mask(struct
>process_queue_manager *pqm, unsigned int qid,
>>  		return -EFAULT;
>>  	}
>>
>> -	/* Free the old CU mask memory if it is already allocated, then
>> -	 * allocate memory for the new CU mask.
>> -	 */
>> -	kfree(pqn->q->properties.cu_mask);
>> +	WARN_ON_ONCE(pqn->q->properties.cu_mask);
>>
>>  	pqn->q->properties.cu_mask_count = p->cu_mask_count;
>>  	pqn->q->properties.cu_mask = p->cu_mask;
>>
>>  	retval = pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm,
>>  							pqn->q);
>> +
>> +	pqn->q->properties.cu_mask = NULL;
>> +
>
>This won't work correctly. We need to save the cu_mask for later.
>Otherwise the next time dqm->ops.update_queue is called, for example in
>pqm_update_queue or pqm_set_gws, it will wipe out the CU mask in the MQD.

Let's just return when meeting a null cu_mask in update_cu_mask() to avoid that.
Like following,

static void update_cu_mask(struct mqd_manager *mm, void *mqd,
			   struct queue_properties *q)
{
	struct v10_compute_mqd *m;
	uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */

	if (!q-> cu_mask || q->cu_mask_count == 0)
		return;
	......
}

Is this fine with you? Thanks!

Regards,
Lang
 
>Regards,
>  Felix
>
>
>>  	if (retval != 0)
>>  		return retval;
>>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/amdkfd: fix a potential cu_mask memory leak
  2021-09-29 23:32   ` Yu, Lang
@ 2021-09-30  1:47     ` Felix Kuehling
  2021-09-30  2:23       ` Yu, Lang
  0 siblings, 1 reply; 9+ messages in thread
From: Felix Kuehling @ 2021-09-30  1:47 UTC (permalink / raw)
  To: Yu, Lang, amd-gfx; +Cc: Deucher, Alexander, Huang, Ray

On 2021-09-29 7:32 p.m., Yu, Lang wrote:
> [AMD Official Use Only]
>
>
>
>> -----Original Message-----
>> From: Kuehling, Felix <Felix.Kuehling@amd.com>
>> Sent: Wednesday, September 29, 2021 11:25 PM
>> To: Yu, Lang <Lang.Yu@amd.com>; amd-gfx@lists.freedesktop.org
>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
>> <Ray.Huang@amd.com>
>> Subject: Re: [PATCH] drm/amdkfd: fix a potential cu_mask memory leak
>>
>> Am 2021-09-29 um 4:22 a.m. schrieb Lang Yu:
>>> If user doesn't explicitly call kfd_ioctl_destroy_queue to destroy all
>>> created queues, when the kfd process is destroyed, some queues'
>>> cu_mask memory are not freed.
>>>
>>> To avoid forgetting to free them in some places, free them immediately
>>> after use.
>>>
>>> Signed-off-by: Lang Yu <lang.yu@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c               |  8 ++++----
>>>   drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 10
>>> ++++------
>>>   2 files changed, 8 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> index 4de907f3e66a..5c0e6dcf692a 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> @@ -451,8 +451,8 @@ static int kfd_ioctl_set_cu_mask(struct file *filp, struct
>> kfd_process *p,
>>>   	retval = copy_from_user(properties.cu_mask, cu_mask_ptr,
>> cu_mask_size);
>>>   	if (retval) {
>>>   		pr_debug("Could not copy CU mask from userspace");
>>> -		kfree(properties.cu_mask);
>>> -		return -EFAULT;
>>> +		retval = -EFAULT;
>>> +		goto out;
>>>   	}
>>>
>>>   	mutex_lock(&p->mutex);
>>> @@ -461,8 +461,8 @@ static int kfd_ioctl_set_cu_mask(struct file
>>> *filp, struct kfd_process *p,
>>>
>>>   	mutex_unlock(&p->mutex);
>>>
>>> -	if (retval)
>>> -		kfree(properties.cu_mask);
>>> +out:
>>> +	kfree(properties.cu_mask);
>>>
>>>   	return retval;
>>>   }
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> index 243dd1efcdbf..4c81d690f31a 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> @@ -394,8 +394,6 @@ int pqm_destroy_queue(struct
>> process_queue_manager *pqm, unsigned int qid)
>>>   			pdd->qpd.num_gws = 0;
>>>   		}
>>>
>>> -		kfree(pqn->q->properties.cu_mask);
>>> -		pqn->q->properties.cu_mask = NULL;
>>>   		uninit_queue(pqn->q);
>>>   	}
>>>
>>> @@ -448,16 +446,16 @@ int pqm_set_cu_mask(struct
>> process_queue_manager *pqm, unsigned int qid,
>>>   		return -EFAULT;
>>>   	}
>>>
>>> -	/* Free the old CU mask memory if it is already allocated, then
>>> -	 * allocate memory for the new CU mask.
>>> -	 */
>>> -	kfree(pqn->q->properties.cu_mask);
>>> +	WARN_ON_ONCE(pqn->q->properties.cu_mask);
>>>
>>>   	pqn->q->properties.cu_mask_count = p->cu_mask_count;
>>>   	pqn->q->properties.cu_mask = p->cu_mask;
>>>
>>>   	retval = pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm,
>>>   							pqn->q);
>>> +
>>> +	pqn->q->properties.cu_mask = NULL;
>>> +
>> This won't work correctly. We need to save the cu_mask for later.
>> Otherwise the next time dqm->ops.update_queue is called, for example in
>> pqm_update_queue or pqm_set_gws, it will wipe out the CU mask in the MQD.
> Let's just return when meeting a null cu_mask in update_cu_mask() to avoid that.
> Like following,
>
> static void update_cu_mask(struct mqd_manager *mm, void *mqd,
> 			   struct queue_properties *q)
> {
> 	struct v10_compute_mqd *m;
> 	uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */
>
> 	if (!q-> cu_mask || q->cu_mask_count == 0)
> 		return;
> 	......
> }
>
> Is this fine with you? Thanks!

I think that could work. I still don't like it. It leaves the CU mask in 
the q->properties structure, but it's only ever used temporarily and 
doesn't need to be persistent. I'd argue, in this case, the cu_mask 
shouldn't be in the q->properties structure at all, but should be passed 
as an optional parameter into the dqm->ops.update_queue call.

But I think a simpler fix would be to move the freeing of the CU mask 
into uninit_queue. That would catch all cases where a queue gets 
destroyed, including the process termination case.

Regards,
   Felix


>
> Regards,
> Lang
>   
>> Regards,
>>    Felix
>>
>>
>>>   	if (retval != 0)
>>>   		return retval;
>>>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* RE: [PATCH] drm/amdkfd: fix a potential cu_mask memory leak
  2021-09-30  1:47     ` Felix Kuehling
@ 2021-09-30  2:23       ` Yu, Lang
  2021-09-30  2:28         ` Felix Kuehling
  0 siblings, 1 reply; 9+ messages in thread
From: Yu, Lang @ 2021-09-30  2:23 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx; +Cc: Deucher, Alexander, Huang, Ray



>-----Original Message-----
>From: Kuehling, Felix <Felix.Kuehling@amd.com>
>Sent: Thursday, September 30, 2021 9:47 AM
>To: Yu, Lang <Lang.Yu@amd.com>; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
><Ray.Huang@amd.com>
>Subject: Re: [PATCH] drm/amdkfd: fix a potential cu_mask memory leak
>
>On 2021-09-29 7:32 p.m., Yu, Lang wrote:
>> [AMD Official Use Only]
>>
>>
>>
>>> -----Original Message-----
>>> From: Kuehling, Felix <Felix.Kuehling@amd.com>
>>> Sent: Wednesday, September 29, 2021 11:25 PM
>>> To: Yu, Lang <Lang.Yu@amd.com>; amd-gfx@lists.freedesktop.org
>>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
>>> <Ray.Huang@amd.com>
>>> Subject: Re: [PATCH] drm/amdkfd: fix a potential cu_mask memory leak
>>>
>>> Am 2021-09-29 um 4:22 a.m. schrieb Lang Yu:
>>>> If user doesn't explicitly call kfd_ioctl_destroy_queue to destroy
>>>> all created queues, when the kfd process is destroyed, some queues'
>>>> cu_mask memory are not freed.
>>>>
>>>> To avoid forgetting to free them in some places, free them
>>>> immediately after use.
>>>>
>>>> Signed-off-by: Lang Yu <lang.yu@amd.com>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c               |  8 ++++----
>>>>   drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 10
>>>> ++++------
>>>>   2 files changed, 8 insertions(+), 10 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>> index 4de907f3e66a..5c0e6dcf692a 100644
>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>> @@ -451,8 +451,8 @@ static int kfd_ioctl_set_cu_mask(struct file
>>>> *filp, struct
>>> kfd_process *p,
>>>>   	retval = copy_from_user(properties.cu_mask, cu_mask_ptr,
>>> cu_mask_size);
>>>>   	if (retval) {
>>>>   		pr_debug("Could not copy CU mask from userspace");
>>>> -		kfree(properties.cu_mask);
>>>> -		return -EFAULT;
>>>> +		retval = -EFAULT;
>>>> +		goto out;
>>>>   	}
>>>>
>>>>   	mutex_lock(&p->mutex);
>>>> @@ -461,8 +461,8 @@ static int kfd_ioctl_set_cu_mask(struct file
>>>> *filp, struct kfd_process *p,
>>>>
>>>>   	mutex_unlock(&p->mutex);
>>>>
>>>> -	if (retval)
>>>> -		kfree(properties.cu_mask);
>>>> +out:
>>>> +	kfree(properties.cu_mask);
>>>>
>>>>   	return retval;
>>>>   }
>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>> index 243dd1efcdbf..4c81d690f31a 100644
>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>> @@ -394,8 +394,6 @@ int pqm_destroy_queue(struct
>>> process_queue_manager *pqm, unsigned int qid)
>>>>   			pdd->qpd.num_gws = 0;
>>>>   		}
>>>>
>>>> -		kfree(pqn->q->properties.cu_mask);
>>>> -		pqn->q->properties.cu_mask = NULL;
>>>>   		uninit_queue(pqn->q);
>>>>   	}
>>>>
>>>> @@ -448,16 +446,16 @@ int pqm_set_cu_mask(struct
>>> process_queue_manager *pqm, unsigned int qid,
>>>>   		return -EFAULT;
>>>>   	}
>>>>
>>>> -	/* Free the old CU mask memory if it is already allocated, then
>>>> -	 * allocate memory for the new CU mask.
>>>> -	 */
>>>> -	kfree(pqn->q->properties.cu_mask);
>>>> +	WARN_ON_ONCE(pqn->q->properties.cu_mask);
>>>>
>>>>   	pqn->q->properties.cu_mask_count = p->cu_mask_count;
>>>>   	pqn->q->properties.cu_mask = p->cu_mask;
>>>>
>>>>   	retval = pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm,
>>>>   							pqn->q);
>>>> +
>>>> +	pqn->q->properties.cu_mask = NULL;
>>>> +
>>> This won't work correctly. We need to save the cu_mask for later.
>>> Otherwise the next time dqm->ops.update_queue is called, for example
>>> in pqm_update_queue or pqm_set_gws, it will wipe out the CU mask in the
>MQD.
>> Let's just return when meeting a null cu_mask in update_cu_mask() to avoid
>that.
>> Like following,
>>
>> static void update_cu_mask(struct mqd_manager *mm, void *mqd,
>> 			   struct queue_properties *q)
>> {
>> 	struct v10_compute_mqd *m;
>> 	uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */
>>
>> 	if (!q-> cu_mask || q->cu_mask_count == 0)
>> 		return;
>> 	......
>> }
>>
>> Is this fine with you? Thanks!
>
>I think that could work. I still don't like it. It leaves the CU mask in the q-
>>properties structure, but it's only ever used temporarily and doesn't need to be
>persistent. I'd argue, in this case, the cu_mask shouldn't be in the q->properties
>structure at all, but should be passed as an optional parameter into the dqm-
>>ops.update_queue call.

The cu_mask is originally in q->properties structure. I didn't change that.
What I want to do is keeping the cu_mask memory allocation and deallocation just in kfd_ioctl_set_cu_mask.
instead of everywhere.
 
>But I think a simpler fix would be to move the freeing of the CU mask into
>uninit_queue. That would catch all cases where a queue gets destroyed, including
>the process termination case.

Yes, it' better to free queue related resource in one function.
But we allocate it in kfd_ioctl_set_cu_mask and free it in pqm_set_cu_mask, uninit_queue and so on.
It seems strange and error-prone. Unless we allocate it in create queue.  As you said, it is temporary.
It's not worth keeping it until queue is destroyed. Thanks.

Regards,
Lang

>Regards,
>   Felix
>
>
>>
>> Regards,
>> Lang
>>
>>> Regards,
>>>    Felix
>>>
>>>
>>>>   	if (retval != 0)
>>>>   		return retval;
>>>>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/amdkfd: fix a potential cu_mask memory leak
  2021-09-30  2:23       ` Yu, Lang
@ 2021-09-30  2:28         ` Felix Kuehling
  2021-09-30  2:38           ` Yu, Lang
  0 siblings, 1 reply; 9+ messages in thread
From: Felix Kuehling @ 2021-09-30  2:28 UTC (permalink / raw)
  To: Yu, Lang, amd-gfx; +Cc: Deucher, Alexander, Huang, Ray

On 2021-09-29 10:23 p.m., Yu, Lang wrote:
>> -----Original Message-----
>> From: Kuehling, Felix <Felix.Kuehling@amd.com>
>> Sent: Thursday, September 30, 2021 9:47 AM
>> To: Yu, Lang <Lang.Yu@amd.com>; amd-gfx@lists.freedesktop.org
>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
>> <Ray.Huang@amd.com>
>> Subject: Re: [PATCH] drm/amdkfd: fix a potential cu_mask memory leak
>>
>> On 2021-09-29 7:32 p.m., Yu, Lang wrote:
>>> [AMD Official Use Only]
>>>
>>>
>>>
>>>> -----Original Message-----
>>>> From: Kuehling, Felix <Felix.Kuehling@amd.com>
>>>> Sent: Wednesday, September 29, 2021 11:25 PM
>>>> To: Yu, Lang <Lang.Yu@amd.com>; amd-gfx@lists.freedesktop.org
>>>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
>>>> <Ray.Huang@amd.com>
>>>> Subject: Re: [PATCH] drm/amdkfd: fix a potential cu_mask memory leak
>>>>
>>>> Am 2021-09-29 um 4:22 a.m. schrieb Lang Yu:
>>>>> If user doesn't explicitly call kfd_ioctl_destroy_queue to destroy
>>>>> all created queues, when the kfd process is destroyed, some queues'
>>>>> cu_mask memory are not freed.
>>>>>
>>>>> To avoid forgetting to free them in some places, free them
>>>>> immediately after use.
>>>>>
>>>>> Signed-off-by: Lang Yu <lang.yu@amd.com>
>>>>> ---
>>>>>    drivers/gpu/drm/amd/amdkfd/kfd_chardev.c               |  8 ++++----
>>>>>    drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 10
>>>>> ++++------
>>>>>    2 files changed, 8 insertions(+), 10 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>> index 4de907f3e66a..5c0e6dcf692a 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>> @@ -451,8 +451,8 @@ static int kfd_ioctl_set_cu_mask(struct file
>>>>> *filp, struct
>>>> kfd_process *p,
>>>>>    	retval = copy_from_user(properties.cu_mask, cu_mask_ptr,
>>>> cu_mask_size);
>>>>>    	if (retval) {
>>>>>    		pr_debug("Could not copy CU mask from userspace");
>>>>> -		kfree(properties.cu_mask);
>>>>> -		return -EFAULT;
>>>>> +		retval = -EFAULT;
>>>>> +		goto out;
>>>>>    	}
>>>>>
>>>>>    	mutex_lock(&p->mutex);
>>>>> @@ -461,8 +461,8 @@ static int kfd_ioctl_set_cu_mask(struct file
>>>>> *filp, struct kfd_process *p,
>>>>>
>>>>>    	mutex_unlock(&p->mutex);
>>>>>
>>>>> -	if (retval)
>>>>> -		kfree(properties.cu_mask);
>>>>> +out:
>>>>> +	kfree(properties.cu_mask);
>>>>>
>>>>>    	return retval;
>>>>>    }
>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>>> index 243dd1efcdbf..4c81d690f31a 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>>> @@ -394,8 +394,6 @@ int pqm_destroy_queue(struct
>>>> process_queue_manager *pqm, unsigned int qid)
>>>>>    			pdd->qpd.num_gws = 0;
>>>>>    		}
>>>>>
>>>>> -		kfree(pqn->q->properties.cu_mask);
>>>>> -		pqn->q->properties.cu_mask = NULL;
>>>>>    		uninit_queue(pqn->q);
>>>>>    	}
>>>>>
>>>>> @@ -448,16 +446,16 @@ int pqm_set_cu_mask(struct
>>>> process_queue_manager *pqm, unsigned int qid,
>>>>>    		return -EFAULT;
>>>>>    	}
>>>>>
>>>>> -	/* Free the old CU mask memory if it is already allocated, then
>>>>> -	 * allocate memory for the new CU mask.
>>>>> -	 */
>>>>> -	kfree(pqn->q->properties.cu_mask);
>>>>> +	WARN_ON_ONCE(pqn->q->properties.cu_mask);
>>>>>
>>>>>    	pqn->q->properties.cu_mask_count = p->cu_mask_count;
>>>>>    	pqn->q->properties.cu_mask = p->cu_mask;
>>>>>
>>>>>    	retval = pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm,
>>>>>    							pqn->q);
>>>>> +
>>>>> +	pqn->q->properties.cu_mask = NULL;
>>>>> +
>>>> This won't work correctly. We need to save the cu_mask for later.
>>>> Otherwise the next time dqm->ops.update_queue is called, for example
>>>> in pqm_update_queue or pqm_set_gws, it will wipe out the CU mask in the
>> MQD.
>>> Let's just return when meeting a null cu_mask in update_cu_mask() to avoid
>> that.
>>> Like following,
>>>
>>> static void update_cu_mask(struct mqd_manager *mm, void *mqd,
>>> 			   struct queue_properties *q)
>>> {
>>> 	struct v10_compute_mqd *m;
>>> 	uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */
>>>
>>> 	if (!q-> cu_mask || q->cu_mask_count == 0)
>>> 		return;
>>> 	......
>>> }
>>>
>>> Is this fine with you? Thanks!
>> I think that could work. I still don't like it. It leaves the CU mask in the q-
>>> properties structure, but it's only ever used temporarily and doesn't need to be
>> persistent. I'd argue, in this case, the cu_mask shouldn't be in the q->properties
>> structure at all, but should be passed as an optional parameter into the dqm-
>>> ops.update_queue call.
> The cu_mask is originally in q->properties structure. I didn't change that.
> What I want to do is keeping the cu_mask memory allocation and deallocation just in kfd_ioctl_set_cu_mask.
> instead of everywhere.

You're not changing where it is stored. But you're changing it from 
something persistent (while the queue exists) to something ephemeral 
(while the ioctl call is executing in the kernel). I think that would 
justify removing the persistent pointer from the q->properties structure.


>   
>> But I think a simpler fix would be to move the freeing of the CU mask into
>> uninit_queue. That would catch all cases where a queue gets destroyed, including
>> the process termination case.
> Yes, it' better to free queue related resource in one function.
> But we allocate it in kfd_ioctl_set_cu_mask and free it in pqm_set_cu_mask, uninit_queue and so on.

My proposal here is to only free it in uninit_queue (when the queue is 
destroyed) or in pqm_set_cu_mask (when it is replaced by a different 
cu_mask).

Regards,
   Felix


> It seems strange and error-prone. Unless we allocate it in create queue.  As you said, it is temporary.
> It's not worth keeping it until queue is destroyed. Thanks.
>
> Regards,
> Lang
>
>> Regards,
>>    Felix
>>
>>
>>> Regards,
>>> Lang
>>>
>>>> Regards,
>>>>     Felix
>>>>
>>>>
>>>>>    	if (retval != 0)
>>>>>    		return retval;
>>>>>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* RE: [PATCH] drm/amdkfd: fix a potential cu_mask memory leak
  2021-09-30  2:28         ` Felix Kuehling
@ 2021-09-30  2:38           ` Yu, Lang
  2021-09-30  3:25             ` Felix Kuehling
  0 siblings, 1 reply; 9+ messages in thread
From: Yu, Lang @ 2021-09-30  2:38 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx; +Cc: Deucher, Alexander, Huang, Ray



>-----Original Message-----
>From: Kuehling, Felix <Felix.Kuehling@amd.com>
>Sent: Thursday, September 30, 2021 10:28 AM
>To: Yu, Lang <Lang.Yu@amd.com>; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
><Ray.Huang@amd.com>
>Subject: Re: [PATCH] drm/amdkfd: fix a potential cu_mask memory leak
>
>On 2021-09-29 10:23 p.m., Yu, Lang wrote:
>>> -----Original Message-----
>>> From: Kuehling, Felix <Felix.Kuehling@amd.com>
>>> Sent: Thursday, September 30, 2021 9:47 AM
>>> To: Yu, Lang <Lang.Yu@amd.com>; amd-gfx@lists.freedesktop.org
>>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
>>> <Ray.Huang@amd.com>
>>> Subject: Re: [PATCH] drm/amdkfd: fix a potential cu_mask memory leak
>>>
>>> On 2021-09-29 7:32 p.m., Yu, Lang wrote:
>>>> [AMD Official Use Only]
>>>>
>>>>
>>>>
>>>>> -----Original Message-----
>>>>> From: Kuehling, Felix <Felix.Kuehling@amd.com>
>>>>> Sent: Wednesday, September 29, 2021 11:25 PM
>>>>> To: Yu, Lang <Lang.Yu@amd.com>; amd-gfx@lists.freedesktop.org
>>>>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
>>>>> <Ray.Huang@amd.com>
>>>>> Subject: Re: [PATCH] drm/amdkfd: fix a potential cu_mask memory
>>>>> leak
>>>>>
>>>>> Am 2021-09-29 um 4:22 a.m. schrieb Lang Yu:
>>>>>> If user doesn't explicitly call kfd_ioctl_destroy_queue to destroy
>>>>>> all created queues, when the kfd process is destroyed, some queues'
>>>>>> cu_mask memory are not freed.
>>>>>>
>>>>>> To avoid forgetting to free them in some places, free them
>>>>>> immediately after use.
>>>>>>
>>>>>> Signed-off-by: Lang Yu <lang.yu@amd.com>
>>>>>> ---
>>>>>>    drivers/gpu/drm/amd/amdkfd/kfd_chardev.c               |  8 ++++----
>>>>>>    drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 10
>>>>>> ++++------
>>>>>>    2 files changed, 8 insertions(+), 10 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>>> index 4de907f3e66a..5c0e6dcf692a 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>>> @@ -451,8 +451,8 @@ static int kfd_ioctl_set_cu_mask(struct file
>>>>>> *filp, struct
>>>>> kfd_process *p,
>>>>>>    	retval = copy_from_user(properties.cu_mask, cu_mask_ptr,
>>>>> cu_mask_size);
>>>>>>    	if (retval) {
>>>>>>    		pr_debug("Could not copy CU mask from userspace");
>>>>>> -		kfree(properties.cu_mask);
>>>>>> -		return -EFAULT;
>>>>>> +		retval = -EFAULT;
>>>>>> +		goto out;
>>>>>>    	}
>>>>>>
>>>>>>    	mutex_lock(&p->mutex);
>>>>>> @@ -461,8 +461,8 @@ static int kfd_ioctl_set_cu_mask(struct file
>>>>>> *filp, struct kfd_process *p,
>>>>>>
>>>>>>    	mutex_unlock(&p->mutex);
>>>>>>
>>>>>> -	if (retval)
>>>>>> -		kfree(properties.cu_mask);
>>>>>> +out:
>>>>>> +	kfree(properties.cu_mask);
>>>>>>
>>>>>>    	return retval;
>>>>>>    }
>>>>>> diff --git
>>>>>> a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>>>> index 243dd1efcdbf..4c81d690f31a 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>>>> @@ -394,8 +394,6 @@ int pqm_destroy_queue(struct
>>>>> process_queue_manager *pqm, unsigned int qid)
>>>>>>    			pdd->qpd.num_gws = 0;
>>>>>>    		}
>>>>>>
>>>>>> -		kfree(pqn->q->properties.cu_mask);
>>>>>> -		pqn->q->properties.cu_mask = NULL;
>>>>>>    		uninit_queue(pqn->q);
>>>>>>    	}
>>>>>>
>>>>>> @@ -448,16 +446,16 @@ int pqm_set_cu_mask(struct
>>>>> process_queue_manager *pqm, unsigned int qid,
>>>>>>    		return -EFAULT;
>>>>>>    	}
>>>>>>
>>>>>> -	/* Free the old CU mask memory if it is already allocated, then
>>>>>> -	 * allocate memory for the new CU mask.
>>>>>> -	 */
>>>>>> -	kfree(pqn->q->properties.cu_mask);
>>>>>> +	WARN_ON_ONCE(pqn->q->properties.cu_mask);
>>>>>>
>>>>>>    	pqn->q->properties.cu_mask_count = p->cu_mask_count;
>>>>>>    	pqn->q->properties.cu_mask = p->cu_mask;
>>>>>>
>>>>>>    	retval = pqn->q->device->dqm->ops.update_queue(pqn->q-
>>device->dqm,
>>>>>>    							pqn->q);
>>>>>> +
>>>>>> +	pqn->q->properties.cu_mask = NULL;
>>>>>> +
>>>>> This won't work correctly. We need to save the cu_mask for later.
>>>>> Otherwise the next time dqm->ops.update_queue is called, for
>>>>> example in pqm_update_queue or pqm_set_gws, it will wipe out the CU
>>>>> mask in the
>>> MQD.
>>>> Let's just return when meeting a null cu_mask in update_cu_mask() to
>>>> avoid
>>> that.
>>>> Like following,
>>>>
>>>> static void update_cu_mask(struct mqd_manager *mm, void *mqd,
>>>> 			   struct queue_properties *q)
>>>> {
>>>> 	struct v10_compute_mqd *m;
>>>> 	uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */
>>>>
>>>> 	if (!q-> cu_mask || q->cu_mask_count == 0)
>>>> 		return;
>>>> 	......
>>>> }
>>>>
>>>> Is this fine with you? Thanks!
>>> I think that could work. I still don't like it. It leaves the CU mask
>>> in the q-
>>>> properties structure, but it's only ever used temporarily and
>>>> doesn't need to be
>>> persistent. I'd argue, in this case, the cu_mask shouldn't be in the
>>> q->properties structure at all, but should be passed as an optional
>>> parameter into the dqm-
>>>> ops.update_queue call.
>> The cu_mask is originally in q->properties structure. I didn't change that.
>> What I want to do is keeping the cu_mask memory allocation and deallocation
>just in kfd_ioctl_set_cu_mask.
>> instead of everywhere.
>
>You're not changing where it is stored. But you're changing it from something
>persistent (while the queue exists) to something ephemeral (while the ioctl call is
>executing in the kernel). I think that would justify removing the persistent pointer
>from the q->properties structure.

Mmm, actually it has been copied to mqd memory. It doesn't make too much sense
to keep it persistent.
  
>>
>>> But I think a simpler fix would be to move the freeing of the CU mask
>>> into uninit_queue. That would catch all cases where a queue gets
>>> destroyed, including the process termination case.
>> Yes, it' better to free queue related resource in one function.
>> But we allocate it in kfd_ioctl_set_cu_mask and free it in pqm_set_cu_mask,
>uninit_queue and so on.
>
>My proposal here is to only free it in uninit_queue (when the queue is
>destroyed) or in pqm_set_cu_mask (when it is replaced by a different cu_mask).

Anyway, I still think it's strange and error-pone to allocate it in kfd_ioctl_set_cu_mask 
and free it in uninit_queue and pqm_set_cu_mask. 

Regards,
Lang

>Regards,
>   Felix
>
>
>> It seems strange and error-prone. Unless we allocate it in create queue.  As you
>said, it is temporary.
>> It's not worth keeping it until queue is destroyed. Thanks.
>>
>> Regards,
>> Lang
>>
>>> Regards,
>>>    Felix
>>>
>>>
>>>> Regards,
>>>> Lang
>>>>
>>>>> Regards,
>>>>>     Felix
>>>>>
>>>>>
>>>>>>    	if (retval != 0)
>>>>>>    		return retval;
>>>>>>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/amdkfd: fix a potential cu_mask memory leak
  2021-09-30  2:38           ` Yu, Lang
@ 2021-09-30  3:25             ` Felix Kuehling
  2021-09-30  3:48               ` Yu, Lang
  0 siblings, 1 reply; 9+ messages in thread
From: Felix Kuehling @ 2021-09-30  3:25 UTC (permalink / raw)
  To: Yu, Lang, amd-gfx; +Cc: Deucher, Alexander, Huang, Ray

On 2021-09-29 10:38 p.m., Yu, Lang wrote:
>> -----Original Message-----
>> From: Kuehling, Felix <Felix.Kuehling@amd.com>
>> Sent: Thursday, September 30, 2021 10:28 AM
>> To: Yu, Lang <Lang.Yu@amd.com>; amd-gfx@lists.freedesktop.org
>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
>> <Ray.Huang@amd.com>
>> Subject: Re: [PATCH] drm/amdkfd: fix a potential cu_mask memory leak
>>
>> On 2021-09-29 10:23 p.m., Yu, Lang wrote:
>>>> -----Original Message-----
>>>> From: Kuehling, Felix <Felix.Kuehling@amd.com>
>>>> Sent: Thursday, September 30, 2021 9:47 AM
>>>> To: Yu, Lang <Lang.Yu@amd.com>; amd-gfx@lists.freedesktop.org
>>>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
>>>> <Ray.Huang@amd.com>
>>>> Subject: Re: [PATCH] drm/amdkfd: fix a potential cu_mask memory leak
>>>>
>>>> On 2021-09-29 7:32 p.m., Yu, Lang wrote:
>>>>> [AMD Official Use Only]
>>>>>
>>>>>
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Kuehling, Felix <Felix.Kuehling@amd.com>
>>>>>> Sent: Wednesday, September 29, 2021 11:25 PM
>>>>>> To: Yu, Lang <Lang.Yu@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
>>>>>> <Ray.Huang@amd.com>
>>>>>> Subject: Re: [PATCH] drm/amdkfd: fix a potential cu_mask memory
>>>>>> leak
>>>>>>
>>>>>> Am 2021-09-29 um 4:22 a.m. schrieb Lang Yu:
>>>>>>> If user doesn't explicitly call kfd_ioctl_destroy_queue to destroy
>>>>>>> all created queues, when the kfd process is destroyed, some queues'
>>>>>>> cu_mask memory are not freed.
>>>>>>>
>>>>>>> To avoid forgetting to free them in some places, free them
>>>>>>> immediately after use.
>>>>>>>
>>>>>>> Signed-off-by: Lang Yu <lang.yu@amd.com>
>>>>>>> ---
>>>>>>>     drivers/gpu/drm/amd/amdkfd/kfd_chardev.c               |  8 ++++----
>>>>>>>     drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 10
>>>>>>> ++++------
>>>>>>>     2 files changed, 8 insertions(+), 10 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>>>> index 4de907f3e66a..5c0e6dcf692a 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>>>> @@ -451,8 +451,8 @@ static int kfd_ioctl_set_cu_mask(struct file
>>>>>>> *filp, struct
>>>>>> kfd_process *p,
>>>>>>>     	retval = copy_from_user(properties.cu_mask, cu_mask_ptr,
>>>>>> cu_mask_size);
>>>>>>>     	if (retval) {
>>>>>>>     		pr_debug("Could not copy CU mask from userspace");
>>>>>>> -		kfree(properties.cu_mask);
>>>>>>> -		return -EFAULT;
>>>>>>> +		retval = -EFAULT;
>>>>>>> +		goto out;
>>>>>>>     	}
>>>>>>>
>>>>>>>     	mutex_lock(&p->mutex);
>>>>>>> @@ -461,8 +461,8 @@ static int kfd_ioctl_set_cu_mask(struct file
>>>>>>> *filp, struct kfd_process *p,
>>>>>>>
>>>>>>>     	mutex_unlock(&p->mutex);
>>>>>>>
>>>>>>> -	if (retval)
>>>>>>> -		kfree(properties.cu_mask);
>>>>>>> +out:
>>>>>>> +	kfree(properties.cu_mask);
>>>>>>>
>>>>>>>     	return retval;
>>>>>>>     }
>>>>>>> diff --git
>>>>>>> a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>>>>> index 243dd1efcdbf..4c81d690f31a 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>>>>> @@ -394,8 +394,6 @@ int pqm_destroy_queue(struct
>>>>>> process_queue_manager *pqm, unsigned int qid)
>>>>>>>     			pdd->qpd.num_gws = 0;
>>>>>>>     		}
>>>>>>>
>>>>>>> -		kfree(pqn->q->properties.cu_mask);
>>>>>>> -		pqn->q->properties.cu_mask = NULL;
>>>>>>>     		uninit_queue(pqn->q);
>>>>>>>     	}
>>>>>>>
>>>>>>> @@ -448,16 +446,16 @@ int pqm_set_cu_mask(struct
>>>>>> process_queue_manager *pqm, unsigned int qid,
>>>>>>>     		return -EFAULT;
>>>>>>>     	}
>>>>>>>
>>>>>>> -	/* Free the old CU mask memory if it is already allocated, then
>>>>>>> -	 * allocate memory for the new CU mask.
>>>>>>> -	 */
>>>>>>> -	kfree(pqn->q->properties.cu_mask);
>>>>>>> +	WARN_ON_ONCE(pqn->q->properties.cu_mask);
>>>>>>>
>>>>>>>     	pqn->q->properties.cu_mask_count = p->cu_mask_count;
>>>>>>>     	pqn->q->properties.cu_mask = p->cu_mask;
>>>>>>>
>>>>>>>     	retval = pqn->q->device->dqm->ops.update_queue(pqn->q-
>>> device->dqm,
>>>>>>>     							pqn->q);
>>>>>>> +
>>>>>>> +	pqn->q->properties.cu_mask = NULL;
>>>>>>> +
>>>>>> This won't work correctly. We need to save the cu_mask for later.
>>>>>> Otherwise the next time dqm->ops.update_queue is called, for
>>>>>> example in pqm_update_queue or pqm_set_gws, it will wipe out the CU
>>>>>> mask in the
>>>> MQD.
>>>>> Let's just return when meeting a null cu_mask in update_cu_mask() to
>>>>> avoid
>>>> that.
>>>>> Like following,
>>>>>
>>>>> static void update_cu_mask(struct mqd_manager *mm, void *mqd,
>>>>> 			   struct queue_properties *q)
>>>>> {
>>>>> 	struct v10_compute_mqd *m;
>>>>> 	uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */
>>>>>
>>>>> 	if (!q-> cu_mask || q->cu_mask_count == 0)
>>>>> 		return;
>>>>> 	......
>>>>> }
>>>>>
>>>>> Is this fine with you? Thanks!
>>>> I think that could work. I still don't like it. It leaves the CU mask
>>>> in the q-
>>>>> properties structure, but it's only ever used temporarily and
>>>>> doesn't need to be
>>>> persistent. I'd argue, in this case, the cu_mask shouldn't be in the
>>>> q->properties structure at all, but should be passed as an optional
>>>> parameter into the dqm-
>>>>> ops.update_queue call.
>>> The cu_mask is originally in q->properties structure. I didn't change that.
>>> What I want to do is keeping the cu_mask memory allocation and deallocation
>> just in kfd_ioctl_set_cu_mask.
>>> instead of everywhere.
>> You're not changing where it is stored. But you're changing it from something
>> persistent (while the queue exists) to something ephemeral (while the ioctl call is
>> executing in the kernel). I think that would justify removing the persistent pointer
> >from the q->properties structure.
>
> Mmm, actually it has been copied to mqd memory. It doesn't make too much sense
> to keep it persistent.

The current update_mqd implementations update the CU mask in the MQD 
from the one in the q->properties every time. So the persistent CU mask 
is currently needed. You're right that this may not be necessary. But 
relying only on the persistent CU mask in the MQD would be a significant 
change in the way the CU mask is managed that would require updating all 
the ASIC-specific update_mqd implementations. Feel free to propose a 
patch that makes all those changes cleanly (including removing the 
cu_mask from the q->properties structure).

However, that goes beyond a simple memory leak fix. I think it would be 
worth it if it makes the code and data structures cleaner.


>    
>>>> But I think a simpler fix would be to move the freeing of the CU mask
>>>> into uninit_queue. That would catch all cases where a queue gets
>>>> destroyed, including the process termination case.
>>> Yes, it' better to free queue related resource in one function.
>>> But we allocate it in kfd_ioctl_set_cu_mask and free it in pqm_set_cu_mask,
>> uninit_queue and so on.
>>
>> My proposal here is to only free it in uninit_queue (when the queue is
>> destroyed) or in pqm_set_cu_mask (when it is replaced by a different cu_mask).
> Anyway, I still think it's strange and error-pone to allocate it in kfd_ioctl_set_cu_mask
> and free it in uninit_queue and pqm_set_cu_mask.

It would not change the code complexity from what we have now. You're 
just moving the free call from pqm_destroy_queue into uninit_queue. For 
a small bug fix, that's a pretty good deal.

Regards,
   Felix


>
> Regards,
> Lang
>
>> Regards,
>>    Felix
>>
>>
>>> It seems strange and error-prone. Unless we allocate it in create queue.  As you
>> said, it is temporary.
>>> It's not worth keeping it until queue is destroyed. Thanks.
>>>
>>> Regards,
>>> Lang
>>>
>>>> Regards,
>>>>     Felix
>>>>
>>>>
>>>>> Regards,
>>>>> Lang
>>>>>
>>>>>> Regards,
>>>>>>      Felix
>>>>>>
>>>>>>
>>>>>>>     	if (retval != 0)
>>>>>>>     		return retval;
>>>>>>>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* RE: [PATCH] drm/amdkfd: fix a potential cu_mask memory leak
  2021-09-30  3:25             ` Felix Kuehling
@ 2021-09-30  3:48               ` Yu, Lang
  0 siblings, 0 replies; 9+ messages in thread
From: Yu, Lang @ 2021-09-30  3:48 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx; +Cc: Deucher, Alexander, Huang, Ray



>-----Original Message-----
>From: Kuehling, Felix <Felix.Kuehling@amd.com>
>Sent: Thursday, September 30, 2021 11:26 AM
>To: Yu, Lang <Lang.Yu@amd.com>; amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
><Ray.Huang@amd.com>
>Subject: Re: [PATCH] drm/amdkfd: fix a potential cu_mask memory leak
>
>On 2021-09-29 10:38 p.m., Yu, Lang wrote:
>>> -----Original Message-----
>>> From: Kuehling, Felix <Felix.Kuehling@amd.com>
>>> Sent: Thursday, September 30, 2021 10:28 AM
>>> To: Yu, Lang <Lang.Yu@amd.com>; amd-gfx@lists.freedesktop.org
>>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
>>> <Ray.Huang@amd.com>
>>> Subject: Re: [PATCH] drm/amdkfd: fix a potential cu_mask memory leak
>>>
>>> On 2021-09-29 10:23 p.m., Yu, Lang wrote:
>>>>> -----Original Message-----
>>>>> From: Kuehling, Felix <Felix.Kuehling@amd.com>
>>>>> Sent: Thursday, September 30, 2021 9:47 AM
>>>>> To: Yu, Lang <Lang.Yu@amd.com>; amd-gfx@lists.freedesktop.org
>>>>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang, Ray
>>>>> <Ray.Huang@amd.com>
>>>>> Subject: Re: [PATCH] drm/amdkfd: fix a potential cu_mask memory
>>>>> leak
>>>>>
>>>>> On 2021-09-29 7:32 p.m., Yu, Lang wrote:
>>>>>> [AMD Official Use Only]
>>>>>>
>>>>>>
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Kuehling, Felix <Felix.Kuehling@amd.com>
>>>>>>> Sent: Wednesday, September 29, 2021 11:25 PM
>>>>>>> To: Yu, Lang <Lang.Yu@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Huang,
>Ray
>>>>>>> <Ray.Huang@amd.com>
>>>>>>> Subject: Re: [PATCH] drm/amdkfd: fix a potential cu_mask memory
>>>>>>> leak
>>>>>>>
>>>>>>> Am 2021-09-29 um 4:22 a.m. schrieb Lang Yu:
>>>>>>>> If user doesn't explicitly call kfd_ioctl_destroy_queue to
>>>>>>>> destroy all created queues, when the kfd process is destroyed,
>some queues'
>>>>>>>> cu_mask memory are not freed.
>>>>>>>>
>>>>>>>> To avoid forgetting to free them in some places, free them
>>>>>>>> immediately after use.
>>>>>>>>
>>>>>>>> Signed-off-by: Lang Yu <lang.yu@amd.com>
>>>>>>>> ---
>>>>>>>>     drivers/gpu/drm/amd/amdkfd/kfd_chardev.c               |  8 ++++----
>>>>>>>>     drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c |
>10
>>>>>>>> ++++------
>>>>>>>>     2 files changed, 8 insertions(+), 10 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>>>>> index 4de907f3e66a..5c0e6dcf692a 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>>>>>>> @@ -451,8 +451,8 @@ static int kfd_ioctl_set_cu_mask(struct file
>>>>>>>> *filp, struct
>>>>>>> kfd_process *p,
>>>>>>>>     	retval = copy_from_user(properties.cu_mask, cu_mask_ptr,
>>>>>>> cu_mask_size);
>>>>>>>>     	if (retval) {
>>>>>>>>     		pr_debug("Could not copy CU mask from userspace");
>>>>>>>> -		kfree(properties.cu_mask);
>>>>>>>> -		return -EFAULT;
>>>>>>>> +		retval = -EFAULT;
>>>>>>>> +		goto out;
>>>>>>>>     	}
>>>>>>>>
>>>>>>>>     	mutex_lock(&p->mutex);
>>>>>>>> @@ -461,8 +461,8 @@ static int kfd_ioctl_set_cu_mask(struct file
>>>>>>>> *filp, struct kfd_process *p,
>>>>>>>>
>>>>>>>>     	mutex_unlock(&p->mutex);
>>>>>>>>
>>>>>>>> -	if (retval)
>>>>>>>> -		kfree(properties.cu_mask);
>>>>>>>> +out:
>>>>>>>> +	kfree(properties.cu_mask);
>>>>>>>>
>>>>>>>>     	return retval;
>>>>>>>>     }
>>>>>>>> diff --git
>>>>>>>> a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>>>>>> index 243dd1efcdbf..4c81d690f31a 100644
>>>>>>>> ---
>a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>>>>>> +++
>b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>>>>>>> @@ -394,8 +394,6 @@ int pqm_destroy_queue(struct
>>>>>>> process_queue_manager *pqm, unsigned int qid)
>>>>>>>>     			pdd->qpd.num_gws = 0;
>>>>>>>>     		}
>>>>>>>>
>>>>>>>> -		kfree(pqn->q->properties.cu_mask);
>>>>>>>> -		pqn->q->properties.cu_mask = NULL;
>>>>>>>>     		uninit_queue(pqn->q);
>>>>>>>>     	}
>>>>>>>>
>>>>>>>> @@ -448,16 +446,16 @@ int pqm_set_cu_mask(struct
>>>>>>> process_queue_manager *pqm, unsigned int qid,
>>>>>>>>     		return -EFAULT;
>>>>>>>>     	}
>>>>>>>>
>>>>>>>> -	/* Free the old CU mask memory if it is already allocated, then
>>>>>>>> -	 * allocate memory for the new CU mask.
>>>>>>>> -	 */
>>>>>>>> -	kfree(pqn->q->properties.cu_mask);
>>>>>>>> +	WARN_ON_ONCE(pqn->q->properties.cu_mask);
>>>>>>>>
>>>>>>>>     	pqn->q->properties.cu_mask_count = p->cu_mask_count;
>>>>>>>>     	pqn->q->properties.cu_mask = p->cu_mask;
>>>>>>>>
>>>>>>>>     	retval = pqn->q->device->dqm->ops.update_queue(pqn->q-
>>>> device->dqm,
>>>>>>>>     							pqn->q);
>>>>>>>> +
>>>>>>>> +	pqn->q->properties.cu_mask = NULL;
>>>>>>>> +
>>>>>>> This won't work correctly. We need to save the cu_mask for later.
>>>>>>> Otherwise the next time dqm->ops.update_queue is called, for
>>>>>>> example in pqm_update_queue or pqm_set_gws, it will wipe out the
>>>>>>> CU mask in the
>>>>> MQD.
>>>>>> Let's just return when meeting a null cu_mask in update_cu_mask()
>>>>>> to avoid
>>>>> that.
>>>>>> Like following,
>>>>>>
>>>>>> static void update_cu_mask(struct mqd_manager *mm, void *mqd,
>>>>>> 			   struct queue_properties *q)
>>>>>> {
>>>>>> 	struct v10_compute_mqd *m;
>>>>>> 	uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */
>>>>>>
>>>>>> 	if (!q-> cu_mask || q->cu_mask_count == 0)
>>>>>> 		return;
>>>>>> 	......
>>>>>> }
>>>>>>
>>>>>> Is this fine with you? Thanks!
>>>>> I think that could work. I still don't like it. It leaves the CU
>>>>> mask in the q-
>>>>>> properties structure, but it's only ever used temporarily and
>>>>>> doesn't need to be
>>>>> persistent. I'd argue, in this case, the cu_mask shouldn't be in
>>>>> the
>>>>> q->properties structure at all, but should be passed as an optional
>>>>> parameter into the dqm-
>>>>>> ops.update_queue call.
>>>> The cu_mask is originally in q->properties structure. I didn't change that.
>>>> What I want to do is keeping the cu_mask memory allocation and
>>>> deallocation
>>> just in kfd_ioctl_set_cu_mask.
>>>> instead of everywhere.
>>> You're not changing where it is stored. But you're changing it from
>>> something persistent (while the queue exists) to something ephemeral
>>> (while the ioctl call is executing in the kernel). I think that would
>>> justify removing the persistent pointer
>> >from the q->properties structure.
>>
>> Mmm, actually it has been copied to mqd memory. It doesn't make too
>> much sense to keep it persistent.
>
>The current update_mqd implementations update the CU mask in the MQD
>from the one in the q->properties every time. So the persistent CU mask is
>currently needed. 

The cu mask update actually happens when q->cu_mask_count != 0 in current update_cu_mask() implementation
(I think it should be q->cu_mask_count != 0 && q->cu_mask != NULL). So the fix may be enough.

You're right that this may not be necessary. But relying only
>on the persistent CU mask in the MQD would be a significant change in the
>way the CU mask is managed that would require updating all the ASIC-specific
>update_mqd implementations. Feel free to propose a patch that makes all
>those changes cleanly (including removing the cu_mask from the q-
>>properties structure).
>
>However, that goes beyond a simple memory leak fix. I think it would be
>worth it if it makes the code and data structures cleaner.

If you think it's better to propose a patch to make all those changes.
I will try to do it. I am just worry about there are many changes. Thanks!

Regards,
Lang

>>
>>>>> But I think a simpler fix would be to move the freeing of the CU mask
>>>>> into uninit_queue. That would catch all cases where a queue gets
>>>>> destroyed, including the process termination case.
>>>> Yes, it' better to free queue related resource in one function.
>>>> But we allocate it in kfd_ioctl_set_cu_mask and free it in
>pqm_set_cu_mask,
>>> uninit_queue and so on.
>>>
>>> My proposal here is to only free it in uninit_queue (when the queue is
>>> destroyed) or in pqm_set_cu_mask (when it is replaced by a different
>cu_mask).
>> Anyway, I still think it's strange and error-pone to allocate it in
>kfd_ioctl_set_cu_mask
>> and free it in uninit_queue and pqm_set_cu_mask.
>
>It would not change the code complexity from what we have now. You're
>just moving the free call from pqm_destroy_queue into uninit_queue. For
>a small bug fix, that's a pretty good deal.
>
>Regards,
>   Felix
>
>
>>
>> Regards,
>> Lang
>>
>>> Regards,
>>>    Felix
>>>
>>>
>>>> It seems strange and error-prone. Unless we allocate it in create queue.
>As you
>>> said, it is temporary.
>>>> It's not worth keeping it until queue is destroyed. Thanks.
>>>>
>>>> Regards,
>>>> Lang
>>>>
>>>>> Regards,
>>>>>     Felix
>>>>>
>>>>>
>>>>>> Regards,
>>>>>> Lang
>>>>>>
>>>>>>> Regards,
>>>>>>>      Felix
>>>>>>>
>>>>>>>
>>>>>>>>     	if (retval != 0)
>>>>>>>>     		return retval;
>>>>>>>>

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2021-09-30  3:48 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-09-29  8:22 [PATCH] drm/amdkfd: fix a potential cu_mask memory leak Lang Yu
2021-09-29 15:24 ` Felix Kuehling
2021-09-29 23:32   ` Yu, Lang
2021-09-30  1:47     ` Felix Kuehling
2021-09-30  2:23       ` Yu, Lang
2021-09-30  2:28         ` Felix Kuehling
2021-09-30  2:38           ` Yu, Lang
2021-09-30  3:25             ` Felix Kuehling
2021-09-30  3:48               ` Yu, Lang

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.