All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdkfd: explicitly create/destroy queue attributes under /sys
@ 2021-12-09  7:49 Xiaogang.Chen
  2021-12-09 18:40 ` Felix Kuehling
  0 siblings, 1 reply; 7+ messages in thread
From: Xiaogang.Chen @ 2021-12-09  7:49 UTC (permalink / raw)
  To: amd-gfx; +Cc: xiaogang.chen

From: Xiaogang Chen <xiaogang.chen@amd.com>

When application is about finish it destroys queues it has created by
an ioctl. Driver deletes queue entry(/sys/class/kfd/kfd/proc/pid/queues/queueid/)
which is directory including this queue all attributes. Low level kernel
code deletes all attributes under this directory. The lock from kernel is
on queue entry, not its attributes. At meantime another user space application
can read the attributes. There is possibility that the application can
hold/read the attributes while kernel is deleting the queue entry, cause
the application have invalid memory access, then killed by kernel.

Driver changes: explicitly create/destroy each attribute for each queue,
let kernel put lock on each attribute too.

Signed-off-by: Xiaogang Chen <xiaogang.chen@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  3 +++
 drivers/gpu/drm/amd/amdkfd/kfd_process.c | 33 +++++++-----------------
 2 files changed, 13 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 0c3f911e3bf4..045da300749e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -546,6 +546,9 @@ struct queue {
 
 	/* procfs */
 	struct kobject kobj;
+	struct attribute attr_guid;
+	struct attribute attr_size;
+	struct attribute attr_type;
 };
 
 enum KFD_MQD_TYPE {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 9158f9754a24..04a5638f9196 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -73,6 +73,8 @@ static void evict_process_worker(struct work_struct *work);
 static void restore_process_worker(struct work_struct *work);
 
 static void kfd_process_device_destroy_cwsr_dgpu(struct kfd_process_device *pdd);
+static void kfd_sysfs_create_file(struct kobject *kobj, struct attribute *attr,
+				char *name);
 
 struct kfd_procfs_tree {
 	struct kobject *kobj;
@@ -441,35 +443,12 @@ static ssize_t kfd_sysfs_counters_show(struct kobject *kobj,
 	return 0;
 }
 
-static struct attribute attr_queue_size = {
-	.name = "size",
-	.mode = KFD_SYSFS_FILE_MODE
-};
-
-static struct attribute attr_queue_type = {
-	.name = "type",
-	.mode = KFD_SYSFS_FILE_MODE
-};
-
-static struct attribute attr_queue_gpuid = {
-	.name = "gpuid",
-	.mode = KFD_SYSFS_FILE_MODE
-};
-
-static struct attribute *procfs_queue_attrs[] = {
-	&attr_queue_size,
-	&attr_queue_type,
-	&attr_queue_gpuid,
-	NULL
-};
-
 static const struct sysfs_ops procfs_queue_ops = {
 	.show = kfd_procfs_queue_show,
 };
 
 static struct kobj_type procfs_queue_type = {
 	.sysfs_ops = &procfs_queue_ops,
-	.default_attrs = procfs_queue_attrs,
 };
 
 static const struct sysfs_ops procfs_stats_ops = {
@@ -511,6 +490,10 @@ int kfd_procfs_add_queue(struct queue *q)
 		return ret;
 	}
 
+	kfd_sysfs_create_file(&q->kobj, &q->attr_guid, "guid");
+	kfd_sysfs_create_file(&q->kobj, &q->attr_size, "size");
+	kfd_sysfs_create_file(&q->kobj, &q->attr_type, "type");
+
 	return 0;
 }
 
@@ -655,6 +638,10 @@ void kfd_procfs_del_queue(struct queue *q)
 	if (!q)
 		return;
 
+	sysfs_remove_file(&q->kobj, &q->attr_guid);
+	sysfs_remove_file(&q->kobj, &q->attr_size);
+	sysfs_remove_file(&q->kobj, &q->attr_type);
+
 	kobject_del(&q->kobj);
 	kobject_put(&q->kobj);
 }
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] drm/amdkfd: explicitly create/destroy queue attributes under /sys
  2021-12-09  7:49 [PATCH] drm/amdkfd: explicitly create/destroy queue attributes under /sys Xiaogang.Chen
@ 2021-12-09 18:40 ` Felix Kuehling
  2021-12-09 22:14   ` Chen, Xiaogang
  0 siblings, 1 reply; 7+ messages in thread
From: Felix Kuehling @ 2021-12-09 18:40 UTC (permalink / raw)
  To: Xiaogang.Chen, amd-gfx

Am 2021-12-09 um 2:49 a.m. schrieb Xiaogang.Chen:
> From: Xiaogang Chen <xiaogang.chen@amd.com>
>
> When application is about finish it destroys queues it has created by
> an ioctl. Driver deletes queue entry(/sys/class/kfd/kfd/proc/pid/queues/queueid/)
> which is directory including this queue all attributes. Low level kernel
> code deletes all attributes under this directory. The lock from kernel is
> on queue entry, not its attributes. At meantime another user space application
> can read the attributes. There is possibility that the application can
> hold/read the attributes while kernel is deleting the queue entry, cause
> the application have invalid memory access, then killed by kernel.
>
> Driver changes: explicitly create/destroy each attribute for each queue,
> let kernel put lock on each attribute too.

Is this working around a bug in kobject_del? Shouldn't that code take
care of the necessary locking itself?

Regards,
  Felix


>
> Signed-off-by: Xiaogang Chen <xiaogang.chen@amd.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  3 +++
>  drivers/gpu/drm/amd/amdkfd/kfd_process.c | 33 +++++++-----------------
>  2 files changed, 13 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 0c3f911e3bf4..045da300749e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -546,6 +546,9 @@ struct queue {
>  
>  	/* procfs */
>  	struct kobject kobj;
> +	struct attribute attr_guid;
> +	struct attribute attr_size;
> +	struct attribute attr_type;
>  };
>  
>  enum KFD_MQD_TYPE {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 9158f9754a24..04a5638f9196 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -73,6 +73,8 @@ static void evict_process_worker(struct work_struct *work);
>  static void restore_process_worker(struct work_struct *work);
>  
>  static void kfd_process_device_destroy_cwsr_dgpu(struct kfd_process_device *pdd);
> +static void kfd_sysfs_create_file(struct kobject *kobj, struct attribute *attr,
> +				char *name);
>  
>  struct kfd_procfs_tree {
>  	struct kobject *kobj;
> @@ -441,35 +443,12 @@ static ssize_t kfd_sysfs_counters_show(struct kobject *kobj,
>  	return 0;
>  }
>  
> -static struct attribute attr_queue_size = {
> -	.name = "size",
> -	.mode = KFD_SYSFS_FILE_MODE
> -};
> -
> -static struct attribute attr_queue_type = {
> -	.name = "type",
> -	.mode = KFD_SYSFS_FILE_MODE
> -};
> -
> -static struct attribute attr_queue_gpuid = {
> -	.name = "gpuid",
> -	.mode = KFD_SYSFS_FILE_MODE
> -};
> -
> -static struct attribute *procfs_queue_attrs[] = {
> -	&attr_queue_size,
> -	&attr_queue_type,
> -	&attr_queue_gpuid,
> -	NULL
> -};
> -
>  static const struct sysfs_ops procfs_queue_ops = {
>  	.show = kfd_procfs_queue_show,
>  };
>  
>  static struct kobj_type procfs_queue_type = {
>  	.sysfs_ops = &procfs_queue_ops,
> -	.default_attrs = procfs_queue_attrs,
>  };
>  
>  static const struct sysfs_ops procfs_stats_ops = {
> @@ -511,6 +490,10 @@ int kfd_procfs_add_queue(struct queue *q)
>  		return ret;
>  	}
>  
> +	kfd_sysfs_create_file(&q->kobj, &q->attr_guid, "guid");
> +	kfd_sysfs_create_file(&q->kobj, &q->attr_size, "size");
> +	kfd_sysfs_create_file(&q->kobj, &q->attr_type, "type");
> +
>  	return 0;
>  }
>  
> @@ -655,6 +638,10 @@ void kfd_procfs_del_queue(struct queue *q)
>  	if (!q)
>  		return;
>  
> +	sysfs_remove_file(&q->kobj, &q->attr_guid);
> +	sysfs_remove_file(&q->kobj, &q->attr_size);
> +	sysfs_remove_file(&q->kobj, &q->attr_type);
> +
>  	kobject_del(&q->kobj);
>  	kobject_put(&q->kobj);
>  }

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] drm/amdkfd: explicitly create/destroy queue attributes under /sys
  2021-12-09 18:40 ` Felix Kuehling
@ 2021-12-09 22:14   ` Chen, Xiaogang
  2021-12-09 22:27     ` Felix Kuehling
  0 siblings, 1 reply; 7+ messages in thread
From: Chen, Xiaogang @ 2021-12-09 22:14 UTC (permalink / raw)
  To: Felix Kuehling, amd-gfx; +Cc: Xiaogang Chen


On 12/9/2021 12:40 PM, Felix Kuehling wrote:
> Am 2021-12-09 um 2:49 a.m. schrieb Xiaogang.Chen:
>> From: Xiaogang Chen <xiaogang.chen@amd.com>
>>
>> When application is about finish it destroys queues it has created by
>> an ioctl. Driver deletes queue entry(/sys/class/kfd/kfd/proc/pid/queues/queueid/)
>> which is directory including this queue all attributes. Low level kernel
>> code deletes all attributes under this directory. The lock from kernel is
>> on queue entry, not its attributes. At meantime another user space application
>> can read the attributes. There is possibility that the application can
>> hold/read the attributes while kernel is deleting the queue entry, cause
>> the application have invalid memory access, then killed by kernel.
>>
>> Driver changes: explicitly create/destroy each attribute for each queue,
>> let kernel put lock on each attribute too.
> Is this working around a bug in kobject_del? Shouldn't that code take
> care of the necessary locking itself?
>
> Regards,
>    Felix

The patches do not change kobject/kernfs that are too low level and would involve deeper discussions.
Made changes at higher level(kfd) instead.

Have tested with MSF tool overnight.

Thanks
Xiaogang

>
>> Signed-off-by: Xiaogang Chen <xiaogang.chen@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  3 +++
>>   drivers/gpu/drm/amd/amdkfd/kfd_process.c | 33 +++++++-----------------
>>   2 files changed, 13 insertions(+), 23 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> index 0c3f911e3bf4..045da300749e 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> @@ -546,6 +546,9 @@ struct queue {
>>   
>>   	/* procfs */
>>   	struct kobject kobj;
>> +	struct attribute attr_guid;
>> +	struct attribute attr_size;
>> +	struct attribute attr_type;
>>   };
>>   
>>   enum KFD_MQD_TYPE {
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> index 9158f9754a24..04a5638f9196 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> @@ -73,6 +73,8 @@ static void evict_process_worker(struct work_struct *work);
>>   static void restore_process_worker(struct work_struct *work);
>>   
>>   static void kfd_process_device_destroy_cwsr_dgpu(struct kfd_process_device *pdd);
>> +static void kfd_sysfs_create_file(struct kobject *kobj, struct attribute *attr,
>> +				char *name);
>>   
>>   struct kfd_procfs_tree {
>>   	struct kobject *kobj;
>> @@ -441,35 +443,12 @@ static ssize_t kfd_sysfs_counters_show(struct kobject *kobj,
>>   	return 0;
>>   }
>>   
>> -static struct attribute attr_queue_size = {
>> -	.name = "size",
>> -	.mode = KFD_SYSFS_FILE_MODE
>> -};
>> -
>> -static struct attribute attr_queue_type = {
>> -	.name = "type",
>> -	.mode = KFD_SYSFS_FILE_MODE
>> -};
>> -
>> -static struct attribute attr_queue_gpuid = {
>> -	.name = "gpuid",
>> -	.mode = KFD_SYSFS_FILE_MODE
>> -};
>> -
>> -static struct attribute *procfs_queue_attrs[] = {
>> -	&attr_queue_size,
>> -	&attr_queue_type,
>> -	&attr_queue_gpuid,
>> -	NULL
>> -};
>> -
>>   static const struct sysfs_ops procfs_queue_ops = {
>>   	.show = kfd_procfs_queue_show,
>>   };
>>   
>>   static struct kobj_type procfs_queue_type = {
>>   	.sysfs_ops = &procfs_queue_ops,
>> -	.default_attrs = procfs_queue_attrs,
>>   };
>>   
>>   static const struct sysfs_ops procfs_stats_ops = {
>> @@ -511,6 +490,10 @@ int kfd_procfs_add_queue(struct queue *q)
>>   		return ret;
>>   	}
>>   
>> +	kfd_sysfs_create_file(&q->kobj, &q->attr_guid, "guid");
>> +	kfd_sysfs_create_file(&q->kobj, &q->attr_size, "size");
>> +	kfd_sysfs_create_file(&q->kobj, &q->attr_type, "type");
>> +
>>   	return 0;
>>   }
>>   
>> @@ -655,6 +638,10 @@ void kfd_procfs_del_queue(struct queue *q)
>>   	if (!q)
>>   		return;
>>   
>> +	sysfs_remove_file(&q->kobj, &q->attr_guid);
>> +	sysfs_remove_file(&q->kobj, &q->attr_size);
>> +	sysfs_remove_file(&q->kobj, &q->attr_type);
>> +
>>   	kobject_del(&q->kobj);
>>   	kobject_put(&q->kobj);
>>   }

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] drm/amdkfd: explicitly create/destroy queue attributes under /sys
  2021-12-09 22:14   ` Chen, Xiaogang
@ 2021-12-09 22:27     ` Felix Kuehling
  2021-12-10  7:22       ` Christian König
  0 siblings, 1 reply; 7+ messages in thread
From: Felix Kuehling @ 2021-12-09 22:27 UTC (permalink / raw)
  To: Chen, Xiaogang, amd-gfx

Am 2021-12-09 um 5:14 p.m. schrieb Chen, Xiaogang:
>
> On 12/9/2021 12:40 PM, Felix Kuehling wrote:
>> Am 2021-12-09 um 2:49 a.m. schrieb Xiaogang.Chen:
>>> From: Xiaogang Chen <xiaogang.chen@amd.com>
>>>
>>> When application is about finish it destroys queues it has created by
>>> an ioctl. Driver deletes queue
>>> entry(/sys/class/kfd/kfd/proc/pid/queues/queueid/)
>>> which is directory including this queue all attributes. Low level
>>> kernel
>>> code deletes all attributes under this directory. The lock from
>>> kernel is
>>> on queue entry, not its attributes. At meantime another user space
>>> application
>>> can read the attributes. There is possibility that the application can
>>> hold/read the attributes while kernel is deleting the queue entry,
>>> cause
>>> the application have invalid memory access, then killed by kernel.
>>>
>>> Driver changes: explicitly create/destroy each attribute for each
>>> queue,
>>> let kernel put lock on each attribute too.
>> Is this working around a bug in kobject_del? Shouldn't that code take
>> care of the necessary locking itself?
>>
>> Regards,
>>    Felix
>
> The patches do not change kobject/kernfs that are too low level and
> would involve deeper discussions.
> Made changes at higher level(kfd) instead.
>
> Have tested with MSF tool overnight.

OK. I'm OK with your changes. The patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>

But I think we should let the kernfs folks know that there is a problem
anyway. It might save someone else a lot of time and headaches down the
line. Ideally we'd come up with a small reproducer (dummy driver and a
user mode tool (could just be a bash script)) that doesn't require
special AMD hardware and the whole ROCm stack.

Regards,
  Felix


>
> Thanks
> Xiaogang
>
>>
>>> Signed-off-by: Xiaogang Chen <xiaogang.chen@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  3 +++
>>>   drivers/gpu/drm/amd/amdkfd/kfd_process.c | 33
>>> +++++++-----------------
>>>   2 files changed, 13 insertions(+), 23 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> index 0c3f911e3bf4..045da300749e 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> @@ -546,6 +546,9 @@ struct queue {
>>>         /* procfs */
>>>       struct kobject kobj;
>>> +    struct attribute attr_guid;
>>> +    struct attribute attr_size;
>>> +    struct attribute attr_type;
>>>   };
>>>     enum KFD_MQD_TYPE {
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> index 9158f9754a24..04a5638f9196 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> @@ -73,6 +73,8 @@ static void evict_process_worker(struct
>>> work_struct *work);
>>>   static void restore_process_worker(struct work_struct *work);
>>>     static void kfd_process_device_destroy_cwsr_dgpu(struct
>>> kfd_process_device *pdd);
>>> +static void kfd_sysfs_create_file(struct kobject *kobj, struct
>>> attribute *attr,
>>> +                char *name);
>>>     struct kfd_procfs_tree {
>>>       struct kobject *kobj;
>>> @@ -441,35 +443,12 @@ static ssize_t kfd_sysfs_counters_show(struct
>>> kobject *kobj,
>>>       return 0;
>>>   }
>>>   -static struct attribute attr_queue_size = {
>>> -    .name = "size",
>>> -    .mode = KFD_SYSFS_FILE_MODE
>>> -};
>>> -
>>> -static struct attribute attr_queue_type = {
>>> -    .name = "type",
>>> -    .mode = KFD_SYSFS_FILE_MODE
>>> -};
>>> -
>>> -static struct attribute attr_queue_gpuid = {
>>> -    .name = "gpuid",
>>> -    .mode = KFD_SYSFS_FILE_MODE
>>> -};
>>> -
>>> -static struct attribute *procfs_queue_attrs[] = {
>>> -    &attr_queue_size,
>>> -    &attr_queue_type,
>>> -    &attr_queue_gpuid,
>>> -    NULL
>>> -};
>>> -
>>>   static const struct sysfs_ops procfs_queue_ops = {
>>>       .show = kfd_procfs_queue_show,
>>>   };
>>>     static struct kobj_type procfs_queue_type = {
>>>       .sysfs_ops = &procfs_queue_ops,
>>> -    .default_attrs = procfs_queue_attrs,
>>>   };
>>>     static const struct sysfs_ops procfs_stats_ops = {
>>> @@ -511,6 +490,10 @@ int kfd_procfs_add_queue(struct queue *q)
>>>           return ret;
>>>       }
>>>   +    kfd_sysfs_create_file(&q->kobj, &q->attr_guid, "guid");
>>> +    kfd_sysfs_create_file(&q->kobj, &q->attr_size, "size");
>>> +    kfd_sysfs_create_file(&q->kobj, &q->attr_type, "type");
>>> +
>>>       return 0;
>>>   }
>>>   @@ -655,6 +638,10 @@ void kfd_procfs_del_queue(struct queue *q)
>>>       if (!q)
>>>           return;
>>>   +    sysfs_remove_file(&q->kobj, &q->attr_guid);
>>> +    sysfs_remove_file(&q->kobj, &q->attr_size);
>>> +    sysfs_remove_file(&q->kobj, &q->attr_type);
>>> +
>>>       kobject_del(&q->kobj);
>>>       kobject_put(&q->kobj);
>>>   }

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] drm/amdkfd: explicitly create/destroy queue attributes under /sys
  2021-12-09 22:27     ` Felix Kuehling
@ 2021-12-10  7:22       ` Christian König
  2021-12-10 16:49         ` Felix Kuehling
  0 siblings, 1 reply; 7+ messages in thread
From: Christian König @ 2021-12-10  7:22 UTC (permalink / raw)
  To: Felix Kuehling, Chen, Xiaogang, amd-gfx

Am 09.12.21 um 23:27 schrieb Felix Kuehling:
> Am 2021-12-09 um 5:14 p.m. schrieb Chen, Xiaogang:
>> On 12/9/2021 12:40 PM, Felix Kuehling wrote:
>>> Am 2021-12-09 um 2:49 a.m. schrieb Xiaogang.Chen:
>>>> From: Xiaogang Chen <xiaogang.chen@amd.com>
>>>>
>>>> When application is about finish it destroys queues it has created by
>>>> an ioctl. Driver deletes queue
>>>> entry(/sys/class/kfd/kfd/proc/pid/queues/queueid/)
>>>> which is directory including this queue all attributes. Low level
>>>> kernel
>>>> code deletes all attributes under this directory. The lock from
>>>> kernel is
>>>> on queue entry, not its attributes. At meantime another user space
>>>> application
>>>> can read the attributes. There is possibility that the application can
>>>> hold/read the attributes while kernel is deleting the queue entry,
>>>> cause
>>>> the application have invalid memory access, then killed by kernel.
>>>>
>>>> Driver changes: explicitly create/destroy each attribute for each
>>>> queue,
>>>> let kernel put lock on each attribute too.
>>> Is this working around a bug in kobject_del? Shouldn't that code take
>>> care of the necessary locking itself?
>>>
>>> Regards,
>>>     Felix
>> The patches do not change kobject/kernfs that are too low level and
>> would involve deeper discussions.
>> Made changes at higher level(kfd) instead.
>>
>> Have tested with MSF tool overnight.
> OK. I'm OK with your changes. The patch is
>
> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
>
> But I think we should let the kernfs folks know that there is a problem
> anyway. It might save someone else a lot of time and headaches down the
> line. Ideally we'd come up with a small reproducer (dummy driver and a
> user mode tool (could just be a bash script)) that doesn't require
> special AMD hardware and the whole ROCm stack.

I think we could do this in the DKMS/release branches, but for upstream 
we should rather fix the underlying problem.

Additional to that this is explicitely what we should not do if I 
understood Greg correctly in previous discussions, but take that with a 
grain of salt since I'm not an expert on the topic.

Regards,
Christian.

>
> Regards,
>    Felix
>
>
>> Thanks
>> Xiaogang
>>
>>>> Signed-off-by: Xiaogang Chen <xiaogang.chen@amd.com>
>>>> ---
>>>>    drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  3 +++
>>>>    drivers/gpu/drm/amd/amdkfd/kfd_process.c | 33
>>>> +++++++-----------------
>>>>    2 files changed, 13 insertions(+), 23 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>> index 0c3f911e3bf4..045da300749e 100644
>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>> @@ -546,6 +546,9 @@ struct queue {
>>>>          /* procfs */
>>>>        struct kobject kobj;
>>>> +    struct attribute attr_guid;
>>>> +    struct attribute attr_size;
>>>> +    struct attribute attr_type;
>>>>    };
>>>>      enum KFD_MQD_TYPE {
>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>>> index 9158f9754a24..04a5638f9196 100644
>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>>> @@ -73,6 +73,8 @@ static void evict_process_worker(struct
>>>> work_struct *work);
>>>>    static void restore_process_worker(struct work_struct *work);
>>>>      static void kfd_process_device_destroy_cwsr_dgpu(struct
>>>> kfd_process_device *pdd);
>>>> +static void kfd_sysfs_create_file(struct kobject *kobj, struct
>>>> attribute *attr,
>>>> +                char *name);
>>>>      struct kfd_procfs_tree {
>>>>        struct kobject *kobj;
>>>> @@ -441,35 +443,12 @@ static ssize_t kfd_sysfs_counters_show(struct
>>>> kobject *kobj,
>>>>        return 0;
>>>>    }
>>>>    -static struct attribute attr_queue_size = {
>>>> -    .name = "size",
>>>> -    .mode = KFD_SYSFS_FILE_MODE
>>>> -};
>>>> -
>>>> -static struct attribute attr_queue_type = {
>>>> -    .name = "type",
>>>> -    .mode = KFD_SYSFS_FILE_MODE
>>>> -};
>>>> -
>>>> -static struct attribute attr_queue_gpuid = {
>>>> -    .name = "gpuid",
>>>> -    .mode = KFD_SYSFS_FILE_MODE
>>>> -};
>>>> -
>>>> -static struct attribute *procfs_queue_attrs[] = {
>>>> -    &attr_queue_size,
>>>> -    &attr_queue_type,
>>>> -    &attr_queue_gpuid,
>>>> -    NULL
>>>> -};
>>>> -
>>>>    static const struct sysfs_ops procfs_queue_ops = {
>>>>        .show = kfd_procfs_queue_show,
>>>>    };
>>>>      static struct kobj_type procfs_queue_type = {
>>>>        .sysfs_ops = &procfs_queue_ops,
>>>> -    .default_attrs = procfs_queue_attrs,
>>>>    };
>>>>      static const struct sysfs_ops procfs_stats_ops = {
>>>> @@ -511,6 +490,10 @@ int kfd_procfs_add_queue(struct queue *q)
>>>>            return ret;
>>>>        }
>>>>    +    kfd_sysfs_create_file(&q->kobj, &q->attr_guid, "guid");
>>>> +    kfd_sysfs_create_file(&q->kobj, &q->attr_size, "size");
>>>> +    kfd_sysfs_create_file(&q->kobj, &q->attr_type, "type");
>>>> +
>>>>        return 0;
>>>>    }
>>>>    @@ -655,6 +638,10 @@ void kfd_procfs_del_queue(struct queue *q)
>>>>        if (!q)
>>>>            return;
>>>>    +    sysfs_remove_file(&q->kobj, &q->attr_guid);
>>>> +    sysfs_remove_file(&q->kobj, &q->attr_size);
>>>> +    sysfs_remove_file(&q->kobj, &q->attr_type);
>>>> +
>>>>        kobject_del(&q->kobj);
>>>>        kobject_put(&q->kobj);
>>>>    }


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] drm/amdkfd: explicitly create/destroy queue attributes under /sys
  2021-12-10  7:22       ` Christian König
@ 2021-12-10 16:49         ` Felix Kuehling
  2021-12-10 17:08           ` Chen, Xiaogang
  0 siblings, 1 reply; 7+ messages in thread
From: Felix Kuehling @ 2021-12-10 16:49 UTC (permalink / raw)
  To: Christian König, Chen, Xiaogang, amd-gfx

On 2021-12-10 2:22 a.m., Christian König wrote:
> Am 09.12.21 um 23:27 schrieb Felix Kuehling:
>> Am 2021-12-09 um 5:14 p.m. schrieb Chen, Xiaogang:
>>> On 12/9/2021 12:40 PM, Felix Kuehling wrote:
>>>> Am 2021-12-09 um 2:49 a.m. schrieb Xiaogang.Chen:
>>>>> From: Xiaogang Chen <xiaogang.chen@amd.com>
>>>>>
>>>>> When application is about finish it destroys queues it has created by
>>>>> an ioctl. Driver deletes queue
>>>>> entry(/sys/class/kfd/kfd/proc/pid/queues/queueid/)
>>>>> which is directory including this queue all attributes. Low level
>>>>> kernel
>>>>> code deletes all attributes under this directory. The lock from
>>>>> kernel is
>>>>> on queue entry, not its attributes. At meantime another user space
>>>>> application
>>>>> can read the attributes. There is possibility that the application 
>>>>> can
>>>>> hold/read the attributes while kernel is deleting the queue entry,
>>>>> cause
>>>>> the application have invalid memory access, then killed by kernel.
>>>>>
>>>>> Driver changes: explicitly create/destroy each attribute for each
>>>>> queue,
>>>>> let kernel put lock on each attribute too.
>>>> Is this working around a bug in kobject_del? Shouldn't that code take
>>>> care of the necessary locking itself?
>>>>
>>>> Regards,
>>>>     Felix
>>> The patches do not change kobject/kernfs that are too low level and
>>> would involve deeper discussions.
>>> Made changes at higher level(kfd) instead.
>>>
>>> Have tested with MSF tool overnight.
>> OK. I'm OK with your changes. The patch is
>>
>> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>
>> But I think we should let the kernfs folks know that there is a problem
>> anyway. It might save someone else a lot of time and headaches down the
>> line. Ideally we'd come up with a small reproducer (dummy driver and a
>> user mode tool (could just be a bash script)) that doesn't require
>> special AMD hardware and the whole ROCm stack.
>
> I think we could do this in the DKMS/release branches, but for 
> upstream we should rather fix the underlying problem.

Sounds reasonable.


>
> Additional to that this is explicitely what we should not do if I 
> understood Greg correctly in previous discussions, but take that with 
> a grain of salt since I'm not an expert on the topic.

Sorry, I'm not following. What's the thing we should not do:

* make a simple reproducer?
* work around a bug somewhere else?

And what's the topic you discussed with Greg (KH, I presume)? Kernfs? 
Workarounds? Is there a record of those discussions for reference?

Thanks,
   Felix


>
> Regards,
> Christian.
>
>>
>> Regards,
>>    Felix
>>
>>
>>> Thanks
>>> Xiaogang
>>>
>>>>> Signed-off-by: Xiaogang Chen <xiaogang.chen@amd.com>
>>>>> ---
>>>>>    drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  3 +++
>>>>>    drivers/gpu/drm/amd/amdkfd/kfd_process.c | 33
>>>>> +++++++-----------------
>>>>>    2 files changed, 13 insertions(+), 23 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>> index 0c3f911e3bf4..045da300749e 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>> @@ -546,6 +546,9 @@ struct queue {
>>>>>          /* procfs */
>>>>>        struct kobject kobj;
>>>>> +    struct attribute attr_guid;
>>>>> +    struct attribute attr_size;
>>>>> +    struct attribute attr_type;
>>>>>    };
>>>>>      enum KFD_MQD_TYPE {
>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>>>> index 9158f9754a24..04a5638f9196 100644
>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>>>> @@ -73,6 +73,8 @@ static void evict_process_worker(struct
>>>>> work_struct *work);
>>>>>    static void restore_process_worker(struct work_struct *work);
>>>>>      static void kfd_process_device_destroy_cwsr_dgpu(struct
>>>>> kfd_process_device *pdd);
>>>>> +static void kfd_sysfs_create_file(struct kobject *kobj, struct
>>>>> attribute *attr,
>>>>> +                char *name);
>>>>>      struct kfd_procfs_tree {
>>>>>        struct kobject *kobj;
>>>>> @@ -441,35 +443,12 @@ static ssize_t kfd_sysfs_counters_show(struct
>>>>> kobject *kobj,
>>>>>        return 0;
>>>>>    }
>>>>>    -static struct attribute attr_queue_size = {
>>>>> -    .name = "size",
>>>>> -    .mode = KFD_SYSFS_FILE_MODE
>>>>> -};
>>>>> -
>>>>> -static struct attribute attr_queue_type = {
>>>>> -    .name = "type",
>>>>> -    .mode = KFD_SYSFS_FILE_MODE
>>>>> -};
>>>>> -
>>>>> -static struct attribute attr_queue_gpuid = {
>>>>> -    .name = "gpuid",
>>>>> -    .mode = KFD_SYSFS_FILE_MODE
>>>>> -};
>>>>> -
>>>>> -static struct attribute *procfs_queue_attrs[] = {
>>>>> -    &attr_queue_size,
>>>>> -    &attr_queue_type,
>>>>> -    &attr_queue_gpuid,
>>>>> -    NULL
>>>>> -};
>>>>> -
>>>>>    static const struct sysfs_ops procfs_queue_ops = {
>>>>>        .show = kfd_procfs_queue_show,
>>>>>    };
>>>>>      static struct kobj_type procfs_queue_type = {
>>>>>        .sysfs_ops = &procfs_queue_ops,
>>>>> -    .default_attrs = procfs_queue_attrs,
>>>>>    };
>>>>>      static const struct sysfs_ops procfs_stats_ops = {
>>>>> @@ -511,6 +490,10 @@ int kfd_procfs_add_queue(struct queue *q)
>>>>>            return ret;
>>>>>        }
>>>>>    +    kfd_sysfs_create_file(&q->kobj, &q->attr_guid, "guid");
>>>>> +    kfd_sysfs_create_file(&q->kobj, &q->attr_size, "size");
>>>>> +    kfd_sysfs_create_file(&q->kobj, &q->attr_type, "type");
>>>>> +
>>>>>        return 0;
>>>>>    }
>>>>>    @@ -655,6 +638,10 @@ void kfd_procfs_del_queue(struct queue *q)
>>>>>        if (!q)
>>>>>            return;
>>>>>    +    sysfs_remove_file(&q->kobj, &q->attr_guid);
>>>>> +    sysfs_remove_file(&q->kobj, &q->attr_size);
>>>>> +    sysfs_remove_file(&q->kobj, &q->attr_type);
>>>>> +
>>>>>        kobject_del(&q->kobj);
>>>>>        kobject_put(&q->kobj);
>>>>>    }
>

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] drm/amdkfd: explicitly create/destroy queue attributes under /sys
  2021-12-10 16:49         ` Felix Kuehling
@ 2021-12-10 17:08           ` Chen, Xiaogang
  0 siblings, 0 replies; 7+ messages in thread
From: Chen, Xiaogang @ 2021-12-10 17:08 UTC (permalink / raw)
  To: Felix Kuehling, Christian König, amd-gfx


On 12/10/2021 10:49 AM, Felix Kuehling wrote:
> On 2021-12-10 2:22 a.m., Christian König wrote:
>> Am 09.12.21 um 23:27 schrieb Felix Kuehling:
>>> Am 2021-12-09 um 5:14 p.m. schrieb Chen, Xiaogang:
>>>> On 12/9/2021 12:40 PM, Felix Kuehling wrote:
>>>>> Am 2021-12-09 um 2:49 a.m. schrieb Xiaogang.Chen:
>>>>>> From: Xiaogang Chen <xiaogang.chen@amd.com>
>>>>>>
>>>>>> When application is about finish it destroys queues it has 
>>>>>> created by
>>>>>> an ioctl. Driver deletes queue
>>>>>> entry(/sys/class/kfd/kfd/proc/pid/queues/queueid/)
>>>>>> which is directory including this queue all attributes. Low level
>>>>>> kernel
>>>>>> code deletes all attributes under this directory. The lock from
>>>>>> kernel is
>>>>>> on queue entry, not its attributes. At meantime another user space
>>>>>> application
>>>>>> can read the attributes. There is possibility that the 
>>>>>> application can
>>>>>> hold/read the attributes while kernel is deleting the queue entry,
>>>>>> cause
>>>>>> the application have invalid memory access, then killed by kernel.
>>>>>>
>>>>>> Driver changes: explicitly create/destroy each attribute for each
>>>>>> queue,
>>>>>> let kernel put lock on each attribute too.
>>>>> Is this working around a bug in kobject_del? Shouldn't that code take
>>>>> care of the necessary locking itself?
>>>>>
>>>>> Regards,
>>>>>     Felix
>>>> The patches do not change kobject/kernfs that are too low level and
>>>> would involve deeper discussions.
>>>> Made changes at higher level(kfd) instead.
>>>>
>>>> Have tested with MSF tool overnight.
>>> OK. I'm OK with your changes. The patch is
>>>
>>> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>>
>>> But I think we should let the kernfs folks know that there is a problem
>>> anyway. It might save someone else a lot of time and headaches down the
>>> line. Ideally we'd come up with a small reproducer (dummy driver and a
>>> user mode tool (could just be a bash script)) that doesn't require
>>> special AMD hardware and the whole ROCm stack.
>>
>> I think we could do this in the DKMS/release branches, but for 
>> upstream we should rather fix the underlying problem.
>
> Sounds reasonable.
>
Some customers(ex, MSF) use upstream based kernel, not our release. The 
changes do not affect low level code and should work even after kernfs 
got changed.

Changing too low level code would involve more discussions and delay.

>
>>
>> Additional to that this is explicitely what we should not do if I 
>> understood Greg correctly in previous discussions, but take that with 
>> a grain of salt since I'm not an expert on the topic.
>
> Sorry, I'm not following. What's the thing we should not do:
>
> * make a simple reproducer?
> * work around a bug somewhere else?
>
> And what's the topic you discussed with Greg (KH, I presume)? Kernfs? 
> Workarounds? Is there a record of those discussions for reference?
>
> Thanks,
>   Felix
>
I am not sure either what the rules should be in this case.

Thanks

Xiaogang

>
>>
>> Regards,
>> Christian.
>>
>>>
>>> Regards,
>>>    Felix
>>>
>>>
>>>> Thanks
>>>> Xiaogang
>>>>
>>>>>> Signed-off-by: Xiaogang Chen <xiaogang.chen@amd.com>
>>>>>> ---
>>>>>>    drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  3 +++
>>>>>>    drivers/gpu/drm/amd/amdkfd/kfd_process.c | 33
>>>>>> +++++++-----------------
>>>>>>    2 files changed, 13 insertions(+), 23 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>>> index 0c3f911e3bf4..045da300749e 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>>>> @@ -546,6 +546,9 @@ struct queue {
>>>>>>          /* procfs */
>>>>>>        struct kobject kobj;
>>>>>> +    struct attribute attr_guid;
>>>>>> +    struct attribute attr_size;
>>>>>> +    struct attribute attr_type;
>>>>>>    };
>>>>>>      enum KFD_MQD_TYPE {
>>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>>>>> index 9158f9754a24..04a5638f9196 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>>>>> @@ -73,6 +73,8 @@ static void evict_process_worker(struct
>>>>>> work_struct *work);
>>>>>>    static void restore_process_worker(struct work_struct *work);
>>>>>>      static void kfd_process_device_destroy_cwsr_dgpu(struct
>>>>>> kfd_process_device *pdd);
>>>>>> +static void kfd_sysfs_create_file(struct kobject *kobj, struct
>>>>>> attribute *attr,
>>>>>> +                char *name);
>>>>>>      struct kfd_procfs_tree {
>>>>>>        struct kobject *kobj;
>>>>>> @@ -441,35 +443,12 @@ static ssize_t kfd_sysfs_counters_show(struct
>>>>>> kobject *kobj,
>>>>>>        return 0;
>>>>>>    }
>>>>>>    -static struct attribute attr_queue_size = {
>>>>>> -    .name = "size",
>>>>>> -    .mode = KFD_SYSFS_FILE_MODE
>>>>>> -};
>>>>>> -
>>>>>> -static struct attribute attr_queue_type = {
>>>>>> -    .name = "type",
>>>>>> -    .mode = KFD_SYSFS_FILE_MODE
>>>>>> -};
>>>>>> -
>>>>>> -static struct attribute attr_queue_gpuid = {
>>>>>> -    .name = "gpuid",
>>>>>> -    .mode = KFD_SYSFS_FILE_MODE
>>>>>> -};
>>>>>> -
>>>>>> -static struct attribute *procfs_queue_attrs[] = {
>>>>>> -    &attr_queue_size,
>>>>>> -    &attr_queue_type,
>>>>>> -    &attr_queue_gpuid,
>>>>>> -    NULL
>>>>>> -};
>>>>>> -
>>>>>>    static const struct sysfs_ops procfs_queue_ops = {
>>>>>>        .show = kfd_procfs_queue_show,
>>>>>>    };
>>>>>>      static struct kobj_type procfs_queue_type = {
>>>>>>        .sysfs_ops = &procfs_queue_ops,
>>>>>> -    .default_attrs = procfs_queue_attrs,
>>>>>>    };
>>>>>>      static const struct sysfs_ops procfs_stats_ops = {
>>>>>> @@ -511,6 +490,10 @@ int kfd_procfs_add_queue(struct queue *q)
>>>>>>            return ret;
>>>>>>        }
>>>>>>    +    kfd_sysfs_create_file(&q->kobj, &q->attr_guid, "guid");
>>>>>> +    kfd_sysfs_create_file(&q->kobj, &q->attr_size, "size");
>>>>>> +    kfd_sysfs_create_file(&q->kobj, &q->attr_type, "type");
>>>>>> +
>>>>>>        return 0;
>>>>>>    }
>>>>>>    @@ -655,6 +638,10 @@ void kfd_procfs_del_queue(struct queue *q)
>>>>>>        if (!q)
>>>>>>            return;
>>>>>>    +    sysfs_remove_file(&q->kobj, &q->attr_guid);
>>>>>> +    sysfs_remove_file(&q->kobj, &q->attr_size);
>>>>>> +    sysfs_remove_file(&q->kobj, &q->attr_type);
>>>>>> +
>>>>>>        kobject_del(&q->kobj);
>>>>>>        kobject_put(&q->kobj);
>>>>>>    }
>>

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2021-12-10 17:08 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-12-09  7:49 [PATCH] drm/amdkfd: explicitly create/destroy queue attributes under /sys Xiaogang.Chen
2021-12-09 18:40 ` Felix Kuehling
2021-12-09 22:14   ` Chen, Xiaogang
2021-12-09 22:27     ` Felix Kuehling
2021-12-10  7:22       ` Christian König
2021-12-10 16:49         ` Felix Kuehling
2021-12-10 17:08           ` Chen, Xiaogang

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.