All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdkfd: force raven as "dgpu" path
@ 2020-08-07  8:25 Huang Rui
  2020-08-07 15:00 ` Felix Kuehling
  0 siblings, 1 reply; 4+ messages in thread
From: Huang Rui @ 2020-08-07  8:25 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix Kuehling, Huang Rui

We still have a few iommu issues which need to address, so force raven
as "dgpu" path for the moment.

Will enable IOMMUv2 since the issues are fixed.

Signed-off-by: Huang Rui <ray.huang@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_crat.c   | 6 ++++++
 drivers/gpu/drm/amd/amdkfd/kfd_device.c | 4 ++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
index 6a250f8fcfb8..66d9f7280fe8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
@@ -22,6 +22,7 @@
 
 #include <linux/pci.h>
 #include <linux/acpi.h>
+#include <asm/processor.h>
 #include "kfd_crat.h"
 #include "kfd_priv.h"
 #include "kfd_topology.h"
@@ -781,6 +782,11 @@ int kfd_create_crat_image_acpi(void **crat_image, size_t *size)
 		return -ENODATA;
 	}
 
+	/* Raven series will go with the fallback path to use virtual CRAT */
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
+	    boot_cpu_data.x86 == 0x17)
+		return -ENODATA;
+
 	pcrat_image = kmemdup(crat_table, crat_table->length, GFP_KERNEL);
 	if (!pcrat_image)
 		return -ENOMEM;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index d5e790f046b4..93179c928371 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -116,6 +116,7 @@ static const struct kfd_device_info carrizo_device_info = {
 	.num_xgmi_sdma_engines = 0,
 	.num_sdma_queues_per_engine = 2,
 };
+#endif
 
 static const struct kfd_device_info raven_device_info = {
 	.asic_family = CHIP_RAVEN,
@@ -128,13 +129,12 @@ static const struct kfd_device_info raven_device_info = {
 	.num_of_watch_points = 4,
 	.mqd_size_aligned = MQD_SIZE_ALIGNED,
 	.supports_cwsr = true,
-	.needs_iommu_device = true,
+	.needs_iommu_device = false,
 	.needs_pci_atomics = true,
 	.num_sdma_engines = 1,
 	.num_xgmi_sdma_engines = 0,
 	.num_sdma_queues_per_engine = 2,
 };
-#endif
 
 static const struct kfd_device_info hawaii_device_info = {
 	.asic_family = CHIP_HAWAII,
-- 
2.25.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH] drm/amdkfd: force raven as "dgpu" path
  2020-08-07  8:25 [PATCH] drm/amdkfd: force raven as "dgpu" path Huang Rui
@ 2020-08-07 15:00 ` Felix Kuehling
  2020-08-11 11:45   ` Huang Rui
  0 siblings, 1 reply; 4+ messages in thread
From: Felix Kuehling @ 2020-08-07 15:00 UTC (permalink / raw)
  To: Huang Rui, amd-gfx

Am 2020-08-07 um 4:25 a.m. schrieb Huang Rui:
> We still have a few iommu issues which need to address, so force raven
> as "dgpu" path for the moment.
>
> Will enable IOMMUv2 since the issues are fixed.

Do you mean "_when_ the issues are fixed"?

The current iommuv2 troubles aside, I think this change breaks existing
user mode. The existing Thunk is not prepared to see Raven as a dGPU. So
this is not something we want to do in an upstream Linux kernel change.

I suggest using the ignore_crat module parameter for the workaround
instead. You may need to duplicate the raven_device_info and pick the
right one depending on whether it is a dGPU or an APU. The only
difference would be the need_iommu_device option. If ignore_crat is set,
you can support Raven as a dGPU and require a corresponding Thunk change
that conditionally support Raven as a dGPU.

I think such a change would also be the right direction for supporting
Raven more universally in the future. It can be extended to
conditionally treat Raven as a dGPU automatically in some situations:

  * broken or missing CRAT table
  * IOMMUv2 disabled

Those are all situations where the current driver is broken anyway (and
always has been), so it would not be a kernel change that breaks
existing user mode.

In addition the Thunk could be changed to downgrade a Raven APU to dGPU
(by splitting the APU node into a separate CPU and dGPU node) if other
dGPUs are present in the systems to disable all the APU-specific code
paths and allow all the GPUs to work together seamlessly with SVM.

Regards,
  Felix


> Signed-off-by: Huang Rui <ray.huang@amd.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_crat.c   | 6 ++++++
>  drivers/gpu/drm/amd/amdkfd/kfd_device.c | 4 ++--
>  2 files changed, 8 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> index 6a250f8fcfb8..66d9f7280fe8 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> @@ -22,6 +22,7 @@
>  
>  #include <linux/pci.h>
>  #include <linux/acpi.h>
> +#include <asm/processor.h>
>  #include "kfd_crat.h"
>  #include "kfd_priv.h"
>  #include "kfd_topology.h"
> @@ -781,6 +782,11 @@ int kfd_create_crat_image_acpi(void **crat_image, size_t *size)
>  		return -ENODATA;
>  	}
>  
> +	/* Raven series will go with the fallback path to use virtual CRAT */
> +	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
> +	    boot_cpu_data.x86 == 0x17)
> +		return -ENODATA;
> +
>  	pcrat_image = kmemdup(crat_table, crat_table->length, GFP_KERNEL);
>  	if (!pcrat_image)
>  		return -ENOMEM;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index d5e790f046b4..93179c928371 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -116,6 +116,7 @@ static const struct kfd_device_info carrizo_device_info = {
>  	.num_xgmi_sdma_engines = 0,
>  	.num_sdma_queues_per_engine = 2,
>  };
> +#endif
>  
>  static const struct kfd_device_info raven_device_info = {
>  	.asic_family = CHIP_RAVEN,
> @@ -128,13 +129,12 @@ static const struct kfd_device_info raven_device_info = {
>  	.num_of_watch_points = 4,
>  	.mqd_size_aligned = MQD_SIZE_ALIGNED,
>  	.supports_cwsr = true,
> -	.needs_iommu_device = true,
> +	.needs_iommu_device = false,
>  	.needs_pci_atomics = true,
>  	.num_sdma_engines = 1,
>  	.num_xgmi_sdma_engines = 0,
>  	.num_sdma_queues_per_engine = 2,
>  };
> -#endif
>  
>  static const struct kfd_device_info hawaii_device_info = {
>  	.asic_family = CHIP_HAWAII,
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] drm/amdkfd: force raven as "dgpu" path
  2020-08-07 15:00 ` Felix Kuehling
@ 2020-08-11 11:45   ` Huang Rui
  2020-08-11 13:08     ` Felix Kuehling
  0 siblings, 1 reply; 4+ messages in thread
From: Huang Rui @ 2020-08-11 11:45 UTC (permalink / raw)
  To: Kuehling, Felix; +Cc: amd-gfx

On Fri, Aug 07, 2020 at 11:00:38PM +0800, Kuehling, Felix wrote:
> Am 2020-08-07 um 4:25 a.m. schrieb Huang Rui:
> > We still have a few iommu issues which need to address, so force raven
> > as "dgpu" path for the moment.
> >
> > Will enable IOMMUv2 since the issues are fixed.
> 
> Do you mean "_when_ the issues are fixed"?

Yes.
 
> The current iommuv2 troubles aside, I think this change breaks existing
> user mode. The existing Thunk is not prepared to see Raven as a dGPU. So
> this is not something we want to do in an upstream Linux kernel change.

Do you mean it may break the thunk without setting "is_dgpu" flag in the
hsa_gfxip_table for raven?

> 
> I suggest using the ignore_crat module parameter for the workaround
> instead. You may need to duplicate the raven_device_info and pick the
> right one depending on whether it is a dGPU or an APU. The only
> difference would be the need_iommu_device option. If ignore_crat is set,
> you can support Raven as a dGPU and require a corresponding Thunk change
> that conditionally support Raven as a dGPU.
> 
> I think such a change would also be the right direction for supporting
> Raven more universally in the future. It can be extended to
> conditionally treat Raven as a dGPU automatically in some situations:
> 
>   * broken or missing CRAT table
>   * IOMMUv2 disabled
> 
> Those are all situations where the current driver is broken anyway (and
> always has been), so it would not be a kernel change that breaks
> existing user mode.

OK, I think I understand it. We should use a parameter/flag such as
ignore_crat or something else to inform the user mode which path we should
go, treat it as dgpu or apu. Then thunk can detect the flag from kernel to
know how to handle the case. Am I right?

> 
> In addition the Thunk could be changed to downgrade a Raven APU to dGPU
> (by splitting the APU node into a separate CPU and dGPU node) if other
> dGPUs are present in the systems to disable all the APU-specific code
> paths and allow all the GPUs to work together seamlessly with SVM.

Thanks! Let me look at the thunk again.

For now, based on your comments, I would like to update patch as:

- Modify the ignore_crat parameter as tristate: "use", "ignore", and
  "auto". Usually, by default is "auto = use", but in some special case
  such as iommu v2 broken or no iommu v2 support yet (Renoir), we have to
  set "auto = ignore". Then treat it as "dgpu". And if CRAT table is
  broken/missing, we will fall back to treat it as "dgpu" as well.

What do you think of it?

Thanks,
Ray

> 
> Regards,
>   Felix
> 
> 
> > Signed-off-by: Huang Rui <ray.huang@amd.com>
> > ---
> >  drivers/gpu/drm/amd/amdkfd/kfd_crat.c   | 6 ++++++
> >  drivers/gpu/drm/amd/amdkfd/kfd_device.c | 4 ++--
> >  2 files changed, 8 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> > index 6a250f8fcfb8..66d9f7280fe8 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> > @@ -22,6 +22,7 @@
> >  
> >  #include <linux/pci.h>
> >  #include <linux/acpi.h>
> > +#include <asm/processor.h>
> >  #include "kfd_crat.h"
> >  #include "kfd_priv.h"
> >  #include "kfd_topology.h"
> > @@ -781,6 +782,11 @@ int kfd_create_crat_image_acpi(void **crat_image, size_t *size)
> >  		return -ENODATA;
> >  	}
> >  
> > +	/* Raven series will go with the fallback path to use virtual CRAT */
> > +	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
> > +	    boot_cpu_data.x86 == 0x17)
> > +		return -ENODATA;
> > +
> >  	pcrat_image = kmemdup(crat_table, crat_table->length, GFP_KERNEL);
> >  	if (!pcrat_image)
> >  		return -ENOMEM;
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> > index d5e790f046b4..93179c928371 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> > @@ -116,6 +116,7 @@ static const struct kfd_device_info carrizo_device_info = {
> >  	.num_xgmi_sdma_engines = 0,
> >  	.num_sdma_queues_per_engine = 2,
> >  };
> > +#endif
> >  
> >  static const struct kfd_device_info raven_device_info = {
> >  	.asic_family = CHIP_RAVEN,
> > @@ -128,13 +129,12 @@ static const struct kfd_device_info raven_device_info = {
> >  	.num_of_watch_points = 4,
> >  	.mqd_size_aligned = MQD_SIZE_ALIGNED,
> >  	.supports_cwsr = true,
> > -	.needs_iommu_device = true,
> > +	.needs_iommu_device = false,
> >  	.needs_pci_atomics = true,
> >  	.num_sdma_engines = 1,
> >  	.num_xgmi_sdma_engines = 0,
> >  	.num_sdma_queues_per_engine = 2,
> >  };
> > -#endif
> >  
> >  static const struct kfd_device_info hawaii_device_info = {
> >  	.asic_family = CHIP_HAWAII,
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] drm/amdkfd: force raven as "dgpu" path
  2020-08-11 11:45   ` Huang Rui
@ 2020-08-11 13:08     ` Felix Kuehling
  0 siblings, 0 replies; 4+ messages in thread
From: Felix Kuehling @ 2020-08-11 13:08 UTC (permalink / raw)
  To: Huang Rui; +Cc: amd-gfx


Am 2020-08-11 um 7:45 a.m. schrieb Huang Rui:
> On Fri, Aug 07, 2020 at 11:00:38PM +0800, Kuehling, Felix wrote:
>> Am 2020-08-07 um 4:25 a.m. schrieb Huang Rui:
>>> We still have a few iommu issues which need to address, so force raven
>>> as "dgpu" path for the moment.
>>>
>>> Will enable IOMMUv2 since the issues are fixed.
>> Do you mean "_when_ the issues are fixed"?
> Yes.
>  
>> The current iommuv2 troubles aside, I think this change breaks existing
>> user mode. The existing Thunk is not prepared to see Raven as a dGPU. So
>> this is not something we want to do in an upstream Linux kernel change.
> Do you mean it may break the thunk without setting "is_dgpu" flag in the
> hsa_gfxip_table for raven?

Exactly. Try running your modified KFD against an unchanged Thunk. If it
fails, you cannot make that kernel change.


>
>> I suggest using the ignore_crat module parameter for the workaround
>> instead. You may need to duplicate the raven_device_info and pick the
>> right one depending on whether it is a dGPU or an APU. The only
>> difference would be the need_iommu_device option. If ignore_crat is set,
>> you can support Raven as a dGPU and require a corresponding Thunk change
>> that conditionally support Raven as a dGPU.
>>
>> I think such a change would also be the right direction for supporting
>> Raven more universally in the future. It can be extended to
>> conditionally treat Raven as a dGPU automatically in some situations:
>>
>>   * broken or missing CRAT table
>>   * IOMMUv2 disabled
>>
>> Those are all situations where the current driver is broken anyway (and
>> always has been), so it would not be a kernel change that breaks
>> existing user mode.
> OK, I think I understand it. We should use a parameter/flag such as
> ignore_crat or something else to inform the user mode which path we should
> go, treat it as dgpu or apu. Then thunk can detect the flag from kernel to
> know how to handle the case. Am I right?

User mode should not look at the ignore_crat parameter. It is not part
of the user-kernel API. The kernel parameter is to control kernel
behaviour. The point is, that the default behaviour should not change,
because that would break existing user mode.

A fixed user mode can use existing topology information to figure out
which way to go. If the CUs are in the CPU node, it's an APU. Otherwise
it's a dGPU.


>
>> In addition the Thunk could be changed to downgrade a Raven APU to dGPU
>> (by splitting the APU node into a separate CPU and dGPU node) if other
>> dGPUs are present in the systems to disable all the APU-specific code
>> paths and allow all the GPUs to work together seamlessly with SVM.
> Thanks! Let me look at the thunk again.
>
> For now, based on your comments, I would like to update patch as:
>
> - Modify the ignore_crat parameter as tristate: "use", "ignore", and
>   "auto". Usually, by default is "auto = use", but in some special case
>   such as iommu v2 broken or no iommu v2 support yet (Renoir), we have to
>   set "auto = ignore". Then treat it as "dgpu". And if CRAT table is
>   broken/missing, we will fall back to treat it as "dgpu" as well.
>
> What do you think of it?

If the CRAT is broken/missing or IOMMUv2 is not enabled, "use" is not a
very useful setting. I think you only need two states: "auto" and
"ignore". That's pretty much what we have now. Except we need more
conditions to not use the CRAT in the "auto" case.

Regards,
  Felix


>
> Thanks,
> Ray
>
>> Regards,
>>   Felix
>>
>>
>>> Signed-off-by: Huang Rui <ray.huang@amd.com>
>>> ---
>>>  drivers/gpu/drm/amd/amdkfd/kfd_crat.c   | 6 ++++++
>>>  drivers/gpu/drm/amd/amdkfd/kfd_device.c | 4 ++--
>>>  2 files changed, 8 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
>>> index 6a250f8fcfb8..66d9f7280fe8 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
>>> @@ -22,6 +22,7 @@
>>>  
>>>  #include <linux/pci.h>
>>>  #include <linux/acpi.h>
>>> +#include <asm/processor.h>
>>>  #include "kfd_crat.h"
>>>  #include "kfd_priv.h"
>>>  #include "kfd_topology.h"
>>> @@ -781,6 +782,11 @@ int kfd_create_crat_image_acpi(void **crat_image, size_t *size)
>>>  		return -ENODATA;
>>>  	}
>>>  
>>> +	/* Raven series will go with the fallback path to use virtual CRAT */
>>> +	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
>>> +	    boot_cpu_data.x86 == 0x17)
>>> +		return -ENODATA;
>>> +
>>>  	pcrat_image = kmemdup(crat_table, crat_table->length, GFP_KERNEL);
>>>  	if (!pcrat_image)
>>>  		return -ENOMEM;
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> index d5e790f046b4..93179c928371 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> @@ -116,6 +116,7 @@ static const struct kfd_device_info carrizo_device_info = {
>>>  	.num_xgmi_sdma_engines = 0,
>>>  	.num_sdma_queues_per_engine = 2,
>>>  };
>>> +#endif
>>>  
>>>  static const struct kfd_device_info raven_device_info = {
>>>  	.asic_family = CHIP_RAVEN,
>>> @@ -128,13 +129,12 @@ static const struct kfd_device_info raven_device_info = {
>>>  	.num_of_watch_points = 4,
>>>  	.mqd_size_aligned = MQD_SIZE_ALIGNED,
>>>  	.supports_cwsr = true,
>>> -	.needs_iommu_device = true,
>>> +	.needs_iommu_device = false,
>>>  	.needs_pci_atomics = true,
>>>  	.num_sdma_engines = 1,
>>>  	.num_xgmi_sdma_engines = 0,
>>>  	.num_sdma_queues_per_engine = 2,
>>>  };
>>> -#endif
>>>  
>>>  static const struct kfd_device_info hawaii_device_info = {
>>>  	.asic_family = CHIP_HAWAII,
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2020-08-11 13:08 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-08-07  8:25 [PATCH] drm/amdkfd: force raven as "dgpu" path Huang Rui
2020-08-07 15:00 ` Felix Kuehling
2020-08-11 11:45   ` Huang Rui
2020-08-11 13:08     ` Felix Kuehling

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.