All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu:remove debugfs file in amdgpu_device_finish
@ 2017-11-06  9:20 Gary Sun
       [not found] ` <1509960040-17849-1-git-send-email-Gary.Sun-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Gary Sun @ 2017-11-06  9:20 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Gary Sun, Christian.Koenig-5C7GfCeVMHo

remove debugfs file in amdgpu_device_finish

Signed-off-by: Gary Sun <Gary.Sun@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |    1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   18 ++++++++++++++++++
 2 files changed, 19 insertions(+), 0 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 4f919d3..6cfcb5f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1250,6 +1250,7 @@ struct amdgpu_debugfs {
 int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
 			     const struct drm_info_list *files,
 			     unsigned nfiles);
+int amdgpu_debugfs_cleanup_files(struct amdgpu_device *adev);
 int amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
 
 #if defined(CONFIG_DEBUG_FS)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7b7439f..ee800ab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2520,6 +2520,7 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
 	amdgpu_doorbell_fini(adev);
 	amdgpu_pm_sysfs_fini(adev);
 	amdgpu_debugfs_regs_cleanup(adev);
+	amdgpu_debugfs_cleanup_files(adev);
 }
 
 
@@ -3304,6 +3305,23 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
 	return 0;
 }
 
+int amdgpu_debugfs_cleanup_files(struct amdgpu_device *adev)
+{
+	unsigned int i;
+
+	for (i = 0; i < adev->debugfs_count; i++) {
+#if defined(CONFIG_DEBUG_FS)
+		drm_debugfs_remove_files(adev->debugfs[i].files,
+				adev->debugfs[i].num_files,
+				adev->ddev->primary);
+#endif
+		adev->debugfs[i].files = NUL;
+		adev->debugfs[i].num_files = 0;
+	}
+	adev->debugfs_count = 0;
+	return 0;
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 static ssize_t amdgpu_debugfs_regs_read(struct file *f, char __user *buf,
-- 
1.7.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/amdgpu:remove debugfs file in amdgpu_device_finish
       [not found] ` <1509960040-17849-1-git-send-email-Gary.Sun-5C7GfCeVMHo@public.gmane.org>
@ 2017-11-06  9:26   ` Christian König
       [not found]     ` <9eea88e9-886b-c35b-0fe7-9e7fbe243ba5-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Christian König @ 2017-11-06  9:26 UTC (permalink / raw)
  To: Gary Sun, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 06.11.2017 um 10:20 schrieb Gary Sun:
> remove debugfs file in amdgpu_device_finish

NAK, the debugfs files are removed automatically by drm_debugfs_cleanup().

So that patch is unnecessary.

Regards,
Christian.

>
> Signed-off-by: Gary Sun <Gary.Sun@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |    1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   18 ++++++++++++++++++
>   2 files changed, 19 insertions(+), 0 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 4f919d3..6cfcb5f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1250,6 +1250,7 @@ struct amdgpu_debugfs {
>   int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>   			     const struct drm_info_list *files,
>   			     unsigned nfiles);
> +int amdgpu_debugfs_cleanup_files(struct amdgpu_device *adev);
>   int amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
>   
>   #if defined(CONFIG_DEBUG_FS)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 7b7439f..ee800ab 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2520,6 +2520,7 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
>   	amdgpu_doorbell_fini(adev);
>   	amdgpu_pm_sysfs_fini(adev);
>   	amdgpu_debugfs_regs_cleanup(adev);
> +	amdgpu_debugfs_cleanup_files(adev);
>   }
>   
>   
> @@ -3304,6 +3305,23 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>   	return 0;
>   }
>   
> +int amdgpu_debugfs_cleanup_files(struct amdgpu_device *adev)
> +{
> +	unsigned int i;
> +
> +	for (i = 0; i < adev->debugfs_count; i++) {
> +#if defined(CONFIG_DEBUG_FS)
> +		drm_debugfs_remove_files(adev->debugfs[i].files,
> +				adev->debugfs[i].num_files,
> +				adev->ddev->primary);
> +#endif
> +		adev->debugfs[i].files = NUL;
> +		adev->debugfs[i].num_files = 0;
> +	}
> +	adev->debugfs_count = 0;
> +	return 0;
> +}
> +
>   #if defined(CONFIG_DEBUG_FS)
>   
>   static ssize_t amdgpu_debugfs_regs_read(struct file *f, char __user *buf,


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* RE: [PATCH] drm/amdgpu:remove debugfs file in amdgpu_device_finish
       [not found]     ` <9eea88e9-886b-c35b-0fe7-9e7fbe243ba5-5C7GfCeVMHo@public.gmane.org>
@ 2017-11-07  7:23       ` Sun, Gary
       [not found]         ` <DM5PR12MB15311826E7AD3AC5ADDF434896510-2J9CzHegvk/mU37preBipQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Sun, Gary @ 2017-11-07  7:23 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Hi Christian,

The patch is for driver re- initialize  feature, not for driver exit or rmmod.  When the driver initialize failed at some point, the re- initialize  feature will do some little clean and then try to initialize driver again, then it will re-register some registered debugfs , so it will fail.  

Regards,
Gary


-----Original Message-----
From: Koenig, Christian 
Sent: Monday, November 06, 2017 5:26 PM
To: Sun, Gary <Gary.Sun@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu:remove debugfs file in amdgpu_device_finish

Am 06.11.2017 um 10:20 schrieb Gary Sun:
> remove debugfs file in amdgpu_device_finish

NAK, the debugfs files are removed automatically by drm_debugfs_cleanup().

So that patch is unnecessary.

Regards,
Christian.

>
> Signed-off-by: Gary Sun <Gary.Sun@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |    1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   18 ++++++++++++++++++
>   2 files changed, 19 insertions(+), 0 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 4f919d3..6cfcb5f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1250,6 +1250,7 @@ struct amdgpu_debugfs {
>   int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>   			     const struct drm_info_list *files,
>   			     unsigned nfiles);
> +int amdgpu_debugfs_cleanup_files(struct amdgpu_device *adev);
>   int amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
>   
>   #if defined(CONFIG_DEBUG_FS)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 7b7439f..ee800ab 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2520,6 +2520,7 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
>   	amdgpu_doorbell_fini(adev);
>   	amdgpu_pm_sysfs_fini(adev);
>   	amdgpu_debugfs_regs_cleanup(adev);
> +	amdgpu_debugfs_cleanup_files(adev);
>   }
>   
>   
> @@ -3304,6 +3305,23 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>   	return 0;
>   }
>   
> +int amdgpu_debugfs_cleanup_files(struct amdgpu_device *adev) {
> +	unsigned int i;
> +
> +	for (i = 0; i < adev->debugfs_count; i++) { #if 
> +defined(CONFIG_DEBUG_FS)
> +		drm_debugfs_remove_files(adev->debugfs[i].files,
> +				adev->debugfs[i].num_files,
> +				adev->ddev->primary);
> +#endif
> +		adev->debugfs[i].files = NUL;
> +		adev->debugfs[i].num_files = 0;
> +	}
> +	adev->debugfs_count = 0;
> +	return 0;
> +}
> +
>   #if defined(CONFIG_DEBUG_FS)
>   
>   static ssize_t amdgpu_debugfs_regs_read(struct file *f, char __user 
> *buf,


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/amdgpu:remove debugfs file in amdgpu_device_finish
       [not found]         ` <DM5PR12MB15311826E7AD3AC5ADDF434896510-2J9CzHegvk/mU37preBipQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2017-11-07  7:47           ` Christian König
       [not found]             ` <66075b81-61bb-7897-cd7c-792548153748-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Christian König @ 2017-11-07  7:47 UTC (permalink / raw)
  To: Sun, Gary, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Hi Gary,

not sure what driver re-initialize feature you are talking about, but 
the last time I tried to re-initialize the driver it deadlocks in the 
modeset code because of some DC problem.

It's probably a good idea to fix that first, but in general please 
explain further what are you working on.

Regards,
Christian.

Am 07.11.2017 um 08:23 schrieb Sun, Gary:
> Hi Christian,
>
> The patch is for driver re- initialize  feature, not for driver exit or rmmod.  When the driver initialize failed at some point, the re- initialize  feature will do some little clean and then try to initialize driver again, then it will re-register some registered debugfs , so it will fail.
>
> Regards,
> Gary
>
>
> -----Original Message-----
> From: Koenig, Christian
> Sent: Monday, November 06, 2017 5:26 PM
> To: Sun, Gary <Gary.Sun@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu:remove debugfs file in amdgpu_device_finish
>
> Am 06.11.2017 um 10:20 schrieb Gary Sun:
>> remove debugfs file in amdgpu_device_finish
> NAK, the debugfs files are removed automatically by drm_debugfs_cleanup().
>
> So that patch is unnecessary.
>
> Regards,
> Christian.
>
>> Signed-off-by: Gary Sun <Gary.Sun@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h        |    1 +
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   18 ++++++++++++++++++
>>    2 files changed, 19 insertions(+), 0 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 4f919d3..6cfcb5f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -1250,6 +1250,7 @@ struct amdgpu_debugfs {
>>    int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>>    			     const struct drm_info_list *files,
>>    			     unsigned nfiles);
>> +int amdgpu_debugfs_cleanup_files(struct amdgpu_device *adev);
>>    int amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
>>    
>>    #if defined(CONFIG_DEBUG_FS)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 7b7439f..ee800ab 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -2520,6 +2520,7 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
>>    	amdgpu_doorbell_fini(adev);
>>    	amdgpu_pm_sysfs_fini(adev);
>>    	amdgpu_debugfs_regs_cleanup(adev);
>> +	amdgpu_debugfs_cleanup_files(adev);
>>    }
>>    
>>    
>> @@ -3304,6 +3305,23 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>>    	return 0;
>>    }
>>    
>> +int amdgpu_debugfs_cleanup_files(struct amdgpu_device *adev) {
>> +	unsigned int i;
>> +
>> +	for (i = 0; i < adev->debugfs_count; i++) { #if
>> +defined(CONFIG_DEBUG_FS)
>> +		drm_debugfs_remove_files(adev->debugfs[i].files,
>> +				adev->debugfs[i].num_files,
>> +				adev->ddev->primary);
>> +#endif
>> +		adev->debugfs[i].files = NUL;
>> +		adev->debugfs[i].num_files = 0;
>> +	}
>> +	adev->debugfs_count = 0;
>> +	return 0;
>> +}
>> +
>>    #if defined(CONFIG_DEBUG_FS)
>>    
>>    static ssize_t amdgpu_debugfs_regs_read(struct file *f, char __user
>> *buf,
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* RE: [PATCH] drm/amdgpu:remove debugfs file in amdgpu_device_finish
       [not found]             ` <66075b81-61bb-7897-cd7c-792548153748-5C7GfCeVMHo@public.gmane.org>
@ 2017-11-07  8:02               ` Sun, Gary
       [not found]                 ` <DM5PR12MB15316B651B45CB8840F1F07E96510-2J9CzHegvk/mU37preBipQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Sun, Gary @ 2017-11-07  8:02 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Ding, Pixel

Hi Christian,

The feature is for GPU virtualization and has been checked in, you can refer to the following patch or commit 75b126427778218b36cfb68637e4f8d0e584b8ef.

From 2316518efc459928ad1d3d2d3511ea5fbda19475 Mon Sep 17 00:00:00 2001
From: pding <Pixel.Ding@amd.com>
Date: Mon, 23 Oct 2017 17:22:09 +0800
Subject: [PATCH 001/121] drm/amdgpu: retry init if it fails due to exclusive mode timeout (v3)

The exclusive mode has real-time limitation in reality, such like being
done in 300ms. It's easy observed if running many VF/VMs in single host
with heavy CPU workload.

If we find the init fails due to exclusive mode timeout, try it again.

v2:
 - rewrite the condition for readable value.

v3:
 - fix typo, add comments for sleep

Acked-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: pding <Pixel.Ding@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Gary Sun <Gary.Sun@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   10 ++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c    |   15 +++++++++++++--
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 125f77d..385b10e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2303,6 +2303,15 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 
 	r = amdgpu_init(adev);
 	if (r) {
+		/* failed in exclusive mode due to timeout */
+		if (amdgpu_sriov_vf(adev) &&
+		    !amdgpu_sriov_runtime(adev) &&
+		    amdgpu_virt_mmio_blocked(adev) &&
+		    !amdgpu_virt_wait_reset(adev)) {
+			dev_err(adev->dev, "VF exclusive mode timeout\n");
+			r = -EAGAIN;
+			goto failed;
+		}
 		dev_err(adev->dev, "amdgpu_init failed\n");
 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
 		amdgpu_fini(adev);
@@ -2390,6 +2399,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	amdgpu_vf_error_trans_all(adev);
 	if (runtime)
 		vga_switcheroo_fini_domain_pm_ops(adev->dev);
+
 	return r;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 720139e..f313eee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -86,7 +86,7 @@ void amdgpu_driver_unload_kms(struct drm_device *dev)
 int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
 {
 	struct amdgpu_device *adev;
-	int r, acpi_status;
+	int r, acpi_status, retry = 0;
 
 #ifdef CONFIG_DRM_AMDGPU_SI
 	if (!amdgpu_si_support) {
@@ -122,6 +122,7 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
 		}
 	}
 #endif
+retry_init:
 
 	adev = kzalloc(sizeof(struct amdgpu_device), GFP_KERNEL);
 	if (adev == NULL) {
@@ -144,7 +145,17 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
 	 * VRAM allocation
 	 */
 	r = amdgpu_device_init(adev, dev, dev->pdev, flags);
-	if (r) {
+	if (r == -EAGAIN && ++retry <= 3) {
+		adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
+		adev->virt.ops = NULL;
+		amdgpu_device_fini(adev);
+		kfree(adev);
+		dev->dev_private = NULL;
+		/* Don't request EX mode too frequently which is attacking */
+		msleep(5000);
+		dev_err(&dev->pdev->dev, "retry init %d\n", retry);
+		goto retry_init;
+	} else if (r) {
 		dev_err(&dev->pdev->dev, "Fatal error during GPU init\n");
 		goto out;
 	}
-- 
1.7.1


Regards,
Gary


-----Original Message-----
From: Koenig, Christian 
Sent: Tuesday, November 07, 2017 3:48 PM
To: Sun, Gary <Gary.Sun@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu:remove debugfs file in amdgpu_device_finish

Hi Gary,

not sure what driver re-initialize feature you are talking about, but the last time I tried to re-initialize the driver it deadlocks in the modeset code because of some DC problem.

It's probably a good idea to fix that first, but in general please explain further what are you working on.

Regards,
Christian.

Am 07.11.2017 um 08:23 schrieb Sun, Gary:
> Hi Christian,
>
> The patch is for driver re- initialize  feature, not for driver exit or rmmod.  When the driver initialize failed at some point, the re- initialize  feature will do some little clean and then try to initialize driver again, then it will re-register some registered debugfs , so it will fail.
>
> Regards,
> Gary
>
>
> -----Original Message-----
> From: Koenig, Christian
> Sent: Monday, November 06, 2017 5:26 PM
> To: Sun, Gary <Gary.Sun@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu:remove debugfs file in 
> amdgpu_device_finish
>
> Am 06.11.2017 um 10:20 schrieb Gary Sun:
>> remove debugfs file in amdgpu_device_finish
> NAK, the debugfs files are removed automatically by drm_debugfs_cleanup().
>
> So that patch is unnecessary.
>
> Regards,
> Christian.
>
>> Signed-off-by: Gary Sun <Gary.Sun@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h        |    1 +
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   18 ++++++++++++++++++
>>    2 files changed, 19 insertions(+), 0 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 4f919d3..6cfcb5f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -1250,6 +1250,7 @@ struct amdgpu_debugfs {
>>    int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>>    			     const struct drm_info_list *files,
>>    			     unsigned nfiles);
>> +int amdgpu_debugfs_cleanup_files(struct amdgpu_device *adev);
>>    int amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
>>    
>>    #if defined(CONFIG_DEBUG_FS)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 7b7439f..ee800ab 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -2520,6 +2520,7 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
>>    	amdgpu_doorbell_fini(adev);
>>    	amdgpu_pm_sysfs_fini(adev);
>>    	amdgpu_debugfs_regs_cleanup(adev);
>> +	amdgpu_debugfs_cleanup_files(adev);
>>    }
>>    
>>    
>> @@ -3304,6 +3305,23 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>>    	return 0;
>>    }
>>    
>> +int amdgpu_debugfs_cleanup_files(struct amdgpu_device *adev) {
>> +	unsigned int i;
>> +
>> +	for (i = 0; i < adev->debugfs_count; i++) { #if
>> +defined(CONFIG_DEBUG_FS)
>> +		drm_debugfs_remove_files(adev->debugfs[i].files,
>> +				adev->debugfs[i].num_files,
>> +				adev->ddev->primary);
>> +#endif
>> +		adev->debugfs[i].files = NUL;
>> +		adev->debugfs[i].num_files = 0;
>> +	}
>> +	adev->debugfs_count = 0;
>> +	return 0;
>> +}
>> +
>>    #if defined(CONFIG_DEBUG_FS)
>>    
>>    static ssize_t amdgpu_debugfs_regs_read(struct file *f, char 
>> __user *buf,
>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/amdgpu:remove debugfs file in amdgpu_device_finish
       [not found]                 ` <DM5PR12MB15316B651B45CB8840F1F07E96510-2J9CzHegvk/mU37preBipQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2017-11-07  9:56                   ` Christian König
       [not found]                     ` <cae83e1b-fdbc-d746-1835-5c7fe41e692c-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Christian König @ 2017-11-07  9:56 UTC (permalink / raw)
  To: Sun, Gary, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Ding, Pixel

Hi Gary,

well that patch is nonsense to begin with.

amdgpu_device_init() does quite a bunch of other initialization which is 
not cleaned up by amdgpu_device_fini(), so the debugfs files are only 
the tip of the iceberg here.

Please revert 2316518efc459928ad1d3d2d3511ea5fbda19475 and then we can 
try again from scratch.

What we need to do is return -EAGAIN from amdgpu_driver_load_kms. Then 
in amdgpu_pci_probe() we can catch that error and call 
drm_dev_register() multiple times if necessary.

This way we can also optionally pci_disable_device() / 
pci_enable_device() between tries if appropriate.

Regards,
Christian.

Am 07.11.2017 um 09:02 schrieb Sun, Gary:
> Hi Christian,
>
> The feature is for GPU virtualization and has been checked in, you can refer to the following patch or commit 75b126427778218b36cfb68637e4f8d0e584b8ef.
>
>  From 2316518efc459928ad1d3d2d3511ea5fbda19475 Mon Sep 17 00:00:00 2001
> From: pding <Pixel.Ding@amd.com>
> Date: Mon, 23 Oct 2017 17:22:09 +0800
> Subject: [PATCH 001/121] drm/amdgpu: retry init if it fails due to exclusive mode timeout (v3)
>
> The exclusive mode has real-time limitation in reality, such like being
> done in 300ms. It's easy observed if running many VF/VMs in single host
> with heavy CPU workload.
>
> If we find the init fails due to exclusive mode timeout, try it again.
>
> v2:
>   - rewrite the condition for readable value.
>
> v3:
>   - fix typo, add comments for sleep
>
> Acked-by: Alex Deucher <alexander.deucher@amd.com>
> Signed-off-by: pding <Pixel.Ding@amd.com>
> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
> Signed-off-by: Gary Sun <Gary.Sun@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   10 ++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c    |   15 +++++++++++++--
>   2 files changed, 23 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 125f77d..385b10e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2303,6 +2303,15 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   
>   	r = amdgpu_init(adev);
>   	if (r) {
> +		/* failed in exclusive mode due to timeout */
> +		if (amdgpu_sriov_vf(adev) &&
> +		    !amdgpu_sriov_runtime(adev) &&
> +		    amdgpu_virt_mmio_blocked(adev) &&
> +		    !amdgpu_virt_wait_reset(adev)) {
> +			dev_err(adev->dev, "VF exclusive mode timeout\n");
> +			r = -EAGAIN;
> +			goto failed;
> +		}
>   		dev_err(adev->dev, "amdgpu_init failed\n");
>   		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
>   		amdgpu_fini(adev);
> @@ -2390,6 +2399,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   	amdgpu_vf_error_trans_all(adev);
>   	if (runtime)
>   		vga_switcheroo_fini_domain_pm_ops(adev->dev);
> +
>   	return r;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> index 720139e..f313eee 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> @@ -86,7 +86,7 @@ void amdgpu_driver_unload_kms(struct drm_device *dev)
>   int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
>   {
>   	struct amdgpu_device *adev;
> -	int r, acpi_status;
> +	int r, acpi_status, retry = 0;
>   
>   #ifdef CONFIG_DRM_AMDGPU_SI
>   	if (!amdgpu_si_support) {
> @@ -122,6 +122,7 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
>   		}
>   	}
>   #endif
> +retry_init:
>   
>   	adev = kzalloc(sizeof(struct amdgpu_device), GFP_KERNEL);
>   	if (adev == NULL) {
> @@ -144,7 +145,17 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
>   	 * VRAM allocation
>   	 */
>   	r = amdgpu_device_init(adev, dev, dev->pdev, flags);
> -	if (r) {
> +	if (r == -EAGAIN && ++retry <= 3) {
> +		adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
> +		adev->virt.ops = NULL;
> +		amdgpu_device_fini(adev);
> +		kfree(adev);
> +		dev->dev_private = NULL;
> +		/* Don't request EX mode too frequently which is attacking */
> +		msleep(5000);
> +		dev_err(&dev->pdev->dev, "retry init %d\n", retry);
> +		goto retry_init;
> +	} else if (r) {
>   		dev_err(&dev->pdev->dev, "Fatal error during GPU init\n");
>   		goto out;
>   	}


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/amdgpu:remove debugfs file in amdgpu_device_finish
       [not found]                     ` <cae83e1b-fdbc-d746-1835-5c7fe41e692c-5C7GfCeVMHo@public.gmane.org>
@ 2017-11-08  2:40                       ` Ding, Pixel
       [not found]                         ` <5C5A103F-1729-4BAB-860B-099B0EB46A35-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Ding, Pixel @ 2017-11-08  2:40 UTC (permalink / raw)
  To: Koenig, Christian, Sun, Gary, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Hi Christian,

The retry_init only handles the failure caused by exclusive mode timeout. It checks the MMIO to see if there’s exclusive mode timeout, and retry init if there’s, otherwise just return error.

For exclusive timeout case, the host layer issues a FLR on this VF so driver needn't cleanup hardware status, amdgpu_device_fini here just is used to cleanup the software.

It’s tested and proved working correctly. Although the debugfs files are only the tip of the iceberg, it’s the only issue we found in this version of retry_init.

— 
Sincerely Yours,
Pixel







On 07/11/2017, 5:56 PM, "Koenig, Christian" <Christian.Koenig@amd.com> wrote:

>Hi Gary,
>
>well that patch is nonsense to begin with.
>
>amdgpu_device_init() does quite a bunch of other initialization which is 
>not cleaned up by amdgpu_device_fini(), so the debugfs files are only 
>the tip of the iceberg here.
>
>Please revert 2316518efc459928ad1d3d2d3511ea5fbda19475 and then we can 
>try again from scratch.
>
>What we need to do is return -EAGAIN from amdgpu_driver_load_kms. Then 
>in amdgpu_pci_probe() we can catch that error and call 
>drm_dev_register() multiple times if necessary.
>
>This way we can also optionally pci_disable_device() / 
>pci_enable_device() between tries if appropriate.
>
>Regards,
>Christian.
>
>Am 07.11.2017 um 09:02 schrieb Sun, Gary:
>> Hi Christian,
>>
>> The feature is for GPU virtualization and has been checked in, you can refer to the following patch or commit 75b126427778218b36cfb68637e4f8d0e584b8ef.
>>
>>  From 2316518efc459928ad1d3d2d3511ea5fbda19475 Mon Sep 17 00:00:00 2001
>> From: pding <Pixel.Ding@amd.com>
>> Date: Mon, 23 Oct 2017 17:22:09 +0800
>> Subject: [PATCH 001/121] drm/amdgpu: retry init if it fails due to exclusive mode timeout (v3)
>>
>> The exclusive mode has real-time limitation in reality, such like being
>> done in 300ms. It's easy observed if running many VF/VMs in single host
>> with heavy CPU workload.
>>
>> If we find the init fails due to exclusive mode timeout, try it again.
>>
>> v2:
>>   - rewrite the condition for readable value.
>>
>> v3:
>>   - fix typo, add comments for sleep
>>
>> Acked-by: Alex Deucher <alexander.deucher@amd.com>
>> Signed-off-by: pding <Pixel.Ding@amd.com>
>> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
>> Signed-off-by: Gary Sun <Gary.Sun@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   10 ++++++++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c    |   15 +++++++++++++--
>>   2 files changed, 23 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 125f77d..385b10e 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -2303,6 +2303,15 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>   
>>   	r = amdgpu_init(adev);
>>   	if (r) {
>> +		/* failed in exclusive mode due to timeout */
>> +		if (amdgpu_sriov_vf(adev) &&
>> +		    !amdgpu_sriov_runtime(adev) &&
>> +		    amdgpu_virt_mmio_blocked(adev) &&
>> +		    !amdgpu_virt_wait_reset(adev)) {
>> +			dev_err(adev->dev, "VF exclusive mode timeout\n");
>> +			r = -EAGAIN;
>> +			goto failed;
>> +		}
>>   		dev_err(adev->dev, "amdgpu_init failed\n");
>>   		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
>>   		amdgpu_fini(adev);
>> @@ -2390,6 +2399,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>   	amdgpu_vf_error_trans_all(adev);
>>   	if (runtime)
>>   		vga_switcheroo_fini_domain_pm_ops(adev->dev);
>> +
>>   	return r;
>>   }
>>   
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> index 720139e..f313eee 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> @@ -86,7 +86,7 @@ void amdgpu_driver_unload_kms(struct drm_device *dev)
>>   int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
>>   {
>>   	struct amdgpu_device *adev;
>> -	int r, acpi_status;
>> +	int r, acpi_status, retry = 0;
>>   
>>   #ifdef CONFIG_DRM_AMDGPU_SI
>>   	if (!amdgpu_si_support) {
>> @@ -122,6 +122,7 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
>>   		}
>>   	}
>>   #endif
>> +retry_init:
>>   
>>   	adev = kzalloc(sizeof(struct amdgpu_device), GFP_KERNEL);
>>   	if (adev == NULL) {
>> @@ -144,7 +145,17 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
>>   	 * VRAM allocation
>>   	 */
>>   	r = amdgpu_device_init(adev, dev, dev->pdev, flags);
>> -	if (r) {
>> +	if (r == -EAGAIN && ++retry <= 3) {
>> +		adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
>> +		adev->virt.ops = NULL;
>> +		amdgpu_device_fini(adev);
>> +		kfree(adev);
>> +		dev->dev_private = NULL;
>> +		/* Don't request EX mode too frequently which is attacking */
>> +		msleep(5000);
>> +		dev_err(&dev->pdev->dev, "retry init %d\n", retry);
>> +		goto retry_init;
>> +	} else if (r) {
>>   		dev_err(&dev->pdev->dev, "Fatal error during GPU init\n");
>>   		goto out;
>>   	}
>
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/amdgpu:remove debugfs file in amdgpu_device_finish
       [not found]                         ` <5C5A103F-1729-4BAB-860B-099B0EB46A35-5C7GfCeVMHo@public.gmane.org>
@ 2017-11-08  3:14                           ` Ding, Pixel
  2017-11-08  7:59                           ` Christian König
  1 sibling, 0 replies; 9+ messages in thread
From: Ding, Pixel @ 2017-11-08  3:14 UTC (permalink / raw)
  To: Koenig, Christian, Sun, Gary, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Hi Christian,

I give a quick try according to your suggestion. It also works and cleaner. I will send a new patch to revise the retry_init. Please help reviewing later.
— 
Sincerely Yours,
Pixel








On 08/11/2017, 10:40 AM, "Ding, Pixel" <Pixel.Ding@amd.com> wrote:

>Hi Christian,
>
>The retry_init only handles the failure caused by exclusive mode timeout. It checks the MMIO to see if there’s exclusive mode timeout, and retry init if there’s, otherwise just return error.
>
>For exclusive timeout case, the host layer issues a FLR on this VF so driver needn't cleanup hardware status, amdgpu_device_fini here just is used to cleanup the software.
>
>It’s tested and proved working correctly. Although the debugfs files are only the tip of the iceberg, it’s the only issue we found in this version of retry_init.
>
>— 
>Sincerely Yours,
>Pixel
>
>
>
>
>
>
>
>On 07/11/2017, 5:56 PM, "Koenig, Christian" <Christian.Koenig@amd.com> wrote:
>
>>Hi Gary,
>>
>>well that patch is nonsense to begin with.
>>
>>amdgpu_device_init() does quite a bunch of other initialization which is 
>>not cleaned up by amdgpu_device_fini(), so the debugfs files are only 
>>the tip of the iceberg here.
>>
>>Please revert 2316518efc459928ad1d3d2d3511ea5fbda19475 and then we can 
>>try again from scratch.
>>
>>What we need to do is return -EAGAIN from amdgpu_driver_load_kms. Then 
>>in amdgpu_pci_probe() we can catch that error and call 
>>drm_dev_register() multiple times if necessary.
>>
>>This way we can also optionally pci_disable_device() / 
>>pci_enable_device() between tries if appropriate.
>>
>>Regards,
>>Christian.
>>
>>Am 07.11.2017 um 09:02 schrieb Sun, Gary:
>>> Hi Christian,
>>>
>>> The feature is for GPU virtualization and has been checked in, you can refer to the following patch or commit 75b126427778218b36cfb68637e4f8d0e584b8ef.
>>>
>>>  From 2316518efc459928ad1d3d2d3511ea5fbda19475 Mon Sep 17 00:00:00 2001
>>> From: pding <Pixel.Ding@amd.com>
>>> Date: Mon, 23 Oct 2017 17:22:09 +0800
>>> Subject: [PATCH 001/121] drm/amdgpu: retry init if it fails due to exclusive mode timeout (v3)
>>>
>>> The exclusive mode has real-time limitation in reality, such like being
>>> done in 300ms. It's easy observed if running many VF/VMs in single host
>>> with heavy CPU workload.
>>>
>>> If we find the init fails due to exclusive mode timeout, try it again.
>>>
>>> v2:
>>>   - rewrite the condition for readable value.
>>>
>>> v3:
>>>   - fix typo, add comments for sleep
>>>
>>> Acked-by: Alex Deucher <alexander.deucher@amd.com>
>>> Signed-off-by: pding <Pixel.Ding@amd.com>
>>> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
>>> Signed-off-by: Gary Sun <Gary.Sun@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   10 ++++++++++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c    |   15 +++++++++++++--
>>>   2 files changed, 23 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index 125f77d..385b10e 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -2303,6 +2303,15 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>>   
>>>   	r = amdgpu_init(adev);
>>>   	if (r) {
>>> +		/* failed in exclusive mode due to timeout */
>>> +		if (amdgpu_sriov_vf(adev) &&
>>> +		    !amdgpu_sriov_runtime(adev) &&
>>> +		    amdgpu_virt_mmio_blocked(adev) &&
>>> +		    !amdgpu_virt_wait_reset(adev)) {
>>> +			dev_err(adev->dev, "VF exclusive mode timeout\n");
>>> +			r = -EAGAIN;
>>> +			goto failed;
>>> +		}
>>>   		dev_err(adev->dev, "amdgpu_init failed\n");
>>>   		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
>>>   		amdgpu_fini(adev);
>>> @@ -2390,6 +2399,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>>   	amdgpu_vf_error_trans_all(adev);
>>>   	if (runtime)
>>>   		vga_switcheroo_fini_domain_pm_ops(adev->dev);
>>> +
>>>   	return r;
>>>   }
>>>   
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>> index 720139e..f313eee 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>> @@ -86,7 +86,7 @@ void amdgpu_driver_unload_kms(struct drm_device *dev)
>>>   int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
>>>   {
>>>   	struct amdgpu_device *adev;
>>> -	int r, acpi_status;
>>> +	int r, acpi_status, retry = 0;
>>>   
>>>   #ifdef CONFIG_DRM_AMDGPU_SI
>>>   	if (!amdgpu_si_support) {
>>> @@ -122,6 +122,7 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
>>>   		}
>>>   	}
>>>   #endif
>>> +retry_init:
>>>   
>>>   	adev = kzalloc(sizeof(struct amdgpu_device), GFP_KERNEL);
>>>   	if (adev == NULL) {
>>> @@ -144,7 +145,17 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
>>>   	 * VRAM allocation
>>>   	 */
>>>   	r = amdgpu_device_init(adev, dev, dev->pdev, flags);
>>> -	if (r) {
>>> +	if (r == -EAGAIN && ++retry <= 3) {
>>> +		adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
>>> +		adev->virt.ops = NULL;
>>> +		amdgpu_device_fini(adev);
>>> +		kfree(adev);
>>> +		dev->dev_private = NULL;
>>> +		/* Don't request EX mode too frequently which is attacking */
>>> +		msleep(5000);
>>> +		dev_err(&dev->pdev->dev, "retry init %d\n", retry);
>>> +		goto retry_init;
>>> +	} else if (r) {
>>>   		dev_err(&dev->pdev->dev, "Fatal error during GPU init\n");
>>>   		goto out;
>>>   	}
>>
>>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/amdgpu:remove debugfs file in amdgpu_device_finish
       [not found]                         ` <5C5A103F-1729-4BAB-860B-099B0EB46A35-5C7GfCeVMHo@public.gmane.org>
  2017-11-08  3:14                           ` Ding, Pixel
@ 2017-11-08  7:59                           ` Christian König
  1 sibling, 0 replies; 9+ messages in thread
From: Christian König @ 2017-11-08  7:59 UTC (permalink / raw)
  To: Ding, Pixel, Sun, Gary, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

> It’s tested and proved working correctly.
Well testing doesn't prove anything. Please enable at least KASAN, 
lockdep and kmemleak and then try again.

I'm pretty sure that at least kmemlead and lockdep will bail out quite a 
bunch of warnings.

Regards,
Christian.

Am 08.11.2017 um 03:40 schrieb Ding, Pixel:
> Hi Christian,
>
> The retry_init only handles the failure caused by exclusive mode timeout. It checks the MMIO to see if there’s exclusive mode timeout, and retry init if there’s, otherwise just return error.
>
> For exclusive timeout case, the host layer issues a FLR on this VF so driver needn't cleanup hardware status, amdgpu_device_fini here just is used to cleanup the software.
>
> It’s tested and proved working correctly. Although the debugfs files are only the tip of the iceberg, it’s the only issue we found in this version of retry_init.
>
> —
> Sincerely Yours,
> Pixel
>
>
>
>
>
>
>
> On 07/11/2017, 5:56 PM, "Koenig, Christian" <Christian.Koenig@amd.com> wrote:
>
>> Hi Gary,
>>
>> well that patch is nonsense to begin with.
>>
>> amdgpu_device_init() does quite a bunch of other initialization which is
>> not cleaned up by amdgpu_device_fini(), so the debugfs files are only
>> the tip of the iceberg here.
>>
>> Please revert 2316518efc459928ad1d3d2d3511ea5fbda19475 and then we can
>> try again from scratch.
>>
>> What we need to do is return -EAGAIN from amdgpu_driver_load_kms. Then
>> in amdgpu_pci_probe() we can catch that error and call
>> drm_dev_register() multiple times if necessary.
>>
>> This way we can also optionally pci_disable_device() /
>> pci_enable_device() between tries if appropriate.
>>
>> Regards,
>> Christian.
>>
>> Am 07.11.2017 um 09:02 schrieb Sun, Gary:
>>> Hi Christian,
>>>
>>> The feature is for GPU virtualization and has been checked in, you can refer to the following patch or commit 75b126427778218b36cfb68637e4f8d0e584b8ef.
>>>
>>>   From 2316518efc459928ad1d3d2d3511ea5fbda19475 Mon Sep 17 00:00:00 2001
>>> From: pding <Pixel.Ding@amd.com>
>>> Date: Mon, 23 Oct 2017 17:22:09 +0800
>>> Subject: [PATCH 001/121] drm/amdgpu: retry init if it fails due to exclusive mode timeout (v3)
>>>
>>> The exclusive mode has real-time limitation in reality, such like being
>>> done in 300ms. It's easy observed if running many VF/VMs in single host
>>> with heavy CPU workload.
>>>
>>> If we find the init fails due to exclusive mode timeout, try it again.
>>>
>>> v2:
>>>    - rewrite the condition for readable value.
>>>
>>> v3:
>>>    - fix typo, add comments for sleep
>>>
>>> Acked-by: Alex Deucher <alexander.deucher@amd.com>
>>> Signed-off-by: pding <Pixel.Ding@amd.com>
>>> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
>>> Signed-off-by: Gary Sun <Gary.Sun@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   10 ++++++++++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c    |   15 +++++++++++++--
>>>    2 files changed, 23 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index 125f77d..385b10e 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -2303,6 +2303,15 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>>    
>>>    	r = amdgpu_init(adev);
>>>    	if (r) {
>>> +		/* failed in exclusive mode due to timeout */
>>> +		if (amdgpu_sriov_vf(adev) &&
>>> +		    !amdgpu_sriov_runtime(adev) &&
>>> +		    amdgpu_virt_mmio_blocked(adev) &&
>>> +		    !amdgpu_virt_wait_reset(adev)) {
>>> +			dev_err(adev->dev, "VF exclusive mode timeout\n");
>>> +			r = -EAGAIN;
>>> +			goto failed;
>>> +		}
>>>    		dev_err(adev->dev, "amdgpu_init failed\n");
>>>    		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
>>>    		amdgpu_fini(adev);
>>> @@ -2390,6 +2399,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>>    	amdgpu_vf_error_trans_all(adev);
>>>    	if (runtime)
>>>    		vga_switcheroo_fini_domain_pm_ops(adev->dev);
>>> +
>>>    	return r;
>>>    }
>>>    
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>> index 720139e..f313eee 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>> @@ -86,7 +86,7 @@ void amdgpu_driver_unload_kms(struct drm_device *dev)
>>>    int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
>>>    {
>>>    	struct amdgpu_device *adev;
>>> -	int r, acpi_status;
>>> +	int r, acpi_status, retry = 0;
>>>    
>>>    #ifdef CONFIG_DRM_AMDGPU_SI
>>>    	if (!amdgpu_si_support) {
>>> @@ -122,6 +122,7 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
>>>    		}
>>>    	}
>>>    #endif
>>> +retry_init:
>>>    
>>>    	adev = kzalloc(sizeof(struct amdgpu_device), GFP_KERNEL);
>>>    	if (adev == NULL) {
>>> @@ -144,7 +145,17 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
>>>    	 * VRAM allocation
>>>    	 */
>>>    	r = amdgpu_device_init(adev, dev, dev->pdev, flags);
>>> -	if (r) {
>>> +	if (r == -EAGAIN && ++retry <= 3) {
>>> +		adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
>>> +		adev->virt.ops = NULL;
>>> +		amdgpu_device_fini(adev);
>>> +		kfree(adev);
>>> +		dev->dev_private = NULL;
>>> +		/* Don't request EX mode too frequently which is attacking */
>>> +		msleep(5000);
>>> +		dev_err(&dev->pdev->dev, "retry init %d\n", retry);
>>> +		goto retry_init;
>>> +	} else if (r) {
>>>    		dev_err(&dev->pdev->dev, "Fatal error during GPU init\n");
>>>    		goto out;
>>>    	}
>>

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2017-11-08  7:59 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-11-06  9:20 [PATCH] drm/amdgpu:remove debugfs file in amdgpu_device_finish Gary Sun
     [not found] ` <1509960040-17849-1-git-send-email-Gary.Sun-5C7GfCeVMHo@public.gmane.org>
2017-11-06  9:26   ` Christian König
     [not found]     ` <9eea88e9-886b-c35b-0fe7-9e7fbe243ba5-5C7GfCeVMHo@public.gmane.org>
2017-11-07  7:23       ` Sun, Gary
     [not found]         ` <DM5PR12MB15311826E7AD3AC5ADDF434896510-2J9CzHegvk/mU37preBipQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2017-11-07  7:47           ` Christian König
     [not found]             ` <66075b81-61bb-7897-cd7c-792548153748-5C7GfCeVMHo@public.gmane.org>
2017-11-07  8:02               ` Sun, Gary
     [not found]                 ` <DM5PR12MB15316B651B45CB8840F1F07E96510-2J9CzHegvk/mU37preBipQdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2017-11-07  9:56                   ` Christian König
     [not found]                     ` <cae83e1b-fdbc-d746-1835-5c7fe41e692c-5C7GfCeVMHo@public.gmane.org>
2017-11-08  2:40                       ` Ding, Pixel
     [not found]                         ` <5C5A103F-1729-4BAB-860B-099B0EB46A35-5C7GfCeVMHo@public.gmane.org>
2017-11-08  3:14                           ` Ding, Pixel
2017-11-08  7:59                           ` Christian König

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.