* [RFC PATCH 1/1] drm/amdgpu: add initial support for pci error handler
@ 2020-08-11 13:30 Nirmoy Das
  2020-08-12 14:52 ` Andrey Grodzovsky
  0 siblings, 1 reply; 13+ messages in thread
From: Nirmoy Das @ 2020-08-11 13:30 UTC (permalink / raw)
  To: amd-gfx; +Cc: alexander.deucher, Nirmoy Das, christian.koenig

This patch will ignore non-fatal errors and try to
stop amdgpu's sw stack on fatal errors.

Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 56 ++++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index c1219af2e7d6..2b9ede3000ee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -35,6 +35,7 @@
 #include <linux/pm_runtime.h>
 #include <linux/vga_switcheroo.h>
 #include <drm/drm_probe_helper.h>
+#include <drm/drm_atomic_helper.h>
 #include <linux/mmu_notifier.h>
 
 #include "amdgpu.h"
@@ -1516,6 +1517,58 @@ static struct drm_driver kms_driver = {
 	.patchlevel = KMS_DRIVER_PATCHLEVEL,
 };
 
+static pci_ers_result_t amdgpu_pci_err_detected(struct pci_dev *pdev,
+						pci_channel_state_t state)
+{
+	struct drm_device *dev = pci_get_drvdata(pdev);
+	struct amdgpu_device *adev = dev->dev_private;
+	int i;
+	int ret = PCI_ERS_RESULT_DISCONNECT;
+
+	switch (state) {
+	case pci_channel_io_normal:
+		ret = PCI_ERS_RESULT_CAN_RECOVER;
+		break;
+	default:
+		/* Disable power management */
+		adev->runpm = 0;
+		/* Suspend all IO operations */
+		amdgpu_fbdev_set_suspend(adev, 1);
+		cancel_delayed_work_sync(&adev->delayed_init_work);
+		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+			struct amdgpu_ring *ring = adev->rings[i];
+
+			if (!ring || !ring->sched.thread)
+				continue;
+
+			amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
+		}
+
+		if (adev->mode_info.mode_config_initialized) {
+			if (!amdgpu_device_has_dc_support(adev))
+				drm_helper_force_disable_all(adev->ddev);
+			else
+				drm_atomic_helper_shutdown(adev->ddev);
+		}
+
+		amdgpu_fence_driver_fini(adev);
+		amdgpu_fbdev_fini(adev);
+		/* Try to close the drm device to stop applications
+		 * from opening dri files for further IO operations.
+		 * TODO: This will throw a warning as ttm is not
+		 * cleaned up properly */
+		drm_dev_fini(dev);
+		break;
+	}
+
+	return ret;
+}
+
+static const struct pci_error_handlers amdgpu_err_handler = {
+	.error_detected = amdgpu_pci_err_detected,
+};
+
+
 static struct pci_driver amdgpu_kms_pci_driver = {
 	.name = DRIVER_NAME,
 	.id_table = pciidlist,
@@ -1523,10 +1576,9 @@ static struct pci_driver amdgpu_kms_pci_driver = {
 	.remove = amdgpu_pci_remove,
 	.shutdown = amdgpu_pci_shutdown,
 	.driver.pm = &amdgpu_pm_ops,
+	.err_handler = &amdgpu_err_handler,
 };
 
-
-
 static int __init amdgpu_init(void)
 {
 	int r;
-- 
2.27.0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [RFC PATCH 1/1] drm/amdgpu: add initial support for pci error handler
  2020-08-11 13:30 [RFC PATCH 1/1] drm/amdgpu: add initial support for pci error handler Nirmoy Das
@ 2020-08-12 14:52 ` Andrey Grodzovsky
  2020-08-13 11:09   ` Nirmoy
  0 siblings, 1 reply; 13+ messages in thread
From: Andrey Grodzovsky @ 2020-08-12 14:52 UTC (permalink / raw)
  To: Nirmoy Das, amd-gfx; +Cc: alexander.deucher, christian.koenig


On 8/11/20 9:30 AM, Nirmoy Das wrote:
> This patch will ignore non-fatal errors and try to
> stop amdgpu's sw stack on fatal errors.
>
> Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 56 ++++++++++++++++++++++++-
>   1 file changed, 54 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index c1219af2e7d6..2b9ede3000ee 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -35,6 +35,7 @@
>   #include <linux/pm_runtime.h>
>   #include <linux/vga_switcheroo.h>
>   #include <drm/drm_probe_helper.h>
> +#include <drm/drm_atomic_helper.h>
>   #include <linux/mmu_notifier.h>
>   
>   #include "amdgpu.h"
> @@ -1516,6 +1517,58 @@ static struct drm_driver kms_driver = {
>   	.patchlevel = KMS_DRIVER_PATCHLEVEL,
>   };
>   
> +static pci_ers_result_t amdgpu_pci_err_detected(struct pci_dev *pdev,
> +						pci_channel_state_t state)
> +{
> +	struct drm_device *dev = pci_get_drvdata(pdev);
> +	struct amdgpu_device *adev = dev->dev_private;
> +	int i;
> +	int ret = PCI_ERS_RESULT_DISCONNECT;
> +
> +	switch (state) {
> +	case pci_channel_io_normal:
> +		ret = PCI_ERS_RESULT_CAN_RECOVER;
> +		break;
> +	default:
> +		/* Disable power management */
> +		adev->runpm = 0;
> +		/* Suspend all IO operations */
> +		amdgpu_fbdev_set_suspend(adev, 1);
> +		cancel_delayed_work_sync(&adev->delayed_init_work);
> +		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> +			struct amdgpu_ring *ring = adev->rings[i];
> +
> +			if (!ring || !ring->sched.thread)
> +				continue;
> +
> +			amdgpu_job_stop_all_jobs_on_sched(&ring->sched);


You need to call drm_sched_stop first before calling this
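
Roughly, as an untested sketch of that ordering (the matching
drm_sched_start()/restart path is left out here):

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		/* park the scheduler kthread first so no new jobs are
		 * pushed to the HW ring while pending ones are stopped */
		drm_sched_stop(&ring->sched, NULL);
		amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
	}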

> +		}
> +
> +		if (adev->mode_info.mode_config_initialized) {
> +			if (!amdgpu_device_has_dc_support(adev))
> +				drm_helper_force_disable_all(adev->ddev);
> +			else
> +				drm_atomic_helper_shutdown(adev->ddev);
> +		}
> +
> +		amdgpu_fence_driver_fini(adev);
> +		amdgpu_fbdev_fini(adev);
> +		/* Try to close drm device to stop applications
> +		 * from opening dri files for further IO operations.
> +		 * TODO: This will throw warning as ttm is not
> +		 * cleaned perperly */
> +		drm_dev_fini(dev);


I think user mode applications might still hold a reference to the drm device
through drm_dev_get, either by directly opening the device file or indirectly
by importing a DMA buffer. If so, when the last of them terminates,
drm_dev_put->drm_dev_release->...->drm_dev_fini might get called again, causing
use-after-free and similar issues. Maybe it is better to call drm_dev_put here
instead, so drm_dev_fini gets called only when the last user client releases
its reference.
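
As a minimal sketch, that would mean replacing the drm_dev_fini() call
with something like:

	/* Drop the driver's reference instead of finalizing directly;
	 * drm_dev_fini() then runs only once the last reference (open
	 * device file, imported dma-buf, ...) has been dropped.
	 */
	drm_dev_put(dev);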

Also a general question - in my work on the DPC recovery feature, which tries
to recover after a PCIe error, once the PCI error has happened the MMIO
registers become inaccessible for r/w because the PCI link is dead until it is
reset by the DPC driver (see
https://www.kernel.org/doc/html/latest/PCI/pci-error-recovery.html section 6.1.4).
Your case tries to gracefully close the drm device once a fatal error has
happened - didn't you encounter errors or warnings when accessing HW registers
during any of the operations above?

Andrey


> +		break;
> +	}
> +
> +	return ret;
> +}
> +
> +static const struct pci_error_handlers amdgpu_err_handler = {
> +       .error_detected = amdgpu_pci_err_detected,
> +};
> +
> +
>   static struct pci_driver amdgpu_kms_pci_driver = {
>   	.name = DRIVER_NAME,
>   	.id_table = pciidlist,
> @@ -1523,10 +1576,9 @@ static struct pci_driver amdgpu_kms_pci_driver = {
>   	.remove = amdgpu_pci_remove,
>   	.shutdown = amdgpu_pci_shutdown,
>   	.driver.pm = &amdgpu_pm_ops,
> +	.err_handler = &amdgpu_err_handler,
>   };
>   
> -
> -
>   static int __init amdgpu_init(void)
>   {
>   	int r;
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC PATCH 1/1] drm/amdgpu: add initial support for pci error handler
  2020-08-12 14:52 ` Andrey Grodzovsky
@ 2020-08-13 11:09   ` Nirmoy
  2020-08-13 13:36     ` Alex Deucher
  2020-08-13 13:38     ` Andrey Grodzovsky
  0 siblings, 2 replies; 13+ messages in thread
From: Nirmoy @ 2020-08-13 11:09 UTC (permalink / raw)
  To: Andrey Grodzovsky, Nirmoy Das, amd-gfx
  Cc: alexander.deucher, christian.koenig


On 8/12/20 4:52 PM, Andrey Grodzovsky wrote:
>
> On 8/11/20 9:30 AM, Nirmoy Das wrote:
>> This patch will ignore non-fatal errors and try to
>> stop amdgpu's sw stack on fatal errors.
>>
>> Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 56 ++++++++++++++++++++++++-
>>   1 file changed, 54 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> index c1219af2e7d6..2b9ede3000ee 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> @@ -35,6 +35,7 @@
>>   #include <linux/pm_runtime.h>
>>   #include <linux/vga_switcheroo.h>
>>   #include <drm/drm_probe_helper.h>
>> +#include <drm/drm_atomic_helper.h>
>>   #include <linux/mmu_notifier.h>
>>     #include "amdgpu.h"
>> @@ -1516,6 +1517,58 @@ static struct drm_driver kms_driver = {
>>       .patchlevel = KMS_DRIVER_PATCHLEVEL,
>>   };
>>   +static pci_ers_result_t amdgpu_pci_err_detected(struct pci_dev *pdev,
>> +                        pci_channel_state_t state)
>> +{
>> +    struct drm_device *dev = pci_get_drvdata(pdev);
>> +    struct amdgpu_device *adev = dev->dev_private;
>> +    int i;
>> +    int ret = PCI_ERS_RESULT_DISCONNECT;
>> +
>> +    switch (state) {
>> +    case pci_channel_io_normal:
>> +        ret = PCI_ERS_RESULT_CAN_RECOVER;
>> +        break;
>> +    default:
>> +        /* Disable power management */
>> +        adev->runpm = 0;
>> +        /* Suspend all IO operations */
>> +        amdgpu_fbdev_set_suspend(adev, 1);
>> + cancel_delayed_work_sync(&adev->delayed_init_work);
>> +        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> +            struct amdgpu_ring *ring = adev->rings[i];
>> +
>> +            if (!ring || !ring->sched.thread)
>> +                continue;
>> +
>> + amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
>
>
> You need to call drm_sched_stop first before calling this
>
>> +        }
>> +
>> +        if (adev->mode_info.mode_config_initialized) {
>> +            if (!amdgpu_device_has_dc_support(adev))
>> +                drm_helper_force_disable_all(adev->ddev);
>> +            else
>> +                drm_atomic_helper_shutdown(adev->ddev);
>> +        }
>> +
>> +        amdgpu_fence_driver_fini(adev);
>> +        amdgpu_fbdev_fini(adev);
>> +        /* Try to close drm device to stop applications
>> +         * from opening dri files for further IO operations.
>> +         * TODO: This will throw warning as ttm is not
>> +         * cleaned perperly */
>> +        drm_dev_fini(dev);
>
>
> I think user mode applications might still hold reference to the drm 
> device through through drm_dev_get either by directly opening
> the device file or indirectly through importing DMA buff, if so when 
> the last of them terminate 
> drm_dev_put->drm_dev_release->...->drm_dev_fini
> might get called again causing use after free e.t.c issues. Maybe 
> better to call here drm_dev_put then and so drm_dev_fini will get 
> called when this
> last user client releases his reference.


drm_dev_fini() seems to be cleaner. The problem is that the window manager
(sway) never gets terminated after the AER error and the drm files remain
active. A simple cat on the dri files goes through amdgpu and spits out more
errors.


>
> Also a general question - in my work on DPC recovery feature which 
> tries to recover after PCIe error - once the PCI error has happened 
> MMIO registers become
> unaccessible for r/w as the PCI link is dead until after the PCI link 
> is reset by the DPC driver (see 
> https://www.kernel.org/doc/html/latest/PCI/pci-error-recovery.html 
> section 6.1.4).
> Your case is to try and gracefully to close the drm device once fatal 
> error happened, didn't you encounter errors or warnings when accessing 
> HW registers during any of the operations
> above ?


As discussed over chat, it seems an AER error generated with the aer-inject
tool just triggers the kernel PCI error APIs while the device remains active,
so I didn't encounter any errors when accessing HW registers.


Nirmoy


>
> Andrey
>
>
>> +        break;
>> +    }
>> +
>> +    return ret;
>> +}
>> +
>> +static const struct pci_error_handlers amdgpu_err_handler = {
>> +       .error_detected = amdgpu_pci_err_detected,
>> +};
>> +
>> +
>>   static struct pci_driver amdgpu_kms_pci_driver = {
>>       .name = DRIVER_NAME,
>>       .id_table = pciidlist,
>> @@ -1523,10 +1576,9 @@ static struct pci_driver amdgpu_kms_pci_driver 
>> = {
>>       .remove = amdgpu_pci_remove,
>>       .shutdown = amdgpu_pci_shutdown,
>>       .driver.pm = &amdgpu_pm_ops,
>> +    .err_handler = &amdgpu_err_handler,
>>   };
>>   -
>> -
>>   static int __init amdgpu_init(void)
>>   {
>>       int r;
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC PATCH 1/1] drm/amdgpu: add initial support for pci error handler
  2020-08-13 11:09   ` Nirmoy
@ 2020-08-13 13:36     ` Alex Deucher
  2020-08-13 13:38     ` Andrey Grodzovsky
  1 sibling, 0 replies; 13+ messages in thread
From: Alex Deucher @ 2020-08-13 13:36 UTC (permalink / raw)
  To: Nirmoy
  Cc: Deucher, Alexander, Andrey Grodzovsky, Nirmoy Das,
	Christian Koenig, amd-gfx list

On Thu, Aug 13, 2020 at 7:06 AM Nirmoy <nirmodas@amd.com> wrote:
>
>
> On 8/12/20 4:52 PM, Andrey Grodzovsky wrote:
> >
> > On 8/11/20 9:30 AM, Nirmoy Das wrote:
> >> This patch will ignore non-fatal errors and try to
> >> stop amdgpu's sw stack on fatal errors.
> >>
> >> Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
> >> ---
> >>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 56 ++++++++++++++++++++++++-
> >>   1 file changed, 54 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> >> index c1219af2e7d6..2b9ede3000ee 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> >> @@ -35,6 +35,7 @@
> >>   #include <linux/pm_runtime.h>
> >>   #include <linux/vga_switcheroo.h>
> >>   #include <drm/drm_probe_helper.h>
> >> +#include <drm/drm_atomic_helper.h>
> >>   #include <linux/mmu_notifier.h>
> >>     #include "amdgpu.h"
> >> @@ -1516,6 +1517,58 @@ static struct drm_driver kms_driver = {
> >>       .patchlevel = KMS_DRIVER_PATCHLEVEL,
> >>   };
> >>   +static pci_ers_result_t amdgpu_pci_err_detected(struct pci_dev *pdev,
> >> +                        pci_channel_state_t state)
> >> +{
> >> +    struct drm_device *dev = pci_get_drvdata(pdev);
> >> +    struct amdgpu_device *adev = dev->dev_private;
> >> +    int i;
> >> +    int ret = PCI_ERS_RESULT_DISCONNECT;
> >> +
> >> +    switch (state) {
> >> +    case pci_channel_io_normal:
> >> +        ret = PCI_ERS_RESULT_CAN_RECOVER;
> >> +        break;
> >> +    default:
> >> +        /* Disable power management */
> >> +        adev->runpm = 0;
> >> +        /* Suspend all IO operations */
> >> +        amdgpu_fbdev_set_suspend(adev, 1);
> >> + cancel_delayed_work_sync(&adev->delayed_init_work);
> >> +        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> >> +            struct amdgpu_ring *ring = adev->rings[i];
> >> +
> >> +            if (!ring || !ring->sched.thread)
> >> +                continue;
> >> +
> >> + amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
> >
> >
> > You need to call drm_sched_stop first before calling this
> >
> >> +        }
> >> +
> >> +        if (adev->mode_info.mode_config_initialized) {
> >> +            if (!amdgpu_device_has_dc_support(adev))
> >> +                drm_helper_force_disable_all(adev->ddev);
> >> +            else
> >> +                drm_atomic_helper_shutdown(adev->ddev);
> >> +        }
> >> +
> >> +        amdgpu_fence_driver_fini(adev);
> >> +        amdgpu_fbdev_fini(adev);
> >> +        /* Try to close drm device to stop applications
> >> +         * from opening dri files for further IO operations.
> >> +         * TODO: This will throw warning as ttm is not
> >> +         * cleaned perperly */
> >> +        drm_dev_fini(dev);
> >
> >
> > I think user mode applications might still hold reference to the drm
> > device through through drm_dev_get either by directly opening
> > the device file or indirectly through importing DMA buff, if so when
> > the last of them terminate
> > drm_dev_put->drm_dev_release->...->drm_dev_fini
> > might get called again causing use after free e.t.c issues. Maybe
> > better to call here drm_dev_put then and so drm_dev_fini will get
> > called when this
> > last user client releases his reference.
>
>
> drm_dev_fini() seems to be cleaner. Problem is  window manager(sway)
> never gets terminated after the AER error and drm files remains active.
> Simple cat on dri files
>
> goes though amdgpu and spits out more errors.
>
>
> >
> > Also a general question - in my work on DPC recovery feature which
> > tries to recover after PCIe error - once the PCI error has happened
> > MMIO registers become
> > unaccessible for r/w as the PCI link is dead until after the PCI link
> > is reset by the DPC driver (see
> > https://www.kernel.org/doc/html/latest/PCI/pci-error-recovery.html
> > section 6.1.4).
> > Your case is to try and gracefully to close the drm device once fatal
> > error happened, didn't you encounter errors or warnings when accessing
> > HW registers during any of the operations
> > above ?
>
>
> As discussed over chat, it seems aer generated with aer-inject tool just
> triggers kernel PCI error APIs but the device is still active so I
> didn't encounter any errors when accessing HW registers.

I think DPC is special because it triggers a special hardware mode
which resets the device.  Most other PCI errors do not.

Alex


>
>
> Nirmoy
>
>
> >
> > Andrey
> >
> >
> >> +        break;
> >> +    }
> >> +
> >> +    return ret;
> >> +}
> >> +
> >> +static const struct pci_error_handlers amdgpu_err_handler = {
> >> +       .error_detected = amdgpu_pci_err_detected,
> >> +};
> >> +
> >> +
> >>   static struct pci_driver amdgpu_kms_pci_driver = {
> >>       .name = DRIVER_NAME,
> >>       .id_table = pciidlist,
> >> @@ -1523,10 +1576,9 @@ static struct pci_driver amdgpu_kms_pci_driver
> >> = {
> >>       .remove = amdgpu_pci_remove,
> >>       .shutdown = amdgpu_pci_shutdown,
> >>       .driver.pm = &amdgpu_pm_ops,
> >> +    .err_handler = &amdgpu_err_handler,
> >>   };
> >>   -
> >> -
> >>   static int __init amdgpu_init(void)
> >>   {
> >>       int r;
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC PATCH 1/1] drm/amdgpu: add initial support for pci error handler
  2020-08-13 11:09   ` Nirmoy
  2020-08-13 13:36     ` Alex Deucher
@ 2020-08-13 13:38     ` Andrey Grodzovsky
  2020-08-13 15:06       ` Nirmoy
  1 sibling, 1 reply; 13+ messages in thread
From: Andrey Grodzovsky @ 2020-08-13 13:38 UTC (permalink / raw)
  To: Nirmoy, Nirmoy Das, amd-gfx; +Cc: alexander.deucher, christian.koenig


On 8/13/20 7:09 AM, Nirmoy wrote:
>
> On 8/12/20 4:52 PM, Andrey Grodzovsky wrote:
>>
>> On 8/11/20 9:30 AM, Nirmoy Das wrote:
>>> This patch will ignore non-fatal errors and try to
>>> stop amdgpu's sw stack on fatal errors.
>>>
>>> Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 56 ++++++++++++++++++++++++-
>>>   1 file changed, 54 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> index c1219af2e7d6..2b9ede3000ee 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> @@ -35,6 +35,7 @@
>>>   #include <linux/pm_runtime.h>
>>>   #include <linux/vga_switcheroo.h>
>>>   #include <drm/drm_probe_helper.h>
>>> +#include <drm/drm_atomic_helper.h>
>>>   #include <linux/mmu_notifier.h>
>>>     #include "amdgpu.h"
>>> @@ -1516,6 +1517,58 @@ static struct drm_driver kms_driver = {
>>>       .patchlevel = KMS_DRIVER_PATCHLEVEL,
>>>   };
>>>   +static pci_ers_result_t amdgpu_pci_err_detected(struct pci_dev *pdev,
>>> +                        pci_channel_state_t state)
>>> +{
>>> +    struct drm_device *dev = pci_get_drvdata(pdev);
>>> +    struct amdgpu_device *adev = dev->dev_private;
>>> +    int i;
>>> +    int ret = PCI_ERS_RESULT_DISCONNECT;
>>> +
>>> +    switch (state) {
>>> +    case pci_channel_io_normal:
>>> +        ret = PCI_ERS_RESULT_CAN_RECOVER;
>>> +        break;
>>> +    default:
>>> +        /* Disable power management */
>>> +        adev->runpm = 0;
>>> +        /* Suspend all IO operations */
>>> +        amdgpu_fbdev_set_suspend(adev, 1);
>>> + cancel_delayed_work_sync(&adev->delayed_init_work);
>>> +        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>> +            struct amdgpu_ring *ring = adev->rings[i];
>>> +
>>> +            if (!ring || !ring->sched.thread)
>>> +                continue;
>>> +
>>> + amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
>>
>>
>> You need to call drm_sched_stop first before calling this
>>
>>> +        }
>>> +
>>> +        if (adev->mode_info.mode_config_initialized) {
>>> +            if (!amdgpu_device_has_dc_support(adev))
>>> +                drm_helper_force_disable_all(adev->ddev);
>>> +            else
>>> +                drm_atomic_helper_shutdown(adev->ddev);
>>> +        }
>>> +
>>> +        amdgpu_fence_driver_fini(adev);
>>> +        amdgpu_fbdev_fini(adev);
>>> +        /* Try to close drm device to stop applications
>>> +         * from opening dri files for further IO operations.
>>> +         * TODO: This will throw warning as ttm is not
>>> +         * cleaned perperly */
>>> +        drm_dev_fini(dev);
>>
>>
>> I think user mode applications might still hold reference to the drm device 
>> through through drm_dev_get either by directly opening
>> the device file or indirectly through importing DMA buff, if so when the last 
>> of them terminate drm_dev_put->drm_dev_release->...->drm_dev_fini
>> might get called again causing use after free e.t.c issues. Maybe better to 
>> call here drm_dev_put then and so drm_dev_fini will get called when this
>> last user client releases his reference.
>
>
> drm_dev_fini() seems to be cleaner. Problem is  window manager(sway) never 
> gets terminated after the AER error and drm files remains active. Simple cat 
> on dri files
>
> goes though amdgpu and spits out more errors.


What happens if you kill the window manager after you have closed the drm
device with your original code applied? I would expect drm_dev_fini to be
called again, for the reason I explained above, and that would obviously be
wrong.

Andrey


>
>
>>
>> Also a general question - in my work on DPC recovery feature which tries to 
>> recover after PCIe error - once the PCI error has happened MMIO registers become
>> unaccessible for r/w as the PCI link is dead until after the PCI link is 
>> reset by the DPC driver (see 
>> https://www.kernel.org/doc/html/latest/PCI/pci-error-recovery.html section 
>> 6.1.4).
>> Your case is to try and gracefully to close the drm device once fatal error 
>> happened, didn't you encounter errors or warnings when accessing HW registers 
>> during any of the operations
>> above ?
>
>
> As discussed over chat, it seems aer generated with aer-inject tool just 
> triggers kernel PCI error APIs but the device is still active so I didn't 
> encounter any errors when accessing HW registers.
>
>
> Nirmoy
>
>
>>
>> Andrey
>>
>>
>>> +        break;
>>> +    }
>>> +
>>> +    return ret;
>>> +}
>>> +
>>> +static const struct pci_error_handlers amdgpu_err_handler = {
>>> +       .error_detected = amdgpu_pci_err_detected,
>>> +};
>>> +
>>> +
>>>   static struct pci_driver amdgpu_kms_pci_driver = {
>>>       .name = DRIVER_NAME,
>>>       .id_table = pciidlist,
>>> @@ -1523,10 +1576,9 @@ static struct pci_driver amdgpu_kms_pci_driver = {
>>>       .remove = amdgpu_pci_remove,
>>>       .shutdown = amdgpu_pci_shutdown,
>>>       .driver.pm = &amdgpu_pm_ops,
>>> +    .err_handler = &amdgpu_err_handler,
>>>   };
>>>   -
>>> -
>>>   static int __init amdgpu_init(void)
>>>   {
>>>       int r;
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC PATCH 1/1] drm/amdgpu: add initial support for pci error handler
  2020-08-13 13:38     ` Andrey Grodzovsky
@ 2020-08-13 15:06       ` Nirmoy
  2020-08-13 18:18         ` Andrey Grodzovsky
  0 siblings, 1 reply; 13+ messages in thread
From: Nirmoy @ 2020-08-13 15:06 UTC (permalink / raw)
  To: Andrey Grodzovsky, Nirmoy Das, amd-gfx
  Cc: alexander.deucher, christian.koenig


On 8/13/20 3:38 PM, Andrey Grodzovsky wrote:
>
> On 8/13/20 7:09 AM, Nirmoy wrote:
>>
>> On 8/12/20 4:52 PM, Andrey Grodzovsky wrote:
>>>
>>> On 8/11/20 9:30 AM, Nirmoy Das wrote:
>>>> This patch will ignore non-fatal errors and try to
>>>> stop amdgpu's sw stack on fatal errors.
>>>>
>>>> Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 56 
>>>> ++++++++++++++++++++++++-
>>>>   1 file changed, 54 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> index c1219af2e7d6..2b9ede3000ee 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> @@ -35,6 +35,7 @@
>>>>   #include <linux/pm_runtime.h>
>>>>   #include <linux/vga_switcheroo.h>
>>>>   #include <drm/drm_probe_helper.h>
>>>> +#include <drm/drm_atomic_helper.h>
>>>>   #include <linux/mmu_notifier.h>
>>>>     #include "amdgpu.h"
>>>> @@ -1516,6 +1517,58 @@ static struct drm_driver kms_driver = {
>>>>       .patchlevel = KMS_DRIVER_PATCHLEVEL,
>>>>   };
>>>>   +static pci_ers_result_t amdgpu_pci_err_detected(struct pci_dev 
>>>> *pdev,
>>>> +                        pci_channel_state_t state)
>>>> +{
>>>> +    struct drm_device *dev = pci_get_drvdata(pdev);
>>>> +    struct amdgpu_device *adev = dev->dev_private;
>>>> +    int i;
>>>> +    int ret = PCI_ERS_RESULT_DISCONNECT;
>>>> +
>>>> +    switch (state) {
>>>> +    case pci_channel_io_normal:
>>>> +        ret = PCI_ERS_RESULT_CAN_RECOVER;
>>>> +        break;
>>>> +    default:
>>>> +        /* Disable power management */
>>>> +        adev->runpm = 0;
>>>> +        /* Suspend all IO operations */
>>>> +        amdgpu_fbdev_set_suspend(adev, 1);
>>>> + cancel_delayed_work_sync(&adev->delayed_init_work);
>>>> +        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>>> +            struct amdgpu_ring *ring = adev->rings[i];
>>>> +
>>>> +            if (!ring || !ring->sched.thread)
>>>> +                continue;
>>>> +
>>>> + amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
>>>
>>>
>>> You need to call drm_sched_stop first before calling this
>>>
>>>> +        }
>>>> +
>>>> +        if (adev->mode_info.mode_config_initialized) {
>>>> +            if (!amdgpu_device_has_dc_support(adev))
>>>> + drm_helper_force_disable_all(adev->ddev);
>>>> +            else
>>>> +                drm_atomic_helper_shutdown(adev->ddev);
>>>> +        }
>>>> +
>>>> +        amdgpu_fence_driver_fini(adev);
>>>> +        amdgpu_fbdev_fini(adev);
>>>> +        /* Try to close drm device to stop applications
>>>> +         * from opening dri files for further IO operations.
>>>> +         * TODO: This will throw warning as ttm is not
>>>> +         * cleaned perperly */
>>>> +        drm_dev_fini(dev);
>>>
>>>
>>> I think user mode applications might still hold reference to the drm 
>>> device through through drm_dev_get either by directly opening
>>> the device file or indirectly through importing DMA buff, if so when 
>>> the last of them terminate 
>>> drm_dev_put->drm_dev_release->...->drm_dev_fini
>>> might get called again causing use after free e.t.c issues. Maybe 
>>> better to call here drm_dev_put then and so drm_dev_fini will get 
>>> called when this
>>> last user client releases his reference.
>>
>>
>> drm_dev_fini() seems to be cleaner. Problem is  window manager(sway) 
>> never gets terminated after the AER error and drm files remains 
>> active. Simple cat on dri files
>>
>> goes though amdgpu and spits out more errors.
>
>
> What happens if you kill the window manager after you closed drm 
> device with your original code applied ? I would expect drm_dev_fini 
> to be called again
> for the reason i explained above and this would obviously would be 
> wrong to happen.

Hi Andrey,


Hmm, I quickly tried that; the kernel crashed and later rebooted after
some time. I don't have a serial console to check the logs, and there
were no logs afterwards in journalctl.

drm_dev_put() had similar behavior; the kernel/machine was inaccessible
over ssh.


Did you face the same behavior while testing GPU hotplug?


Nirmoy


>
> Andrey
>
>
>>
>>
>>>
>>> Also a general question - in my work on DPC recovery feature which 
>>> tries to recover after PCIe error - once the PCI error has happened 
>>> MMIO registers become
>>> unaccessible for r/w as the PCI link is dead until after the PCI 
>>> link is reset by the DPC driver (see 
>>> https://www.kernel.org/doc/html/latest/PCI/pci-error-recovery.html 
>>> section 6.1.4).
>>> Your case is to try and gracefully to close the drm device once 
>>> fatal error happened, didn't you encounter errors or warnings when 
>>> accessing HW registers during any of the operations
>>> above ?
>>
>>
>> As discussed over chat, it seems aer generated with aer-inject tool 
>> just triggers kernel PCI error APIs but the device is still active so 
>> I didn't encounter any errors when accessing HW registers.
>>
>>
>> Nirmoy
>>
>>
>>>
>>> Andrey
>>>
>>>
>>>> +        break;
>>>> +    }
>>>> +
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +static const struct pci_error_handlers amdgpu_err_handler = {
>>>> +       .error_detected = amdgpu_pci_err_detected,
>>>> +};
>>>> +
>>>> +
>>>>   static struct pci_driver amdgpu_kms_pci_driver = {
>>>>       .name = DRIVER_NAME,
>>>>       .id_table = pciidlist,
>>>> @@ -1523,10 +1576,9 @@ static struct pci_driver 
>>>> amdgpu_kms_pci_driver = {
>>>>       .remove = amdgpu_pci_remove,
>>>>       .shutdown = amdgpu_pci_shutdown,
>>>>       .driver.pm = &amdgpu_pm_ops,
>>>> +    .err_handler = &amdgpu_err_handler,
>>>>   };
>>>>   -
>>>> -
>>>>   static int __init amdgpu_init(void)
>>>>   {
>>>>       int r;
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC PATCH 1/1] drm/amdgpu: add initial support for pci error handler
  2020-08-13 15:06       ` Nirmoy
@ 2020-08-13 18:18         ` Andrey Grodzovsky
  2020-08-13 21:17           ` Luben Tuikov
  0 siblings, 1 reply; 13+ messages in thread
From: Andrey Grodzovsky @ 2020-08-13 18:18 UTC (permalink / raw)
  To: Nirmoy, Nirmoy Das, amd-gfx; +Cc: alexander.deucher, christian.koenig


On 8/13/20 11:06 AM, Nirmoy wrote:
>
> On 8/13/20 3:38 PM, Andrey Grodzovsky wrote:
>>
>> On 8/13/20 7:09 AM, Nirmoy wrote:
>>>
>>> On 8/12/20 4:52 PM, Andrey Grodzovsky wrote:
>>>>
>>>> On 8/11/20 9:30 AM, Nirmoy Das wrote:
>>>>> This patch will ignore non-fatal errors and try to
>>>>> stop amdgpu's sw stack on fatal errors.
>>>>>
>>>>> Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
>>>>> ---
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 56 ++++++++++++++++++++++++-
>>>>>   1 file changed, 54 insertions(+), 2 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>> index c1219af2e7d6..2b9ede3000ee 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>> @@ -35,6 +35,7 @@
>>>>>   #include <linux/pm_runtime.h>
>>>>>   #include <linux/vga_switcheroo.h>
>>>>>   #include <drm/drm_probe_helper.h>
>>>>> +#include <drm/drm_atomic_helper.h>
>>>>>   #include <linux/mmu_notifier.h>
>>>>>     #include "amdgpu.h"
>>>>> @@ -1516,6 +1517,58 @@ static struct drm_driver kms_driver = {
>>>>>       .patchlevel = KMS_DRIVER_PATCHLEVEL,
>>>>>   };
>>>>>   +static pci_ers_result_t amdgpu_pci_err_detected(struct pci_dev *pdev,
>>>>> +                        pci_channel_state_t state)
>>>>> +{
>>>>> +    struct drm_device *dev = pci_get_drvdata(pdev);
>>>>> +    struct amdgpu_device *adev = dev->dev_private;
>>>>> +    int i;
>>>>> +    int ret = PCI_ERS_RESULT_DISCONNECT;
>>>>> +
>>>>> +    switch (state) {
>>>>> +    case pci_channel_io_normal:
>>>>> +        ret = PCI_ERS_RESULT_CAN_RECOVER;
>>>>> +        break;
>>>>> +    default:
>>>>> +        /* Disable power management */
>>>>> +        adev->runpm = 0;
>>>>> +        /* Suspend all IO operations */
>>>>> +        amdgpu_fbdev_set_suspend(adev, 1);
>>>>> + cancel_delayed_work_sync(&adev->delayed_init_work);
>>>>> +        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>>>> +            struct amdgpu_ring *ring = adev->rings[i];
>>>>> +
>>>>> +            if (!ring || !ring->sched.thread)
>>>>> +                continue;
>>>>> +
>>>>> + amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
>>>>
>>>>
>>>> You need to call drm_sched_stop first before calling this
>>>>
>>>>> +        }
>>>>> +
>>>>> +        if (adev->mode_info.mode_config_initialized) {
>>>>> +            if (!amdgpu_device_has_dc_support(adev))
>>>>> + drm_helper_force_disable_all(adev->ddev);
>>>>> +            else
>>>>> + drm_atomic_helper_shutdown(adev->ddev);
>>>>> +        }
>>>>> +
>>>>> +        amdgpu_fence_driver_fini(adev);
>>>>> +        amdgpu_fbdev_fini(adev);
>>>>> +        /* Try to close drm device to stop applications
>>>>> +         * from opening dri files for further IO operations.
>>>>> +         * TODO: This will throw warning as ttm is not
>>>>> +         * cleaned perperly */
>>>>> +        drm_dev_fini(dev);
>>>>
>>>>
>>>> I think user mode applications might still hold reference to the drm device 
>>>> through through drm_dev_get either by directly opening
>>>> the device file or indirectly through importing DMA buff, if so when the 
>>>> last of them terminate drm_dev_put->drm_dev_release->...->drm_dev_fini
>>>> might get called again causing use after free e.t.c issues. Maybe better to 
>>>> call here drm_dev_put then and so drm_dev_fini will get called when this
>>>> last user client releases his reference.
>>>
>>>
>>> drm_dev_fini() seems to be cleaner. Problem is  window manager(sway) never 
>>> gets terminated after the AER error and drm files remains active. Simple cat 
>>> on dri files
>>>
>>> goes though amdgpu and spits out more errors.
>>
>>
>> What happens if you kill the window manager after you closed drm device with 
>> your original code applied ? I would expect drm_dev_fini to be called again
>> for the reason i explained above and this would obviously would be wrong to 
>> happen.
>
> Hi Andrey,
>
>
> hmm I quickly tried that, Kernel crashed and later rebooted after sometime. I 
> don't have a serial console to check logs and there was no logs afterwards in 
> journalctl.
>
> drm_dev_put() had similar behavior, kernel/machine was inaccessible over ssh.
>
>
> Did you face same behavior while testing gpu hotplug ?
>
>
> Nirmoy


Yea, in my case the device sysfs structure was removed on pci_remove; then,
when the last user client dropped its reference and drm_dev_fini got called,
there were more sysfs entry removals in there, which led to a crash. But here
I don't think the sysfs for the drm_device is removed, because the device is
not extracted...

Andrey


>
>
>>
>> Andrey
>>
>>
>>>
>>>
>>>>
>>>> Also a general question - in my work on DPC recovery feature which tries to 
>>>> recover after PCIe error - once the PCI error has happened MMIO registers 
>>>> become
>>>> unaccessible for r/w as the PCI link is dead until after the PCI link is 
>>>> reset by the DPC driver (see 
>>>> https://www.kernel.org/doc/html/latest/PCI/pci-error-recovery.html section 
>>>> 6.1.4).
>>>> Your case is to try and gracefully to close the drm device once fatal error 
>>>> happened, didn't you encounter errors or warnings when accessing HW 
>>>> registers during any of the operations
>>>> above ?
>>>
>>>
>>> As discussed over chat, it seems aer generated with aer-inject tool just 
>>> triggers kernel PCI error APIs but the device is still active so I didn't 
>>> encounter any errors when accessing HW registers.
>>>
>>>
>>> Nirmoy
>>>
>>>
>>>>
>>>> Andrey
>>>>
>>>>
>>>>> +        break;
>>>>> +    }
>>>>> +
>>>>> +    return ret;
>>>>> +}
>>>>> +
>>>>> +static const struct pci_error_handlers amdgpu_err_handler = {
>>>>> +       .error_detected = amdgpu_pci_err_detected,
>>>>> +};
>>>>> +
>>>>> +
>>>>>   static struct pci_driver amdgpu_kms_pci_driver = {
>>>>>       .name = DRIVER_NAME,
>>>>>       .id_table = pciidlist,
>>>>> @@ -1523,10 +1576,9 @@ static struct pci_driver amdgpu_kms_pci_driver = {
>>>>>       .remove = amdgpu_pci_remove,
>>>>>       .shutdown = amdgpu_pci_shutdown,
>>>>>       .driver.pm = &amdgpu_pm_ops,
>>>>> +    .err_handler = &amdgpu_err_handler,
>>>>>   };
>>>>>   -
>>>>> -
>>>>>   static int __init amdgpu_init(void)
>>>>>   {
>>>>>       int r;
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC PATCH 1/1] drm/amdgpu: add initial support for pci error handler
  2020-08-13 18:18         ` Andrey Grodzovsky
@ 2020-08-13 21:17           ` Luben Tuikov
  2020-08-13 21:30             ` Luben Tuikov
  2020-08-14 15:23             ` Nirmoy
  0 siblings, 2 replies; 13+ messages in thread
From: Luben Tuikov @ 2020-08-13 21:17 UTC (permalink / raw)
  To: Andrey Grodzovsky, Nirmoy, Nirmoy Das, amd-gfx
  Cc: alexander.deucher, christian.koenig

I support having AER handling.

However, I feel it should be offloaded to the DRM layer.
The PCI driver gets the AER callback and immediately
offloads into DRM, as "return drm_aer_recover(dev); }".
The DRM layer then drives the error recovery procedure top-down.

The PCI device driver provides (would/should?) a capability
(at registration) which the DRM layer would inspect and
subsequently call into the PCI driver's low-level methods
to recover the error or to reset the device.

But it should be a top-down approach. I believe the thread
below has somehow hinted at this.

The implementation below boils down to:

	If recoverable error, all is good.
	If unrecoverable error, then
		disable power management,
		suspend I/O operations,
		cancel pending work,
		call into PCI driver to clear
			any state it keeps,
		call into PCI driver to reset display control.
		Etc.

And this regime could be performed by DRM.

And as you can see now, the function implemented below *calls into* (that's
the key here!) DRM, and it should be the other way around: the DRM layer
should call into the PCI driver, after the PCI driver's callback immediately
hands off into DRM, as outlined above.

This abstraction could be expanded to more concepts of PCIe GPU drivers,
and it would scale well, beyond PCIe as a protocol for graphics.
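
As a purely hypothetical sketch of that shape (drm_aer_recover() and the
ops table below don't exist today; they only illustrate the idea):

static pci_ers_result_t amdgpu_pci_err_detected(struct pci_dev *pdev,
						pci_channel_state_t state)
{
	/* hand the whole recovery sequence over to the DRM layer */
	return drm_aer_recover(pci_get_drvdata(pdev), state);
}

/* ...which would then call back into low-level methods the driver
 * registered at init time, for example:
 */
struct drm_pci_err_funcs {
	void (*stop_schedulers)(struct drm_device *dev);
	void (*reset_display)(struct drm_device *dev);
	void (*clear_hw_state)(struct drm_device *dev);
};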

> +static const struct pci_error_handlers amdgpu_err_handler = {

That's too generic a name for this. I'd rather add "pci" in there,

static const struct pci_error_handlers amdgpu_pci_err_handler = {
	.element = init,
	...
};

Being a singular noun from the outset is good and this is preserved.

> +       .error_detected = amdgpu_pci_err_detected,
> +};
> +
> +
>  static struct pci_driver amdgpu_kms_pci_driver = {
>  	.name = DRIVER_NAME,
>  	.id_table = pciidlist,
> @@ -1523,10 +1576,9 @@ static struct pci_driver amdgpu_kms_pci_driver = {
>  	.remove = amdgpu_pci_remove,
>  	.shutdown = amdgpu_pci_shutdown,
>  	.driver.pm = &amdgpu_pm_ops,
> +	.err_handler = &amdgpu_err_handler,

".err_handler = &amdgpu_pci_err_handler,"


Regards,
Luben

On 2020-08-13 2:18 p.m., Andrey Grodzovsky wrote:
> 
> On 8/13/20 11:06 AM, Nirmoy wrote:
>>
>> On 8/13/20 3:38 PM, Andrey Grodzovsky wrote:
>>>
>>> On 8/13/20 7:09 AM, Nirmoy wrote:
>>>>
>>>> On 8/12/20 4:52 PM, Andrey Grodzovsky wrote:
>>>>>
>>>>> On 8/11/20 9:30 AM, Nirmoy Das wrote:
>>>>>> This patch will ignore non-fatal errors and try to
>>>>>> stop amdgpu's sw stack on fatal errors.
>>>>>>
>>>>>> Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
>>>>>> ---
>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 56 ++++++++++++++++++++++++-
>>>>>>   1 file changed, 54 insertions(+), 2 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>>> index c1219af2e7d6..2b9ede3000ee 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>>> @@ -35,6 +35,7 @@
>>>>>>   #include <linux/pm_runtime.h>
>>>>>>   #include <linux/vga_switcheroo.h>
>>>>>>   #include <drm/drm_probe_helper.h>
>>>>>> +#include <drm/drm_atomic_helper.h>
>>>>>>   #include <linux/mmu_notifier.h>
>>>>>>     #include "amdgpu.h"
>>>>>> @@ -1516,6 +1517,58 @@ static struct drm_driver kms_driver = {
>>>>>>       .patchlevel = KMS_DRIVER_PATCHLEVEL,
>>>>>>   };
>>>>>>   +static pci_ers_result_t amdgpu_pci_err_detected(struct pci_dev *pdev,
>>>>>> +                        pci_channel_state_t state)
>>>>>> +{
>>>>>> +    struct drm_device *dev = pci_get_drvdata(pdev);
>>>>>> +    struct amdgpu_device *adev = dev->dev_private;
>>>>>> +    int i;
>>>>>> +    int ret = PCI_ERS_RESULT_DISCONNECT;
>>>>>> +
>>>>>> +    switch (state) {
>>>>>> +    case pci_channel_io_normal:
>>>>>> +        ret = PCI_ERS_RESULT_CAN_RECOVER;
>>>>>> +        break;
>>>>>> +    default:
>>>>>> +        /* Disable power management */
>>>>>> +        adev->runpm = 0;
>>>>>> +        /* Suspend all IO operations */
>>>>>> +        amdgpu_fbdev_set_suspend(adev, 1);
>>>>>> + cancel_delayed_work_sync(&adev->delayed_init_work);
>>>>>> +        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>>>>> +            struct amdgpu_ring *ring = adev->rings[i];
>>>>>> +
>>>>>> +            if (!ring || !ring->sched.thread)
>>>>>> +                continue;
>>>>>> +
>>>>>> + amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
>>>>>
>>>>>
>>>>> You need to call drm_sched_stop first before calling this
>>>>>
>>>>>> +        }
>>>>>> +
>>>>>> +        if (adev->mode_info.mode_config_initialized) {
>>>>>> +            if (!amdgpu_device_has_dc_support(adev))
>>>>>> + drm_helper_force_disable_all(adev->ddev);
>>>>>> +            else
>>>>>> + drm_atomic_helper_shutdown(adev->ddev);
>>>>>> +        }
>>>>>> +
>>>>>> +        amdgpu_fence_driver_fini(adev);
>>>>>> +        amdgpu_fbdev_fini(adev);
>>>>>> +        /* Try to close drm device to stop applications
>>>>>> +         * from opening dri files for further IO operations.
>>>>>> +         * TODO: This will throw warning as ttm is not
>>>>>> +         * cleaned perperly */
>>>>>> +        drm_dev_fini(dev);
>>>>>
>>>>>
>>>>> I think user mode applications might still hold reference to the drm device 
>>>>> through through drm_dev_get either by directly opening
>>>>> the device file or indirectly through importing DMA buff, if so when the 
>>>>> last of them terminate drm_dev_put->drm_dev_release->...->drm_dev_fini
>>>>> might get called again causing use after free e.t.c issues. Maybe better to 
>>>>> call here drm_dev_put then and so drm_dev_fini will get called when this
>>>>> last user client releases his reference.
>>>>
>>>>
>>>> drm_dev_fini() seems to be cleaner. Problem is  window manager(sway) never 
>>>> gets terminated after the AER error and drm files remains active. Simple cat 
>>>> on dri files
>>>>
>>>> goes though amdgpu and spits out more errors.
>>>
>>>
>>> What happens if you kill the window manager after you closed drm device with 
>>> your original code applied ? I would expect drm_dev_fini to be called again
>>> for the reason i explained above and this would obviously would be wrong to 
>>> happen.
>>
>> Hi Andrey,
>>
>>
>> hmm I quickly tried that, Kernel crashed and later rebooted after sometime. I 
>> don't have a serial console to check logs and there was no logs afterwards in 
>> journalctl.
>>
>> drm_dev_put() had similar behavior, kernel/machine was inaccessible over ssh.
>>
>>
>> Did you face same behavior while testing gpu hotplug ?
>>
>>
>> Nirmoy
> 
> 
> Yea, in my case device sysfs structure was removed on pci_remove while when last 
> user client dropped reference and this led
> to drm_dev_fini to be called there were more sysfs entries removal there which 
> lead to a crash. But here i don't think the sysfs for drm_device
> is removed because the device is not extracted...
> 
> Andrey
> 
> 
>>
>>
>>>
>>> Andrey
>>>
>>>
>>>>
>>>>
>>>>>
>>>>> Also a general question - in my work on DPC recovery feature which tries to 
>>>>> recover after PCIe error - once the PCI error has happened MMIO registers 
>>>>> become
>>>>> unaccessible for r/w as the PCI link is dead until after the PCI link is 
>>>>> reset by the DPC driver (see 
>>>>> https://www.kernel.org/doc/html/latest/PCI/pci-error-recovery.html section 
>>>>> 6.1.4).
>>>>> Your case is to try and gracefully to close the drm device once fatal error 
>>>>> happened, didn't you encounter errors or warnings when accessing HW 
>>>>> registers during any of the operations
>>>>> above ?
>>>>
>>>>
>>>> As discussed over chat, it seems aer generated with aer-inject tool just 
>>>> triggers kernel PCI error APIs but the device is still active so I didn't 
>>>> encounter any errors when accessing HW registers.
>>>>
>>>>
>>>> Nirmoy
>>>>
>>>>
>>>>>
>>>>> Andrey
>>>>>
>>>>>
>>>>>> +        break;
>>>>>> +    }
>>>>>> +
>>>>>> +    return ret;
>>>>>> +}
>>>>>> +
>>>>>> +static const struct pci_error_handlers amdgpu_err_handler = {
>>>>>> +       .error_detected = amdgpu_pci_err_detected,
>>>>>> +};
>>>>>> +
>>>>>> +
>>>>>>   static struct pci_driver amdgpu_kms_pci_driver = {
>>>>>>       .name = DRIVER_NAME,
>>>>>>       .id_table = pciidlist,
>>>>>> @@ -1523,10 +1576,9 @@ static struct pci_driver amdgpu_kms_pci_driver = {
>>>>>>       .remove = amdgpu_pci_remove,
>>>>>>       .shutdown = amdgpu_pci_shutdown,
>>>>>>       .driver.pm = &amdgpu_pm_ops,
>>>>>> +    .err_handler = &amdgpu_err_handler,
>>>>>>   };
>>>>>>   -
>>>>>> -
>>>>>>   static int __init amdgpu_init(void)
>>>>>>   {
>>>>>>       int r;
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> 

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC PATCH 1/1] drm/amdgpu: add initial support for pci error handler
  2020-08-13 21:17           ` Luben Tuikov
@ 2020-08-13 21:30             ` Luben Tuikov
  2020-08-14 15:23             ` Nirmoy
  1 sibling, 0 replies; 13+ messages in thread
From: Luben Tuikov @ 2020-08-13 21:30 UTC (permalink / raw)
  To: Andrey Grodzovsky, Nirmoy, Nirmoy Das, amd-gfx
  Cc: alexander.deucher, christian.koenig

One more nitpick:

> +static pci_ers_result_t amdgpu_pci_err_detected(struct pci_dev *pdev,
> +						pci_channel_state_t state)
> +{
> +	struct drm_device *dev = pci_get_drvdata(pdev);

That's the name of a state, a state of "pci error detected".
I'd rather call it "amdgpu_pci_err_recovery()" as this
is what this function would try to *do*.

Regards,
Luben

On 2020-08-13 5:17 p.m., Luben Tuikov wrote:
> I support having AER handling.
> 
> However, I feel it should be offloaded to the DRM layer.
> The PCI driver gets the AER callback and immediately
> offloads into DRM, as "return drm_aer_recover(dev); }".
> The DRM layer does a top-down approach into the error
> recovery procedure.
> 
> The PCI device driver provides (would/should?) a capability
> (at registration) which the DRM layer would inspect and
> subsequently call into the PCI driver's low-level methods
> to recover the error or to reset the device.
> 
> But it should be a top-down approach. I believe the thread
> below has somehow hinted at this.
> 
> The implementation below boils down to:
> 
> 	If recoverable error, all is good.
> 	If unrecoverable error, then
> 		disable power management,
> 		suspend I/O operations,
> 		cancel pending work,
> 		call into PCI driver to clear
> 			any state it keeps,
> 		call into PCI driver to reset display control.
> 		Etc.
> 
> And this regime could be performed by DRM.
> 
> And as you can see now, the function implemented below,
> *calls into* (that's the key here!) DRM, and it should be
> the other way around--the DRM should call into the PCI driver,
> after the PCI driver's callback immediately calls into DRM,
> as outlined above.
> 
> This abstraction could be expanded to more concepts of PCIe GPU drivers,
> and it would scale well, beyond PCIe as a protocol for graphics.
> 
>> +static const struct pci_error_handlers amdgpu_err_handler = {
> 
> That's too generic a name for this. I'd rather add "pci" in there,
> 
> static const struct pci_error_handlers amdgpu_pci_err_handler =  {
> 	.element = init,
> 	...
> };
> 
> Being a singular noun from the outset is good and this is preserved.
> 
>> +       .error_detected = amdgpu_pci_err_detected,
>> +};
>> +
>> +
>>  static struct pci_driver amdgpu_kms_pci_driver = {
>>  	.name = DRIVER_NAME,
>>  	.id_table = pciidlist,
>> @@ -1523,10 +1576,9 @@ static struct pci_driver amdgpu_kms_pci_driver = {
>>  	.remove = amdgpu_pci_remove,
>>  	.shutdown = amdgpu_pci_shutdown,
>>  	.driver.pm = &amdgpu_pm_ops,
>> +	.err_handler = &amdgpu_err_handler,
> 
> ".err_handler = amdgpu_pci_err_handler,"
> 
> 
> Regards,
> Luben
> 
> On 2020-08-13 2:18 p.m., Andrey Grodzovsky wrote:
>>
>> On 8/13/20 11:06 AM, Nirmoy wrote:
>>>
>>> On 8/13/20 3:38 PM, Andrey Grodzovsky wrote:
>>>>
>>>> On 8/13/20 7:09 AM, Nirmoy wrote:
>>>>>
>>>>> On 8/12/20 4:52 PM, Andrey Grodzovsky wrote:
>>>>>>
>>>>>> On 8/11/20 9:30 AM, Nirmoy Das wrote:
>>>>>>> This patch will ignore non-fatal errors and try to
>>>>>>> stop amdgpu's sw stack on fatal errors.
>>>>>>>
>>>>>>> Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
>>>>>>> ---
>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 56 ++++++++++++++++++++++++-
>>>>>>>   1 file changed, 54 insertions(+), 2 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>>>> index c1219af2e7d6..2b9ede3000ee 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>>>> @@ -35,6 +35,7 @@
>>>>>>>   #include <linux/pm_runtime.h>
>>>>>>>   #include <linux/vga_switcheroo.h>
>>>>>>>   #include <drm/drm_probe_helper.h>
>>>>>>> +#include <drm/drm_atomic_helper.h>
>>>>>>>   #include <linux/mmu_notifier.h>
>>>>>>>     #include "amdgpu.h"
>>>>>>> @@ -1516,6 +1517,58 @@ static struct drm_driver kms_driver = {
>>>>>>>       .patchlevel = KMS_DRIVER_PATCHLEVEL,
>>>>>>>   };
>>>>>>>   +static pci_ers_result_t amdgpu_pci_err_detected(struct pci_dev *pdev,
>>>>>>> +                        pci_channel_state_t state)
>>>>>>> +{
>>>>>>> +    struct drm_device *dev = pci_get_drvdata(pdev);
>>>>>>> +    struct amdgpu_device *adev = dev->dev_private;
>>>>>>> +    int i;
>>>>>>> +    int ret = PCI_ERS_RESULT_DISCONNECT;
>>>>>>> +
>>>>>>> +    switch (state) {
>>>>>>> +    case pci_channel_io_normal:
>>>>>>> +        ret = PCI_ERS_RESULT_CAN_RECOVER;
>>>>>>> +        break;
>>>>>>> +    default:
>>>>>>> +        /* Disable power management */
>>>>>>> +        adev->runpm = 0;
>>>>>>> +        /* Suspend all IO operations */
>>>>>>> +        amdgpu_fbdev_set_suspend(adev, 1);
>>>>>>> + cancel_delayed_work_sync(&adev->delayed_init_work);
>>>>>>> +        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>>>>>> +            struct amdgpu_ring *ring = adev->rings[i];
>>>>>>> +
>>>>>>> +            if (!ring || !ring->sched.thread)
>>>>>>> +                continue;
>>>>>>> +
>>>>>>> + amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
>>>>>>
>>>>>>
>>>>>> You need to call drm_sched_stop first before calling this
>>>>>>
>>>>>>> +        }
>>>>>>> +
>>>>>>> +        if (adev->mode_info.mode_config_initialized) {
>>>>>>> +            if (!amdgpu_device_has_dc_support(adev))
>>>>>>> + drm_helper_force_disable_all(adev->ddev);
>>>>>>> +            else
>>>>>>> + drm_atomic_helper_shutdown(adev->ddev);
>>>>>>> +        }
>>>>>>> +
>>>>>>> +        amdgpu_fence_driver_fini(adev);
>>>>>>> +        amdgpu_fbdev_fini(adev);
>>>>>>> +        /* Try to close drm device to stop applications
>>>>>>> +         * from opening dri files for further IO operations.
>>>>>>> +         * TODO: This will throw warning as ttm is not
>>>>>>> +         * cleaned perperly */
>>>>>>> +        drm_dev_fini(dev);
>>>>>>
>>>>>>
>>>>>> I think user mode applications might still hold reference to the drm device 
>>>>>> through through drm_dev_get either by directly opening
>>>>>> the device file or indirectly through importing DMA buff, if so when the 
>>>>>> last of them terminate drm_dev_put->drm_dev_release->...->drm_dev_fini
>>>>>> might get called again causing use after free e.t.c issues. Maybe better to 
>>>>>> call here drm_dev_put then and so drm_dev_fini will get called when this
>>>>>> last user client releases his reference.
>>>>>
>>>>>
>>>>> drm_dev_fini() seems to be cleaner. Problem is  window manager(sway) never 
>>>>> gets terminated after the AER error and drm files remains active. Simple cat 
>>>>> on dri files
>>>>>
>>>>> goes though amdgpu and spits out more errors.
>>>>
>>>>
>>>> What happens if you kill the window manager after you closed drm device with 
>>>> your original code applied ? I would expect drm_dev_fini to be called again
>>>> for the reason i explained above and this would obviously would be wrong to 
>>>> happen.
>>>
>>> Hi Andrey,
>>>
>>>
>>> hmm I quickly tried that, Kernel crashed and later rebooted after sometime. I 
>>> don't have a serial console to check logs and there was no logs afterwards in 
>>> journalctl.
>>>
>>> drm_dev_put() had similar behavior, kernel/machine was inaccessible over ssh.
>>>
>>>
>>> Did you face same behavior while testing gpu hotplug ?
>>>
>>>
>>> Nirmoy
>>
>>
>> Yea, in my case device sysfs structure was removed on pci_remove while when last 
>> user client dropped reference and this led
>> to drm_dev_fini to be called there were more sysfs entries removal there which 
>> lead to a crash. But here i don't think the sysfs for drm_device
>> is removed because the device is not extracted...
>>
>> Andrey
>>
>>
>>>
>>>
>>>>
>>>> Andrey
>>>>
>>>>
>>>>>
>>>>>
>>>>>>
>>>>>> Also a general question - in my work on DPC recovery feature which tries to 
>>>>>> recover after PCIe error - once the PCI error has happened MMIO registers 
>>>>>> become
>>>>>> unaccessible for r/w as the PCI link is dead until after the PCI link is 
>>>>>> reset by the DPC driver (see 
>>>>>> https://www.kernel.org/doc/html/latest/PCI/pci-error-recovery.html section 
>>>>>> 6.1.4).
>>>>>> Your case is to try and gracefully to close the drm device once fatal error 
>>>>>> happened, didn't you encounter errors or warnings when accessing HW 
>>>>>> registers during any of the operations
>>>>>> above ?
>>>>>
>>>>>
>>>>> As discussed over chat, it seems an AER generated with the aer-inject tool just
>>>>> triggers the kernel PCI error APIs while the device stays active, so I didn't
>>>>> encounter any errors when accessing HW registers.
>>>>>
>>>>>
>>>>> Nirmoy
>>>>>
>>>>>
>>>>>>
>>>>>> Andrey
>>>>>>
>>>>>>
>>>>>>> +        break;
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    return ret;
>>>>>>> +}
>>>>>>> +
>>>>>>> +static const struct pci_error_handlers amdgpu_err_handler = {
>>>>>>> +       .error_detected = amdgpu_pci_err_detected,
>>>>>>> +};
>>>>>>> +
>>>>>>> +
>>>>>>>   static struct pci_driver amdgpu_kms_pci_driver = {
>>>>>>>       .name = DRIVER_NAME,
>>>>>>>       .id_table = pciidlist,
>>>>>>> @@ -1523,10 +1576,9 @@ static struct pci_driver amdgpu_kms_pci_driver = {
>>>>>>>       .remove = amdgpu_pci_remove,
>>>>>>>       .shutdown = amdgpu_pci_shutdown,
>>>>>>>       .driver.pm = &amdgpu_pm_ops,
>>>>>>> +    .err_handler = &amdgpu_err_handler,
>>>>>>>   };
>>>>>>>   -
>>>>>>> -
>>>>>>>   static int __init amdgpu_init(void)
>>>>>>>   {
>>>>>>>       int r;

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

* Re: [RFC PATCH 1/1] drm/amdgpu: add initial support for pci error handler
  2020-08-13 21:17           ` Luben Tuikov
  2020-08-13 21:30             ` Luben Tuikov
@ 2020-08-14 15:23             ` Nirmoy
  2020-08-14 19:52               ` Luben Tuikov
  1 sibling, 1 reply; 13+ messages in thread
From: Nirmoy @ 2020-08-14 15:23 UTC (permalink / raw)
  To: Luben Tuikov, Andrey Grodzovsky, Nirmoy Das, amd-gfx
  Cc: alexander.deucher, christian.koenig


On 8/13/20 11:17 PM, Luben Tuikov wrote:
> I support having AER handling.
>
> However, I feel it should be offloaded to the DRM layer.
> The PCI driver gets the AER callback and immediately
> offloads into DRM, as "return drm_aer_recover(dev); }".
> The DRM layer does a top-down approach into the error
> recovery procedure.
>
> The PCI device driver provides (would/should?) a capability
> (at registration) which the DRM layer would inspect and
> subsequently call into the PCI driver's low-level methods
> to recover the error or to reset the device.
>
> But it should be a top-down approach. I believe the thread
> below has somehow hinted at this.
>
> The implementation below boils down to:
>
> 	If recoverable error, all is good.
> 	If unrecoverable error, then
> 		disable power management,
> 		suspend I/O operations,
> 		cancel pending work,
> 		call into PCI driver to clear
> 			any state it keeps,
> 		call into PCI driver to reset display control.
> 		Etc.
>
> And this regime could be performed by DRM.
>
> And as you can see now, the function implemented below,
> *calls into* (that's the key here!) DRM, and it should be
> the other way around--the DRM should call into the PCI driver,
> after the PCI driver's callback immediately calls into DRM,
> as outlined above.
>
> This abstraction could be expanded to more concepts of PCIe GPU drivers,
> and it would scale well, beyond PCIe as a protocol for graphics.
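
To make the quoted proposal concrete, a hypothetical sketch of the top-down shape being
described -- neither drm_aer_recover() nor the ops table below exist, they are
placeholders for the idea:

	/* The PCI callback immediately offloads into DRM... */
	static pci_ers_result_t amdgpu_pci_err_detected(struct pci_dev *pdev,
							pci_channel_state_t state)
	{
		struct drm_device *dev = pci_get_drvdata(pdev);

		return drm_aer_recover(dev, state);
	}

	/* ...and DRM walks a driver-provided capability table top-down,
	 * roughly matching the regime listed above.
	 */
	struct drm_err_recovery_ops {
		void (*disable_pm)(struct drm_device *dev);
		void (*suspend_io)(struct drm_device *dev);
		void (*cancel_work)(struct drm_device *dev);
		void (*reset_display)(struct drm_device *dev);
	};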

If drm handles pci error callbacks then it should also handle power management,
because power management also calls into drm in a very similar way. I think these
are very low-level device tasks for drm.


>> +static const struct pci_error_handlers amdgpu_err_handler = {
> That's too generic a name for this. I'd rather add "pci" in there,
>
> static const struct pci_error_handlers amdgpu_pci_err_handler =  {


True, thanks for the name suggestion.


Nirmoy


> 	.element = init,
> 	...
> };
>
> Being a singular noun from the outset is good and this is preserved.
>
>> +       .error_detected = amdgpu_pci_err_detected,
>> +};
>> +
>> +
>>   static struct pci_driver amdgpu_kms_pci_driver = {
>>   	.name = DRIVER_NAME,
>>   	.id_table = pciidlist,
>> @@ -1523,10 +1576,9 @@ static struct pci_driver amdgpu_kms_pci_driver = {
>>   	.remove = amdgpu_pci_remove,
>>   	.shutdown = amdgpu_pci_shutdown,
>>   	.driver.pm = &amdgpu_pm_ops,
>> +	.err_handler = &amdgpu_err_handler,
> ".err_handler = amdgpu_pci_err_handler,"
>
>
> Regards,
> Luben
>
> On 2020-08-13 2:18 p.m., Andrey Grodzovsky wrote:
>> On 8/13/20 11:06 AM, Nirmoy wrote:
>>> On 8/13/20 3:38 PM, Andrey Grodzovsky wrote:
>>>> On 8/13/20 7:09 AM, Nirmoy wrote:
>>>>> On 8/12/20 4:52 PM, Andrey Grodzovsky wrote:
>>>>>> On 8/11/20 9:30 AM, Nirmoy Das wrote:
>>>>>>> This patch will ignore non-fatal errors and try to
>>>>>>> stop amdgpu's sw stack on fatal errors.
>>>>>>>
>>>>>>> Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
>>>>>>> ---
>>>>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 56 ++++++++++++++++++++++++-
>>>>>>>    1 file changed, 54 insertions(+), 2 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>>>> index c1219af2e7d6..2b9ede3000ee 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>>>> @@ -35,6 +35,7 @@
>>>>>>>    #include <linux/pm_runtime.h>
>>>>>>>    #include <linux/vga_switcheroo.h>
>>>>>>>    #include <drm/drm_probe_helper.h>
>>>>>>> +#include <drm/drm_atomic_helper.h>
>>>>>>>    #include <linux/mmu_notifier.h>
>>>>>>>      #include "amdgpu.h"
>>>>>>> @@ -1516,6 +1517,58 @@ static struct drm_driver kms_driver = {
>>>>>>>        .patchlevel = KMS_DRIVER_PATCHLEVEL,
>>>>>>>    };
>>>>>>>    +static pci_ers_result_t amdgpu_pci_err_detected(struct pci_dev *pdev,
>>>>>>> +                        pci_channel_state_t state)
>>>>>>> +{
>>>>>>> +    struct drm_device *dev = pci_get_drvdata(pdev);
>>>>>>> +    struct amdgpu_device *adev = dev->dev_private;
>>>>>>> +    int i;
>>>>>>> +    int ret = PCI_ERS_RESULT_DISCONNECT;
>>>>>>> +
>>>>>>> +    switch (state) {
>>>>>>> +    case pci_channel_io_normal:
>>>>>>> +        ret = PCI_ERS_RESULT_CAN_RECOVER;
>>>>>>> +        break;
>>>>>>> +    default:
>>>>>>> +        /* Disable power management */
>>>>>>> +        adev->runpm = 0;
>>>>>>> +        /* Suspend all IO operations */
>>>>>>> +        amdgpu_fbdev_set_suspend(adev, 1);
>>>>>>> + cancel_delayed_work_sync(&adev->delayed_init_work);
>>>>>>> +        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>>>>>> +            struct amdgpu_ring *ring = adev->rings[i];
>>>>>>> +
>>>>>>> +            if (!ring || !ring->sched.thread)
>>>>>>> +                continue;
>>>>>>> +
>>>>>>> + amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
>>>>>>
>>>>>> You need to call drm_sched_stop first before calling this
>>>>>>
>>>>>>> +        }
>>>>>>> +
>>>>>>> +        if (adev->mode_info.mode_config_initialized) {
>>>>>>> +            if (!amdgpu_device_has_dc_support(adev))
>>>>>>> + drm_helper_force_disable_all(adev->ddev);
>>>>>>> +            else
>>>>>>> + drm_atomic_helper_shutdown(adev->ddev);
>>>>>>> +        }
>>>>>>> +
>>>>>>> +        amdgpu_fence_driver_fini(adev);
>>>>>>> +        amdgpu_fbdev_fini(adev);
>>>>>>> +        /* Try to close drm device to stop applications
>>>>>>> +         * from opening dri files for further IO operations.
>>>>>>> +         * TODO: This will throw warning as ttm is not
>>>>>>> +         * cleaned perperly */
>>>>>>> +        drm_dev_fini(dev);
>>>>>>
>>>>>> I think user mode applications might still hold reference to the drm device
>>>>>> through through drm_dev_get either by directly opening
>>>>>> the device file or indirectly through importing DMA buff, if so when the
>>>>>> last of them terminate drm_dev_put->drm_dev_release->...->drm_dev_fini
>>>>>> might get called again causing use after free e.t.c issues. Maybe better to
>>>>>> call here drm_dev_put then and so drm_dev_fini will get called when this
>>>>>> last user client releases his reference.
>>>>>
>>>>> drm_dev_fini() seems to be cleaner. Problem is  window manager(sway) never
>>>>> gets terminated after the AER error and drm files remains active. Simple cat
>>>>> on dri files
>>>>>
>>>>> goes though amdgpu and spits out more errors.
>>>>
>>>> What happens if you kill the window manager after you closed drm device with
>>>> your original code applied ? I would expect drm_dev_fini to be called again
>>>> for the reason i explained above and this would obviously would be wrong to
>>>> happen.
>>> Hi Andrey,
>>>
>>>
>>> hmm I quickly tried that, Kernel crashed and later rebooted after sometime. I
>>> don't have a serial console to check logs and there was no logs afterwards in
>>> journalctl.
>>>
>>> drm_dev_put() had similar behavior, kernel/machine was inaccessible over ssh.
>>>
>>>
>>> Did you face same behavior while testing gpu hotplug ?
>>>
>>>
>>> Nirmoy
>>
>> Yea, in my case device sysfs structure was removed on pci_remove while when last
>> user client dropped reference and this led
>> to drm_dev_fini to be called there were more sysfs entries removal there which
>> lead to a crash. But here i don't think the sysfs for drm_device
>> is removed because the device is not extracted...
>>
>> Andrey
>>
>>
>>>
>>>> Andrey
>>>>
>>>>
>>>>>
>>>>>> Also a general question - in my work on DPC recovery feature which tries to
>>>>>> recover after PCIe error - once the PCI error has happened MMIO registers
>>>>>> become
>>>>>> unaccessible for r/w as the PCI link is dead until after the PCI link is
>>>>>> reset by the DPC driver (see
>>>>>> https://www.kernel.org/doc/html/latest/PCI/pci-error-recovery.html section
>>>>>> 6.1.4).
>>>>>> Your case is to try and gracefully to close the drm device once fatal error
>>>>>> happened, didn't you encounter errors or warnings when accessing HW
>>>>>> registers during any of the operations
>>>>>> above ?
>>>>>
>>>>> As discussed over chat, it seems aer generated with aer-inject tool just
>>>>> triggers kernel PCI error APIs but the device is still active so I didn't
>>>>> encounter any errors when accessing HW registers.
>>>>>
>>>>>
>>>>> Nirmoy
>>>>>
>>>>>
>>>>>> Andrey
>>>>>>
>>>>>>
>>>>>>> +        break;
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    return ret;
>>>>>>> +}
>>>>>>> +
>>>>>>> +static const struct pci_error_handlers amdgpu_err_handler = {
>>>>>>> +       .error_detected = amdgpu_pci_err_detected,
>>>>>>> +};
>>>>>>> +
>>>>>>> +
>>>>>>>    static struct pci_driver amdgpu_kms_pci_driver = {
>>>>>>>        .name = DRIVER_NAME,
>>>>>>>        .id_table = pciidlist,
>>>>>>> @@ -1523,10 +1576,9 @@ static struct pci_driver amdgpu_kms_pci_driver = {
>>>>>>>        .remove = amdgpu_pci_remove,
>>>>>>>        .shutdown = amdgpu_pci_shutdown,
>>>>>>>        .driver.pm = &amdgpu_pm_ops,
>>>>>>> +    .err_handler = &amdgpu_err_handler,
>>>>>>>    };
>>>>>>>    -
>>>>>>> -
>>>>>>>    static int __init amdgpu_init(void)
>>>>>>>    {
>>>>>>>        int r;
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

* Re: [RFC PATCH 1/1] drm/amdgpu: add initial support for pci error handler
  2020-08-14 15:23             ` Nirmoy
@ 2020-08-14 19:52               ` Luben Tuikov
  2020-08-14 20:10                 ` Alex Deucher
  0 siblings, 1 reply; 13+ messages in thread
From: Luben Tuikov @ 2020-08-14 19:52 UTC (permalink / raw)
  To: Nirmoy, Andrey Grodzovsky, Nirmoy Das, amd-gfx
  Cc: alexander.deucher, christian.koenig

On 2020-08-14 11:23 a.m., Nirmoy wrote:
> 
> On 8/13/20 11:17 PM, Luben Tuikov wrote:
>> I support having AER handling.
>>
>> However, I feel it should be offloaded to the DRM layer.
>> The PCI driver gets the AER callback and immediately
>> offloads into DRM, as "return drm_aer_recover(dev); }".
>> The DRM layer does a top-down approach into the error
>> recovery procedure.
>>
>> The PCI device driver provides (would/should?) a capability
>> (at registration) which the DRM layer would inspect and
>> subsequently call into the PCI driver's low-level methods
>> to recover the error or to reset the device.
>>
>> But it should be a top-down approach. I believe the thread
>> below has somehow hinted at this.
>>
>> The implementation below boils down to:
>>
>> 	If recoverable error, all is good.
>> 	If unrecoverable error, then
>> 		disable power management,
>> 		suspend I/O operations,
>> 		cancel pending work,
>> 		call into PCI driver to clear
>> 			any state it keeps,
>> 		call into PCI driver to reset display control.
>> 		Etc.
>>
>> And this regime could be performed by DRM.
>>
>> And as you can see now, the function implemented below,
>> *calls into* (that's the key here!) DRM, and it should be
>> the other way around--the DRM should call into the PCI driver,
>> after the PCI driver's callback immediately calls into DRM,
>> as outlined above.
>>
>> This abstraction could be expanded to more concepts of PCIe GPU drivers,
>> and it would scale well, beyond PCIe as a protocol for graphics.
> 
> If drm handles pci error callbacks then it should also handle power 
> management

Yes. LLDDs should have a method for DRM to call to control power
management. For instance, the first thing a DRM unifying layer would
do is suspend issuing tasks to the LLDD, then recall (pause?) already-issued
tasks, do some memory management housekeeping related to the LLDD, and then
invoke power management.

Once DRM is aware of power management it can better control
memory and task dependencies.

> 
> because power management also calls into drm in very similar way. I 
> think these are very
> 
> low device level tasks for drm.

I disagree and think that these are not "very low device level tasks".
Look at the regime posted above. All those directives should (could?)
be known to DRM, and LLDDs should have entries for DRM to call into.

I see DRM as more of a unifying layer (perhaps long term), as opposed
to a *library* which LLDDs call into. Then LLDDs would provide an interface
to the hardware. This will help us avoid many of the deadlocks and
synchronization issues which were outlined in a recent memory presentation.
It will also help us avoid code redundancy in LLDDs.

Regards,
Luben

> 
> 
>>> +static const struct pci_error_handlers amdgpu_err_handler = {
>> That's too generic a name for this. I'd rather add "pci" in there,
>>
>> static const struct pci_error_handlers amdgpu_pci_err_handler =  {
> 
> 
> True, thanks for the name suggestion.
> 
> 
> Nirmoy
> 
> 
>> 	.element = init,
>> 	...
>> };
>>
>> Being a singular noun from the outset is good and this is preserved.
>>
>>> +       .error_detected = amdgpu_pci_err_detected,
>>> +};
>>> +
>>> +
>>>   static struct pci_driver amdgpu_kms_pci_driver = {
>>>   	.name = DRIVER_NAME,
>>>   	.id_table = pciidlist,
>>> @@ -1523,10 +1576,9 @@ static struct pci_driver amdgpu_kms_pci_driver = {
>>>   	.remove = amdgpu_pci_remove,
>>>   	.shutdown = amdgpu_pci_shutdown,
>>>   	.driver.pm = &amdgpu_pm_ops,
>>> +	.err_handler = &amdgpu_err_handler,
>> ".err_handler = amdgpu_pci_err_handler,"
>>
>>
>> Regards,
>> Luben
>>
>> On 2020-08-13 2:18 p.m., Andrey Grodzovsky wrote:
>>> On 8/13/20 11:06 AM, Nirmoy wrote:
>>>> On 8/13/20 3:38 PM, Andrey Grodzovsky wrote:
>>>>> On 8/13/20 7:09 AM, Nirmoy wrote:
>>>>>> On 8/12/20 4:52 PM, Andrey Grodzovsky wrote:
>>>>>>> On 8/11/20 9:30 AM, Nirmoy Das wrote:
>>>>>>>> This patch will ignore non-fatal errors and try to
>>>>>>>> stop amdgpu's sw stack on fatal errors.
>>>>>>>>
>>>>>>>> Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
>>>>>>>> ---
>>>>>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 56 ++++++++++++++++++++++++-
>>>>>>>>    1 file changed, 54 insertions(+), 2 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>>>>> index c1219af2e7d6..2b9ede3000ee 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>>>>>> @@ -35,6 +35,7 @@
>>>>>>>>    #include <linux/pm_runtime.h>
>>>>>>>>    #include <linux/vga_switcheroo.h>
>>>>>>>>    #include <drm/drm_probe_helper.h>
>>>>>>>> +#include <drm/drm_atomic_helper.h>
>>>>>>>>    #include <linux/mmu_notifier.h>
>>>>>>>>      #include "amdgpu.h"
>>>>>>>> @@ -1516,6 +1517,58 @@ static struct drm_driver kms_driver = {
>>>>>>>>        .patchlevel = KMS_DRIVER_PATCHLEVEL,
>>>>>>>>    };
>>>>>>>>    +static pci_ers_result_t amdgpu_pci_err_detected(struct pci_dev *pdev,
>>>>>>>> +                        pci_channel_state_t state)
>>>>>>>> +{
>>>>>>>> +    struct drm_device *dev = pci_get_drvdata(pdev);
>>>>>>>> +    struct amdgpu_device *adev = dev->dev_private;
>>>>>>>> +    int i;
>>>>>>>> +    int ret = PCI_ERS_RESULT_DISCONNECT;
>>>>>>>> +
>>>>>>>> +    switch (state) {
>>>>>>>> +    case pci_channel_io_normal:
>>>>>>>> +        ret = PCI_ERS_RESULT_CAN_RECOVER;
>>>>>>>> +        break;
>>>>>>>> +    default:
>>>>>>>> +        /* Disable power management */
>>>>>>>> +        adev->runpm = 0;
>>>>>>>> +        /* Suspend all IO operations */
>>>>>>>> +        amdgpu_fbdev_set_suspend(adev, 1);
>>>>>>>> + cancel_delayed_work_sync(&adev->delayed_init_work);
>>>>>>>> +        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>>>>>>> +            struct amdgpu_ring *ring = adev->rings[i];
>>>>>>>> +
>>>>>>>> +            if (!ring || !ring->sched.thread)
>>>>>>>> +                continue;
>>>>>>>> +
>>>>>>>> + amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
>>>>>>>
>>>>>>> You need to call drm_sched_stop first before calling this
>>>>>>>
>>>>>>>> +        }
>>>>>>>> +
>>>>>>>> +        if (adev->mode_info.mode_config_initialized) {
>>>>>>>> +            if (!amdgpu_device_has_dc_support(adev))
>>>>>>>> + drm_helper_force_disable_all(adev->ddev);
>>>>>>>> +            else
>>>>>>>> + drm_atomic_helper_shutdown(adev->ddev);
>>>>>>>> +        }
>>>>>>>> +
>>>>>>>> +        amdgpu_fence_driver_fini(adev);
>>>>>>>> +        amdgpu_fbdev_fini(adev);
>>>>>>>> +        /* Try to close drm device to stop applications
>>>>>>>> +         * from opening dri files for further IO operations.
>>>>>>>> +         * TODO: This will throw warning as ttm is not
>>>>>>>> +         * cleaned perperly */
>>>>>>>> +        drm_dev_fini(dev);
>>>>>>>
>>>>>>> I think user mode applications might still hold reference to the drm device
>>>>>>> through through drm_dev_get either by directly opening
>>>>>>> the device file or indirectly through importing DMA buff, if so when the
>>>>>>> last of them terminate drm_dev_put->drm_dev_release->...->drm_dev_fini
>>>>>>> might get called again causing use after free e.t.c issues. Maybe better to
>>>>>>> call here drm_dev_put then and so drm_dev_fini will get called when this
>>>>>>> last user client releases his reference.
>>>>>>
>>>>>> drm_dev_fini() seems to be cleaner. Problem is  window manager(sway) never
>>>>>> gets terminated after the AER error and drm files remains active. Simple cat
>>>>>> on dri files
>>>>>>
>>>>>> goes though amdgpu and spits out more errors.
>>>>>
>>>>> What happens if you kill the window manager after you closed drm device with
>>>>> your original code applied ? I would expect drm_dev_fini to be called again
>>>>> for the reason i explained above and this would obviously would be wrong to
>>>>> happen.
>>>> Hi Andrey,
>>>>
>>>>
>>>> hmm I quickly tried that, Kernel crashed and later rebooted after sometime. I
>>>> don't have a serial console to check logs and there was no logs afterwards in
>>>> journalctl.
>>>>
>>>> drm_dev_put() had similar behavior, kernel/machine was inaccessible over ssh.
>>>>
>>>>
>>>> Did you face same behavior while testing gpu hotplug ?
>>>>
>>>>
>>>> Nirmoy
>>>
>>> Yea, in my case device sysfs structure was removed on pci_remove while when last
>>> user client dropped reference and this led
>>> to drm_dev_fini to be called there were more sysfs entries removal there which
>>> lead to a crash. But here i don't think the sysfs for drm_device
>>> is removed because the device is not extracted...
>>>
>>> Andrey
>>>
>>>
>>>>
>>>>> Andrey
>>>>>
>>>>>
>>>>>>
>>>>>>> Also a general question - in my work on DPC recovery feature which tries to
>>>>>>> recover after PCIe error - once the PCI error has happened MMIO registers
>>>>>>> become
>>>>>>> unaccessible for r/w as the PCI link is dead until after the PCI link is
>>>>>>> reset by the DPC driver (see
>>>>>>> https://www.kernel.org/doc/html/latest/PCI/pci-error-recovery.html section
>>>>>>> 6.1.4).
>>>>>>> Your case is to try and gracefully to close the drm device once fatal error
>>>>>>> happened, didn't you encounter errors or warnings when accessing HW
>>>>>>> registers during any of the operations
>>>>>>> above ?
>>>>>>
>>>>>> As discussed over chat, it seems aer generated with aer-inject tool just
>>>>>> triggers kernel PCI error APIs but the device is still active so I didn't
>>>>>> encounter any errors when accessing HW registers.
>>>>>>
>>>>>>
>>>>>> Nirmoy
>>>>>>
>>>>>>
>>>>>>> Andrey
>>>>>>>
>>>>>>>
>>>>>>>> +        break;
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    return ret;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static const struct pci_error_handlers amdgpu_err_handler = {
>>>>>>>> +       .error_detected = amdgpu_pci_err_detected,
>>>>>>>> +};
>>>>>>>> +
>>>>>>>> +
>>>>>>>>    static struct pci_driver amdgpu_kms_pci_driver = {
>>>>>>>>        .name = DRIVER_NAME,
>>>>>>>>        .id_table = pciidlist,
>>>>>>>> @@ -1523,10 +1576,9 @@ static struct pci_driver amdgpu_kms_pci_driver = {
>>>>>>>>        .remove = amdgpu_pci_remove,
>>>>>>>>        .shutdown = amdgpu_pci_shutdown,
>>>>>>>>        .driver.pm = &amdgpu_pm_ops,
>>>>>>>> +    .err_handler = &amdgpu_err_handler,
>>>>>>>>    };
>>>>>>>>    -
>>>>>>>> -
>>>>>>>>    static int __init amdgpu_init(void)
>>>>>>>>    {
>>>>>>>>        int r;

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

* Re: [RFC PATCH 1/1] drm/amdgpu: add initial support for pci error handler
  2020-08-14 19:52               ` Luben Tuikov
@ 2020-08-14 20:10                 ` Alex Deucher
  2020-08-14 20:20                   ` Luben Tuikov
  0 siblings, 1 reply; 13+ messages in thread
From: Alex Deucher @ 2020-08-14 20:10 UTC (permalink / raw)
  To: Luben Tuikov
  Cc: Andrey Grodzovsky, Nirmoy, amd-gfx list, Nirmoy Das, Deucher,
	Alexander, Christian Koenig

On Fri, Aug 14, 2020 at 3:52 PM Luben Tuikov <luben.tuikov@amd.com> wrote:
>
> On 2020-08-14 11:23 a.m., Nirmoy wrote:
> >
> > On 8/13/20 11:17 PM, Luben Tuikov wrote:
> >> I support having AER handling.
> >>
> >> However, I feel it should be offloaded to the DRM layer.
> >> The PCI driver gets the AER callback and immediately
> >> offloads into DRM, as "return drm_aer_recover(dev); }".
> >> The DRM layer does a top-down approach into the error
> >> recovery procedure.
> >>
> >> The PCI device driver provides (would/should?) a capability
> >> (at registration) which the DRM layer would inspect and
> >> subsequently call into the PCI driver's low-level methods
> >> to recover the error or to reset the device.
> >>
> >> But it should be a top-down approach. I believe the thread
> >> below has somehow hinted at this.
> >>
> >> The implementation below boils down to:
> >>
> >>      If recoverable error, all is good.
> >>      If unrecoverable error, then
> >>              disable power management,
> >>              suspend I/O operations,
> >>              cancel pending work,
> >>              call into PCI driver to clear
> >>                      any state it keeps,
> >>              call into PCI driver to reset display control.
> >>              Etc.
> >>
> >> And this regime could be performed by DRM.
> >>
> >> And as you can see now, the function implemented below,
> >> *calls into* (that's the key here!) DRM, and it should be
> >> the other way around--the DRM should call into the PCI driver,
> >> after the PCI driver's callback immediately calls into DRM,
> >> as outlined above.
> >>
> >> This abstraction could be expanded to more concepts of PCIe GPU drivers,
> >> and it would scale well, beyond PCIe as a protocol for graphics.
> >
> > If drm handles pci error callbacks then it should also handle power
> > management
>
> Yes. LLDDs should have a method for DRM to call to control power
> management. For instance the first thing a DRM unifying layer would
> do is suspend issuing tasks to the LLDD, then recall (pause?) issued tasks,
> do some memory management housekeeping related to the LLDD and then
> invoke power management.
>
> Once DRM is aware of power management it can better control
> memory and task dependencies.
>
> >
> > because power management also calls into drm in very similar way. I
> > think these are very
> >
> > low device level tasks for drm.
>
> I disagree and think that it is not "very low device level tasks".
> Look at the regime posted above. All those directives should/could?
> be known to DRM and LLDDs should have entries for DRM to call
> into.
>
> I see DRM as more of a unifying layer (perhaps long term), as opposed
> to a *library* which LLDDs call into. Then LLDDs would provide an interface
> to the hardware. This will help us avoid many of the deadlocks and
> synchronization issues which were outlined in a recent memory presentation.
> It will also help us avoid code redundancy in LLDDs.

We are actually trying to move away from that model.  drm started out
as a midlayer and it caused all kinds of pain.  E.g., some drm devices
are usb devices, some are pci, some are platform devices, etc.  A flow
that worked for one didn't always work well for another.  In general
we should follow the driver model for the specific device model of
your driver and then use drm helper functions to deal with the core
drm functionality.  E.g., drm shouldn't be in the business of dealing
with pci power management.  The driver should interact directly with
the pci subsystem for that.

Alex
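
Following that guidance, a sketch of how the handler table could eventually look while
staying entirely inside the PCI driver model (only .error_detected exists in this patch;
the other callbacks and their names are hypothetical placeholders):

	static const struct pci_error_handlers amdgpu_pci_err_handler = {
		.error_detected	= amdgpu_pci_err_detected,
		.mmio_enabled	= amdgpu_pci_mmio_enabled,	/* hypothetical */
		.slot_reset	= amdgpu_pci_slot_reset,	/* hypothetical */
		.resume		= amdgpu_pci_resume,		/* hypothetical */
	};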

>
> Regards,
> Luben
>
> >
> >
> >>> +static const struct pci_error_handlers amdgpu_err_handler = {
> >> That's too generic a name for this. I'd rather add "pci" in there,
> >>
> >> static const struct pci_error_handlers amdgpu_pci_err_handler =  {
> >
> >
> > True, thanks for the name suggestion.
> >
> >
> > Nirmoy
> >
> >
> >>      .element = init,
> >>      ...
> >> };
> >>
> >> Being a singular noun from the outset is good and this is preserved.
> >>
> >>> +       .error_detected = amdgpu_pci_err_detected,
> >>> +};
> >>> +
> >>> +
> >>>   static struct pci_driver amdgpu_kms_pci_driver = {
> >>>     .name = DRIVER_NAME,
> >>>     .id_table = pciidlist,
> >>> @@ -1523,10 +1576,9 @@ static struct pci_driver amdgpu_kms_pci_driver = {
> >>>     .remove = amdgpu_pci_remove,
> >>>     .shutdown = amdgpu_pci_shutdown,
> >>>     .driver.pm = &amdgpu_pm_ops,
> >>> +   .err_handler = &amdgpu_err_handler,
> >> ".err_handler = amdgpu_pci_err_handler,"
> >>
> >>
> >> Regards,
> >> Luben
> >>
> >> On 2020-08-13 2:18 p.m., Andrey Grodzovsky wrote:
> >>> On 8/13/20 11:06 AM, Nirmoy wrote:
> >>>> On 8/13/20 3:38 PM, Andrey Grodzovsky wrote:
> >>>>> On 8/13/20 7:09 AM, Nirmoy wrote:
> >>>>>> On 8/12/20 4:52 PM, Andrey Grodzovsky wrote:
> >>>>>>> On 8/11/20 9:30 AM, Nirmoy Das wrote:
> >>>>>>>> This patch will ignore non-fatal errors and try to
> >>>>>>>> stop amdgpu's sw stack on fatal errors.
> >>>>>>>>
> >>>>>>>> Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
> >>>>>>>> ---
> >>>>>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 56 ++++++++++++++++++++++++-
> >>>>>>>>    1 file changed, 54 insertions(+), 2 deletions(-)
> >>>>>>>>
> >>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> >>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> >>>>>>>> index c1219af2e7d6..2b9ede3000ee 100644
> >>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> >>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> >>>>>>>> @@ -35,6 +35,7 @@
> >>>>>>>>    #include <linux/pm_runtime.h>
> >>>>>>>>    #include <linux/vga_switcheroo.h>
> >>>>>>>>    #include <drm/drm_probe_helper.h>
> >>>>>>>> +#include <drm/drm_atomic_helper.h>
> >>>>>>>>    #include <linux/mmu_notifier.h>
> >>>>>>>>      #include "amdgpu.h"
> >>>>>>>> @@ -1516,6 +1517,58 @@ static struct drm_driver kms_driver = {
> >>>>>>>>        .patchlevel = KMS_DRIVER_PATCHLEVEL,
> >>>>>>>>    };
> >>>>>>>>    +static pci_ers_result_t amdgpu_pci_err_detected(struct pci_dev *pdev,
> >>>>>>>> +                        pci_channel_state_t state)
> >>>>>>>> +{
> >>>>>>>> +    struct drm_device *dev = pci_get_drvdata(pdev);
> >>>>>>>> +    struct amdgpu_device *adev = dev->dev_private;
> >>>>>>>> +    int i;
> >>>>>>>> +    int ret = PCI_ERS_RESULT_DISCONNECT;
> >>>>>>>> +
> >>>>>>>> +    switch (state) {
> >>>>>>>> +    case pci_channel_io_normal:
> >>>>>>>> +        ret = PCI_ERS_RESULT_CAN_RECOVER;
> >>>>>>>> +        break;
> >>>>>>>> +    default:
> >>>>>>>> +        /* Disable power management */
> >>>>>>>> +        adev->runpm = 0;
> >>>>>>>> +        /* Suspend all IO operations */
> >>>>>>>> +        amdgpu_fbdev_set_suspend(adev, 1);
> >>>>>>>> + cancel_delayed_work_sync(&adev->delayed_init_work);
> >>>>>>>> +        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> >>>>>>>> +            struct amdgpu_ring *ring = adev->rings[i];
> >>>>>>>> +
> >>>>>>>> +            if (!ring || !ring->sched.thread)
> >>>>>>>> +                continue;
> >>>>>>>> +
> >>>>>>>> + amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
> >>>>>>>
> >>>>>>> You need to call drm_sched_stop first before calling this
> >>>>>>>
> >>>>>>>> +        }
> >>>>>>>> +
> >>>>>>>> +        if (adev->mode_info.mode_config_initialized) {
> >>>>>>>> +            if (!amdgpu_device_has_dc_support(adev))
> >>>>>>>> + drm_helper_force_disable_all(adev->ddev);
> >>>>>>>> +            else
> >>>>>>>> + drm_atomic_helper_shutdown(adev->ddev);
> >>>>>>>> +        }
> >>>>>>>> +
> >>>>>>>> +        amdgpu_fence_driver_fini(adev);
> >>>>>>>> +        amdgpu_fbdev_fini(adev);
> >>>>>>>> +        /* Try to close drm device to stop applications
> >>>>>>>> +         * from opening dri files for further IO operations.
> >>>>>>>> +         * TODO: This will throw warning as ttm is not
> >>>>>>>> +         * cleaned perperly */
> >>>>>>>> +        drm_dev_fini(dev);
> >>>>>>>
> >>>>>>> I think user mode applications might still hold reference to the drm device
> >>>>>>> through through drm_dev_get either by directly opening
> >>>>>>> the device file or indirectly through importing DMA buff, if so when the
> >>>>>>> last of them terminate drm_dev_put->drm_dev_release->...->drm_dev_fini
> >>>>>>> might get called again causing use after free e.t.c issues. Maybe better to
> >>>>>>> call here drm_dev_put then and so drm_dev_fini will get called when this
> >>>>>>> last user client releases his reference.
> >>>>>>
> >>>>>> drm_dev_fini() seems to be cleaner. Problem is  window manager(sway) never
> >>>>>> gets terminated after the AER error and drm files remains active. Simple cat
> >>>>>> on dri files
> >>>>>>
> >>>>>> goes though amdgpu and spits out more errors.
> >>>>>
> >>>>> What happens if you kill the window manager after you closed drm device with
> >>>>> your original code applied ? I would expect drm_dev_fini to be called again
> >>>>> for the reason i explained above and this would obviously would be wrong to
> >>>>> happen.
> >>>> Hi Andrey,
> >>>>
> >>>>
> >>>> hmm I quickly tried that, Kernel crashed and later rebooted after sometime. I
> >>>> don't have a serial console to check logs and there was no logs afterwards in
> >>>> journalctl.
> >>>>
> >>>> drm_dev_put() had similar behavior, kernel/machine was inaccessible over ssh.
> >>>>
> >>>>
> >>>> Did you face same behavior while testing gpu hotplug ?
> >>>>
> >>>>
> >>>> Nirmoy
> >>>
> >>> Yea, in my case device sysfs structure was removed on pci_remove while when last
> >>> user client dropped reference and this led
> >>> to drm_dev_fini to be called there were more sysfs entries removal there which
> >>> lead to a crash. But here i don't think the sysfs for drm_device
> >>> is removed because the device is not extracted...
> >>>
> >>> Andrey
> >>>
> >>>
> >>>>
> >>>>> Andrey
> >>>>>
> >>>>>
> >>>>>>
> >>>>>>> Also a general question - in my work on DPC recovery feature which tries to
> >>>>>>> recover after PCIe error - once the PCI error has happened MMIO registers
> >>>>>>> become
> >>>>>>> unaccessible for r/w as the PCI link is dead until after the PCI link is
> >>>>>>> reset by the DPC driver (see
> >>>>>>> https://www.kernel.org/doc/html/latest/PCI/pci-error-recovery.html section
> >>>>>>> 6.1.4).
> >>>>>>> Your case is to try and gracefully to close the drm device once fatal error
> >>>>>>> happened, didn't you encounter errors or warnings when accessing HW
> >>>>>>> registers during any of the operations
> >>>>>>> above ?
> >>>>>>
> >>>>>> As discussed over chat, it seems aer generated with aer-inject tool just
> >>>>>> triggers kernel PCI error APIs but the device is still active so I didn't
> >>>>>> encounter any errors when accessing HW registers.
> >>>>>>
> >>>>>>
> >>>>>> Nirmoy
> >>>>>>
> >>>>>>
> >>>>>>> Andrey
> >>>>>>>
> >>>>>>>
> >>>>>>>> +        break;
> >>>>>>>> +    }
> >>>>>>>> +
> >>>>>>>> +    return ret;
> >>>>>>>> +}
> >>>>>>>> +
> >>>>>>>> +static const struct pci_error_handlers amdgpu_err_handler = {
> >>>>>>>> +       .error_detected = amdgpu_pci_err_detected,
> >>>>>>>> +};
> >>>>>>>> +
> >>>>>>>> +
> >>>>>>>>    static struct pci_driver amdgpu_kms_pci_driver = {
> >>>>>>>>        .name = DRIVER_NAME,
> >>>>>>>>        .id_table = pciidlist,
> >>>>>>>> @@ -1523,10 +1576,9 @@ static struct pci_driver amdgpu_kms_pci_driver = {
> >>>>>>>>        .remove = amdgpu_pci_remove,
> >>>>>>>>        .shutdown = amdgpu_pci_shutdown,
> >>>>>>>>        .driver.pm = &amdgpu_pm_ops,
> >>>>>>>> +    .err_handler = &amdgpu_err_handler,
> >>>>>>>>    };
> >>>>>>>>    -
> >>>>>>>> -
> >>>>>>>>    static int __init amdgpu_init(void)
> >>>>>>>>    {
> >>>>>>>>        int r;
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

* Re: [RFC PATCH 1/1] drm/amdgpu: add initial support for pci error handler
  2020-08-14 20:10                 ` Alex Deucher
@ 2020-08-14 20:20                   ` Luben Tuikov
  0 siblings, 0 replies; 13+ messages in thread
From: Luben Tuikov @ 2020-08-14 20:20 UTC (permalink / raw)
  To: Alex Deucher
  Cc: Andrey Grodzovsky, Nirmoy, amd-gfx list, Nirmoy Das, Deucher,
	Alexander, Christian Koenig

On 2020-08-14 4:10 p.m., Alex Deucher wrote:
>> I see DRM as more of a unifying layer (perhaps long term), as opposed
>> to a *library* which LLDDs call into. Then LLDDs would provide an interface
>> to the hardware. This will help us avoid many of the deadlocks and
>> synchronization issues which were outlined in a recent memory presentation.
>> It will also help us avoid code redundancy in LLDDs.
> 
> We are actually trying to move away from that model.  drm started out
> as a midlayer and it caused all kinds of pain.  E.g., some drm devices
> are usb devices, some are pci, some are platform devices, etc.  A flow
> that worked for one didn't always work well for another.  In general
> we should follow the driver model for the specific device model of
> your driver and then use drm helper functions to deal with the core
> drm functionality.  E.g., drm shouldn't be in the business of dealing
> with pci power management.  The driver should interact directly with
> the pci subsystem for that.

Ah, thanks Alex for the clarification. So a library of sorts--got it.

Regards,
Luben
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Thread overview: 13+ messages
2020-08-11 13:30 [RFC PATCH 1/1] drm/amdgpu: add initial support for pci error handler Nirmoy Das
2020-08-12 14:52 ` Andrey Grodzovsky
2020-08-13 11:09   ` Nirmoy
2020-08-13 13:36     ` Alex Deucher
2020-08-13 13:38     ` Andrey Grodzovsky
2020-08-13 15:06       ` Nirmoy
2020-08-13 18:18         ` Andrey Grodzovsky
2020-08-13 21:17           ` Luben Tuikov
2020-08-13 21:30             ` Luben Tuikov
2020-08-14 15:23             ` Nirmoy
2020-08-14 19:52               ` Luben Tuikov
2020-08-14 20:10                 ` Alex Deucher
2020-08-14 20:20                   ` Luben Tuikov
