All of lore.kernel.org
 help / color / mirror / Atom feed
From: Alex Deucher <alexdeucher@gmail.com>
To: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Cc: "Deucher, Alexander" <alexander.deucher@amd.com>,
	Nirmoy <nirmodas@amd.com>,
	Christian Koenig <christian.koenig@amd.com>,
	amd-gfx list <amd-gfx@lists.freedesktop.org>,
	Dennis Li <Dennis.Li@amd.com>
Subject: Re: [PATCH v2 5/7] drm/amdgpu: Fix consecutive DPC recovery failures.
Date: Fri, 28 Aug 2020 15:19:36 -0400	[thread overview]
Message-ID: <CADnq5_PrsW0rrkeKOgYm5ZG7f86etycgmnbYYbewGXozGLwoqA@mail.gmail.com> (raw)
In-Reply-To: <1598630743-21155-6-git-send-email-andrey.grodzovsky@amd.com>

On Fri, Aug 28, 2020 at 12:06 PM Andrey Grodzovsky
<andrey.grodzovsky@amd.com> wrote:
>
> Cache the PCI state on boot and before each case where we might
> lose it.
>
> v2: Add pci_restore_state while caching the PCI state to avoid
> breaking PCI core logic for stuff like suspend/resume.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  6 +++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 59 ++++++++++++++++++++++++++++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  4 +-
>  drivers/gpu/drm/amd/amdgpu/nv.c            |  4 +-
>  drivers/gpu/drm/amd/amdgpu/soc15.c         |  4 +-
>  5 files changed, 67 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index cac51e8..5e74db6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -992,7 +992,9 @@ struct amdgpu_device {
>         atomic_t                        throttling_logging_enabled;
>         struct ratelimit_state          throttling_logging_rs;
>         uint32_t                        ras_features;
> +
>         bool                            in_pci_err_recovery;
> +       struct pci_saved_state          *pci_state;
>  };
>
>  static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
> @@ -1272,6 +1274,10 @@ pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev);
>  pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev);
>  void amdgpu_pci_resume(struct pci_dev *pdev);
>
> +bool amdgpu_device_cache_pci_state(struct pci_dev *pdev);
> +bool amdgpu_device_load_pci_state(struct pci_dev *pdev);
> +
> +
>
>  #include "amdgpu_object.h"
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 06664a9..7f1b970 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -1284,7 +1284,7 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
>                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
>
>                 pci_set_power_state(dev->pdev, PCI_D0);
> -               pci_restore_state(dev->pdev);
> +               amdgpu_device_load_pci_state(dev->pdev);
>                 r = pci_enable_device(dev->pdev);
>                 if (r)
>                         DRM_WARN("pci_enable_device failed (%d)\n", r);
> @@ -1297,7 +1297,7 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
>                 drm_kms_helper_poll_disable(dev);
>                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
>                 amdgpu_device_suspend(dev, true);
> -               pci_save_state(dev->pdev);
> +               amdgpu_device_cache_pci_state(dev->pdev);
>                 /* Shut down the device */
>                 pci_disable_device(dev->pdev);
>                 pci_set_power_state(dev->pdev, PCI_D3cold);
> @@ -3402,6 +3402,9 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>         if (r)
>                 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
>
> +       /* Have stored pci confspace at hand for restore in sudden PCI error */
> +       if (!amdgpu_device_cache_pci_state(adev->pdev))
> +               DRM_WARN("Failed to cache PCI state!");

We should call pci_restore_state(pdev) here rather than in the helpers;
otherwise, we incur the extra overhead of the restore in all cases, and
it's not necessary.

>         return 0;
>
>  failed:
> @@ -3428,6 +3431,8 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
>         flush_delayed_work(&adev->delayed_init_work);
>         adev->shutdown = true;
>
> +       kfree(adev->pci_state);
> +
>         /* make sure IB test finished before entering exclusive mode
>          * to avoid preemption on IB test
>          * */
> @@ -4853,7 +4858,7 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
>         /* wait for asic to come out of reset */
>         msleep(500);
>
> -       pci_restore_state(pdev);
> +       amdgpu_device_load_pci_state(pdev);
>
>         /* confirm  ASIC came out of reset */
>         for (i = 0; i < adev->usec_timeout; i++) {
> @@ -4932,8 +4937,10 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
>
>  out:
>
> -       if (!r)
> +       if (!r) {
> +               amdgpu_device_cache_pci_state(adev->pdev);
>                 DRM_INFO("PCIe error recovery succeeded\n");
> +       }
>         else {
>                 DRM_ERROR("PCIe error recovery failed, err:%d", r);
>                 amdgpu_device_unlock_adev(adev);
> @@ -4972,3 +4979,47 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
>
>         amdgpu_device_unlock_adev(adev);
>  }
> +
> +bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
> +{
> +       struct drm_device *dev = pci_get_drvdata(pdev);
> +       struct amdgpu_device *adev = drm_to_adev(dev);
> +       int r;
> +
> +       r = pci_save_state(pdev);
> +       if (!r) {
> +               kfree(adev->pci_state);
> +
> +               adev->pci_state = pci_store_saved_state(pdev);
> +               pci_restore_state(pdev);

We don't want to restore this here.  See my comment above.

> +
> +               if (!adev->pci_state) {
> +                       DRM_ERROR("Failed to store PCI saved state");
> +                       return false;
> +               }
> +       } else {
> +               DRM_WARN("Failed to save PCI state, err:%d\n", r);
> +               return false;
> +       }
> +
> +       return true;
> +}
> +
> +bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
> +{
> +       struct drm_device *dev = pci_get_drvdata(pdev);
> +       struct amdgpu_device *adev = drm_to_adev(dev);
> +       int r;
> +
> +       if (!adev->pci_state)
> +               return false;
> +
> +       r = pci_load_saved_state(pdev, adev->pci_state);
> +
> +       if (!r) {
> +               pci_restore_state(pdev);
> +       } else {
> +               DRM_WARN("Failed to load PCI state, err:%d\n", r);
> +               return false;
> +       }
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 4bbcc70..7a6482a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -1320,7 +1320,7 @@ static int amdgpu_pmops_runtime_suspend(struct device *dev)
>                 if (amdgpu_is_atpx_hybrid()) {
>                         pci_ignore_hotplug(pdev);
>                 } else {
> -                       pci_save_state(pdev);
> +                       amdgpu_device_cache_pci_state(pdev);
>                         pci_disable_device(pdev);
>                         pci_ignore_hotplug(pdev);
>                         pci_set_power_state(pdev, PCI_D3cold);
> @@ -1353,7 +1353,7 @@ static int amdgpu_pmops_runtime_resume(struct device *dev)
>                         pci_set_master(pdev);
>                 } else {
>                         pci_set_power_state(pdev, PCI_D0);
> -                       pci_restore_state(pdev);
> +                       amdgpu_device_load_pci_state(pdev);
>                         ret = pci_enable_device(pdev);
>                         if (ret)
>                                 return ret;
> diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c
> index 4d14023..0ec6603 100644
> --- a/drivers/gpu/drm/amd/amdgpu/nv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/nv.c
> @@ -311,7 +311,7 @@ static int nv_asic_mode1_reset(struct amdgpu_device *adev)
>         /* disable BM */
>         pci_clear_master(adev->pdev);
>
> -       pci_save_state(adev->pdev);
> +       amdgpu_device_cache_pci_state(adev->pdev);
>
>         if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
>                 dev_info(adev->dev, "GPU smu mode1 reset\n");
> @@ -323,7 +323,7 @@ static int nv_asic_mode1_reset(struct amdgpu_device *adev)
>
>         if (ret)
>                 dev_err(adev->dev, "GPU mode1 reset failed\n");
> -       pci_restore_state(adev->pdev);
> +       amdgpu_device_load_pci_state(adev->pdev);
>
>         /* wait for asic to come out of reset */
>         for (i = 0; i < adev->usec_timeout; i++) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c
> index 2f93c47..ddd55e3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/soc15.c
> +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
> @@ -484,13 +484,13 @@ static int soc15_asic_mode1_reset(struct amdgpu_device *adev)
>         /* disable BM */
>         pci_clear_master(adev->pdev);
>
> -       pci_save_state(adev->pdev);
> +       amdgpu_device_cache_pci_state(adev->pdev);
>
>         ret = psp_gpu_reset(adev);
>         if (ret)
>                 dev_err(adev->dev, "GPU mode1 reset failed\n");
>
> -       pci_restore_state(adev->pdev);
> +       amdgpu_device_load_pci_state(adev->pdev);
>
>         /* wait for asic to come out of reset */
>         for (i = 0; i < adev->usec_timeout; i++) {
> --
> 2.7.4
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

  reply	other threads:[~2020-08-28 19:19 UTC|newest]

Thread overview: 23+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-08-28 16:05 [PATCH v2 0/7] Implement PCI Error Recovery on Navi12 Andrey Grodzovsky
2020-08-28 16:05 ` [PATCH v2 1/7] drm/amdgpu: Implement DPC recovery Andrey Grodzovsky
2020-08-28 19:23   ` Alex Deucher
2020-08-28 19:24     ` Alex Deucher
2020-08-31 14:26     ` Andrey Grodzovsky
2020-08-31 14:30       ` Alex Deucher
2020-08-28 19:25   ` Alex Deucher
2020-08-31 12:44   ` Christian König
2020-08-28 16:05 ` [PATCH v2 2/7] drm/amdgpu: Avoid accessing HW when suspending SW state Andrey Grodzovsky
2020-08-28 19:26   ` Alex Deucher
2020-08-31 20:19     ` Luben Tuikov
2020-08-28 16:05 ` [PATCH v2 3/7] drm/amdgpu: Block all job scheduling activity during DPC recovery Andrey Grodzovsky
2020-08-28 19:28   ` Alex Deucher
2020-08-28 16:05 ` [PATCH v2 4/7] drm/amdgpu: Fix SMU error failure Andrey Grodzovsky
2020-08-28 19:29   ` Alex Deucher
2020-08-28 20:28     ` Andrey Grodzovsky
2020-08-28 16:05 ` [PATCH v2 5/7] drm/amdgpu: Fix consecutive DPC recovery failures Andrey Grodzovsky
2020-08-28 19:19   ` Alex Deucher [this message]
2020-08-28 16:05 ` [PATCH v2 6/7] drm/amdgpu: Trim amdgpu_pci_slot_reset by reusing code Andrey Grodzovsky
2020-08-28 19:30   ` Alex Deucher
2020-08-28 16:05 ` [PATCH v2 7/7] drm/amdgpu: Disable DPC for XGMI for now Andrey Grodzovsky
2020-08-28 19:30   ` Alex Deucher
2020-08-28 19:31     ` Alex Deucher

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=CADnq5_PrsW0rrkeKOgYm5ZG7f86etycgmnbYYbewGXozGLwoqA@mail.gmail.com \
    --to=alexdeucher@gmail.com \
    --cc=Dennis.Li@amd.com \
    --cc=alexander.deucher@amd.com \
    --cc=amd-gfx@lists.freedesktop.org \
    --cc=andrey.grodzovsky@amd.com \
    --cc=christian.koenig@amd.com \
    --cc=nirmodas@amd.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.