All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu: Fix crash on device remove/driver unload
@ 2021-09-15 19:41 Andrey Grodzovsky
  2021-09-16  7:26 ` Pan, Xinhui
  2021-09-16  8:20 ` Lazar, Lijo
  0 siblings, 2 replies; 7+ messages in thread
From: Andrey Grodzovsky @ 2021-09-15 19:41 UTC (permalink / raw)
  To: amd-gfx; +Cc: evan.quan, xinhui.pan, alexander.deucher, Andrey Grodzovsky

Crash:
BUG: unable to handle page fault for address: 00000000000010e1
RIP: 0010:vega10_power_gate_vce+0x26/0x50 [amdgpu]
Call Trace:
pp_set_powergating_by_smu+0x16a/0x2b0 [amdgpu]
amdgpu_dpm_set_powergating_by_smu+0x92/0xf0 [amdgpu]
amdgpu_dpm_enable_vce+0x2e/0xc0 [amdgpu]
vce_v4_0_hw_fini+0x95/0xa0 [amdgpu]
amdgpu_device_fini_hw+0x232/0x30d [amdgpu]
amdgpu_driver_unload_kms+0x5c/0x80 [amdgpu]
amdgpu_pci_remove+0x27/0x40 [amdgpu]
pci_device_remove+0x3e/0xb0
device_release_driver_internal+0x103/0x1d0
device_release_driver+0x12/0x20
pci_stop_bus_device+0x79/0xa0
pci_stop_and_remove_bus_device_locked+0x1b/0x30
remove_store+0x7b/0x90
dev_attr_store+0x17/0x30
sysfs_kf_write+0x4b/0x60
kernfs_fop_write_iter+0x151/0x1e0

Why:
VCE/UVD had dependency on SMC block for their suspend but
SMC block is the first to do HW fini due to some constraints

How:
Since the original patch was dealing with suspend issues
move the SMC block dependency back into suspend hooks as
was done in V1 of the original patches.
Keep flushing idle work both in suspend and HW fini seuqnces
since it's essential in both cases.

Fixes:
2178d3c189b9 drm/amdgpu: add missing cleanups for more ASICs on UVD/VCE suspend
ee6679aaa61c drm/amdgpu: add missing cleanups for Polaris12 UVD/VCE on suspend
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c | 24 ++++++++-------
 drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c | 24 ++++++++-------
 drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c | 24 ++++++++-------
 drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c | 32 ++++++++++---------
 drivers/gpu/drm/amd/amdgpu/vce_v2_0.c | 19 +++++++-----
 drivers/gpu/drm/amd/amdgpu/vce_v3_0.c | 28 +++++++++--------
 drivers/gpu/drm/amd/amdgpu/vce_v4_0.c | 44 ++++++++++++++-------------
 7 files changed, 105 insertions(+), 90 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c b/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
index 7232241e3bfb..0fef925b6602 100644
--- a/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
@@ -698,6 +698,19 @@ static int uvd_v3_1_hw_fini(void *handle)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+	cancel_delayed_work_sync(&adev->uvd.idle_work);
+
+	if (RREG32(mmUVD_STATUS) != 0)
+		uvd_v3_1_stop(adev);
+
+	return 0;
+}
+
+static int uvd_v3_1_suspend(void *handle)
+{
+	int r;
+	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
 	/*
 	 * Proper cleanups before halting the HW engine:
 	 *   - cancel the delayed idle work
@@ -722,17 +735,6 @@ static int uvd_v3_1_hw_fini(void *handle)
 						       AMD_CG_STATE_GATE);
 	}
 
-	if (RREG32(mmUVD_STATUS) != 0)
-		uvd_v3_1_stop(adev);
-
-	return 0;
-}
-
-static int uvd_v3_1_suspend(void *handle)
-{
-	int r;
-	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-
 	r = uvd_v3_1_hw_fini(adev);
 	if (r)
 		return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c b/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
index 52d6de969f46..c108b8381795 100644
--- a/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
@@ -212,6 +212,19 @@ static int uvd_v4_2_hw_fini(void *handle)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+	cancel_delayed_work_sync(&adev->uvd.idle_work);
+
+	if (RREG32(mmUVD_STATUS) != 0)
+		uvd_v4_2_stop(adev);
+
+	return 0;
+}
+
+static int uvd_v4_2_suspend(void *handle)
+{
+	int r;
+	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
 	/*
 	 * Proper cleanups before halting the HW engine:
 	 *   - cancel the delayed idle work
@@ -236,17 +249,6 @@ static int uvd_v4_2_hw_fini(void *handle)
 						       AMD_CG_STATE_GATE);
 	}
 
-	if (RREG32(mmUVD_STATUS) != 0)
-		uvd_v4_2_stop(adev);
-
-	return 0;
-}
-
-static int uvd_v4_2_suspend(void *handle)
-{
-	int r;
-	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-
 	r = uvd_v4_2_hw_fini(adev);
 	if (r)
 		return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
index db6d06758e4d..563493d1f830 100644
--- a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
@@ -210,6 +210,19 @@ static int uvd_v5_0_hw_fini(void *handle)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+	cancel_delayed_work_sync(&adev->uvd.idle_work);
+
+	if (RREG32(mmUVD_STATUS) != 0)
+		uvd_v5_0_stop(adev);
+
+	return 0;
+}
+
+static int uvd_v5_0_suspend(void *handle)
+{
+	int r;
+	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
 	/*
 	 * Proper cleanups before halting the HW engine:
 	 *   - cancel the delayed idle work
@@ -234,17 +247,6 @@ static int uvd_v5_0_hw_fini(void *handle)
 						       AMD_CG_STATE_GATE);
 	}
 
-	if (RREG32(mmUVD_STATUS) != 0)
-		uvd_v5_0_stop(adev);
-
-	return 0;
-}
-
-static int uvd_v5_0_suspend(void *handle)
-{
-	int r;
-	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-
 	r = uvd_v5_0_hw_fini(adev);
 	if (r)
 		return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
index b6e82d75561f..1fd9ca21a091 100644
--- a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
@@ -606,6 +606,23 @@ static int uvd_v7_0_hw_fini(void *handle)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+	cancel_delayed_work_sync(&adev->uvd.idle_work);
+
+	if (!amdgpu_sriov_vf(adev))
+		uvd_v7_0_stop(adev);
+	else {
+		/* full access mode, so don't touch any UVD register */
+		DRM_DEBUG("For SRIOV client, shouldn't do anything.\n");
+	}
+
+	return 0;
+}
+
+static int uvd_v7_0_suspend(void *handle)
+{
+	int r;
+	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
 	/*
 	 * Proper cleanups before halting the HW engine:
 	 *   - cancel the delayed idle work
@@ -630,21 +647,6 @@ static int uvd_v7_0_hw_fini(void *handle)
 						       AMD_CG_STATE_GATE);
 	}
 
-	if (!amdgpu_sriov_vf(adev))
-		uvd_v7_0_stop(adev);
-	else {
-		/* full access mode, so don't touch any UVD register */
-		DRM_DEBUG("For SRIOV client, shouldn't do anything.\n");
-	}
-
-	return 0;
-}
-
-static int uvd_v7_0_suspend(void *handle)
-{
-	int r;
-	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-
 	r = uvd_v7_0_hw_fini(adev);
 	if (r)
 		return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c b/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
index 84e488f189f5..67eb01fef789 100644
--- a/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
@@ -481,6 +481,17 @@ static int vce_v2_0_hw_fini(void *handle)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+	cancel_delayed_work_sync(&adev->vce.idle_work);
+
+	return 0;
+}
+
+static int vce_v2_0_suspend(void *handle)
+{
+	int r;
+	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
+
 	/*
 	 * Proper cleanups before halting the HW engine:
 	 *   - cancel the delayed idle work
@@ -504,14 +515,6 @@ static int vce_v2_0_hw_fini(void *handle)
 						       AMD_CG_STATE_GATE);
 	}
 
-	return 0;
-}
-
-static int vce_v2_0_suspend(void *handle)
-{
-	int r;
-	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-
 	r = vce_v2_0_hw_fini(adev);
 	if (r)
 		return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
index 2a18c1e089fd..142e291983b4 100644
--- a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
@@ -492,6 +492,21 @@ static int vce_v3_0_hw_fini(void *handle)
 	int r;
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+	cancel_delayed_work_sync(&adev->vce.idle_work);
+
+	r = vce_v3_0_wait_for_idle(handle);
+	if (r)
+		return r;
+
+	vce_v3_0_stop(adev);
+	return vce_v3_0_set_clockgating_state(adev, AMD_CG_STATE_GATE);
+}
+
+static int vce_v3_0_suspend(void *handle)
+{
+	int r;
+	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
 	/*
 	 * Proper cleanups before halting the HW engine:
 	 *   - cancel the delayed idle work
@@ -515,19 +530,6 @@ static int vce_v3_0_hw_fini(void *handle)
 						       AMD_CG_STATE_GATE);
 	}
 
-	r = vce_v3_0_wait_for_idle(handle);
-	if (r)
-		return r;
-
-	vce_v3_0_stop(adev);
-	return vce_v3_0_set_clockgating_state(adev, AMD_CG_STATE_GATE);
-}
-
-static int vce_v3_0_suspend(void *handle)
-{
-	int r;
-	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-
 	r = vce_v3_0_hw_fini(adev);
 	if (r)
 		return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c b/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
index 044cf9d74b85..226b79254db8 100644
--- a/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
@@ -544,29 +544,8 @@ static int vce_v4_0_hw_fini(void *handle)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
-	/*
-	 * Proper cleanups before halting the HW engine:
-	 *   - cancel the delayed idle work
-	 *   - enable powergating
-	 *   - enable clockgating
-	 *   - disable dpm
-	 *
-	 * TODO: to align with the VCN implementation, move the
-	 * jobs for clockgating/powergating/dpm setting to
-	 * ->set_powergating_state().
-	 */
 	cancel_delayed_work_sync(&adev->vce.idle_work);
 
-	if (adev->pm.dpm_enabled) {
-		amdgpu_dpm_enable_vce(adev, false);
-	} else {
-		amdgpu_asic_set_vce_clocks(adev, 0, 0);
-		amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCE,
-						       AMD_PG_STATE_GATE);
-		amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_VCE,
-						       AMD_CG_STATE_GATE);
-	}
-
 	if (!amdgpu_sriov_vf(adev)) {
 		/* vce_v4_0_wait_for_idle(handle); */
 		vce_v4_0_stop(adev);
@@ -596,6 +575,29 @@ static int vce_v4_0_suspend(void *handle)
 		drm_dev_exit(idx);
 	}
 
+	/*
+	 * Proper cleanups before halting the HW engine:
+	 *   - cancel the delayed idle work
+	 *   - enable powergating
+	 *   - enable clockgating
+	 *   - disable dpm
+	 *
+	 * TODO: to align with the VCN implementation, move the
+	 * jobs for clockgating/powergating/dpm setting to
+	 * ->set_powergating_state().
+	 */
+	cancel_delayed_work_sync(&adev->vce.idle_work);
+
+	if (adev->pm.dpm_enabled) {
+		amdgpu_dpm_enable_vce(adev, false);
+	} else {
+		amdgpu_asic_set_vce_clocks(adev, 0, 0);
+		amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCE,
+						       AMD_PG_STATE_GATE);
+		amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_VCE,
+						       AMD_CG_STATE_GATE);
+	}
+
 	r = vce_v4_0_hw_fini(adev);
 	if (r)
 		return r;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* RE: [PATCH] drm/amdgpu: Fix crash on device remove/driver unload
  2021-09-15 19:41 [PATCH] drm/amdgpu: Fix crash on device remove/driver unload Andrey Grodzovsky
@ 2021-09-16  7:26 ` Pan, Xinhui
  2021-09-16  8:20 ` Lazar, Lijo
  1 sibling, 0 replies; 7+ messages in thread
From: Pan, Xinhui @ 2021-09-16  7:26 UTC (permalink / raw)
  To: Grodzovsky, Andrey, amd-gfx
  Cc: Quan, Evan, Deucher, Alexander, Grodzovsky, Andrey

[AMD Official Use Only]

Reviewed-by: xinhui pan <xinhui.pan@amd.com>

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Andrey Grodzovsky
Sent: 2021年9月16日 3:42
To: amd-gfx@lists.freedesktop.org
Cc: Quan, Evan <Evan.Quan@amd.com>; Pan, Xinhui <Xinhui.Pan@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Subject: [PATCH] drm/amdgpu: Fix crash on device remove/driver unload

Crash:
BUG: unable to handle page fault for address: 00000000000010e1
RIP: 0010:vega10_power_gate_vce+0x26/0x50 [amdgpu] Call Trace:
pp_set_powergating_by_smu+0x16a/0x2b0 [amdgpu]
amdgpu_dpm_set_powergating_by_smu+0x92/0xf0 [amdgpu]
amdgpu_dpm_enable_vce+0x2e/0xc0 [amdgpu]
vce_v4_0_hw_fini+0x95/0xa0 [amdgpu]
amdgpu_device_fini_hw+0x232/0x30d [amdgpu]
amdgpu_driver_unload_kms+0x5c/0x80 [amdgpu]
amdgpu_pci_remove+0x27/0x40 [amdgpu]
pci_device_remove+0x3e/0xb0
device_release_driver_internal+0x103/0x1d0
device_release_driver+0x12/0x20
pci_stop_bus_device+0x79/0xa0
pci_stop_and_remove_bus_device_locked+0x1b/0x30
remove_store+0x7b/0x90
dev_attr_store+0x17/0x30
sysfs_kf_write+0x4b/0x60
kernfs_fop_write_iter+0x151/0x1e0

Why:
VCE/UVD had dependency on SMC block for their suspend but SMC block is the first to do HW fini due to some constraints

How:
Since the original patch was dealing with suspend issues move the SMC block dependency back into suspend hooks as was done in V1 of the original patches.
Keep flushing idle work both in suspend and HW fini seuqnces since it's essential in both cases.

Fixes:
2178d3c189b9 drm/amdgpu: add missing cleanups for more ASICs on UVD/VCE suspend ee6679aaa61c drm/amdgpu: add missing cleanups for Polaris12 UVD/VCE on suspend
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c | 24 ++++++++-------  drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c | 24 ++++++++-------  drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c | 24 ++++++++-------  drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c | 32 ++++++++++---------  drivers/gpu/drm/amd/amdgpu/vce_v2_0.c | 19 +++++++-----  drivers/gpu/drm/amd/amdgpu/vce_v3_0.c | 28 +++++++++--------  drivers/gpu/drm/amd/amdgpu/vce_v4_0.c | 44 ++++++++++++++-------------
 7 files changed, 105 insertions(+), 90 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c b/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
index 7232241e3bfb..0fef925b6602 100644
--- a/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
@@ -698,6 +698,19 @@ static int uvd_v3_1_hw_fini(void *handle)  {
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;

+       cancel_delayed_work_sync(&adev->uvd.idle_work);
+
+       if (RREG32(mmUVD_STATUS) != 0)
+               uvd_v3_1_stop(adev);
+
+       return 0;
+}
+
+static int uvd_v3_1_suspend(void *handle) {
+       int r;
+       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
        /*
         * Proper cleanups before halting the HW engine:
         *   - cancel the delayed idle work
@@ -722,17 +735,6 @@ static int uvd_v3_1_hw_fini(void *handle)
                                                       AMD_CG_STATE_GATE);
        }

-       if (RREG32(mmUVD_STATUS) != 0)
-               uvd_v3_1_stop(adev);
-
-       return 0;
-}
-
-static int uvd_v3_1_suspend(void *handle) -{
-       int r;
-       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-
        r = uvd_v3_1_hw_fini(adev);
        if (r)
                return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c b/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
index 52d6de969f46..c108b8381795 100644
--- a/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
@@ -212,6 +212,19 @@ static int uvd_v4_2_hw_fini(void *handle)  {
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;

+       cancel_delayed_work_sync(&adev->uvd.idle_work);
+
+       if (RREG32(mmUVD_STATUS) != 0)
+               uvd_v4_2_stop(adev);
+
+       return 0;
+}
+
+static int uvd_v4_2_suspend(void *handle) {
+       int r;
+       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
        /*
         * Proper cleanups before halting the HW engine:
         *   - cancel the delayed idle work
@@ -236,17 +249,6 @@ static int uvd_v4_2_hw_fini(void *handle)
                                                       AMD_CG_STATE_GATE);
        }

-       if (RREG32(mmUVD_STATUS) != 0)
-               uvd_v4_2_stop(adev);
-
-       return 0;
-}
-
-static int uvd_v4_2_suspend(void *handle) -{
-       int r;
-       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-
        r = uvd_v4_2_hw_fini(adev);
        if (r)
                return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
index db6d06758e4d..563493d1f830 100644
--- a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
@@ -210,6 +210,19 @@ static int uvd_v5_0_hw_fini(void *handle)  {
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;

+       cancel_delayed_work_sync(&adev->uvd.idle_work);
+
+       if (RREG32(mmUVD_STATUS) != 0)
+               uvd_v5_0_stop(adev);
+
+       return 0;
+}
+
+static int uvd_v5_0_suspend(void *handle) {
+       int r;
+       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
        /*
         * Proper cleanups before halting the HW engine:
         *   - cancel the delayed idle work
@@ -234,17 +247,6 @@ static int uvd_v5_0_hw_fini(void *handle)
                                                       AMD_CG_STATE_GATE);
        }

-       if (RREG32(mmUVD_STATUS) != 0)
-               uvd_v5_0_stop(adev);
-
-       return 0;
-}
-
-static int uvd_v5_0_suspend(void *handle) -{
-       int r;
-       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-
        r = uvd_v5_0_hw_fini(adev);
        if (r)
                return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
index b6e82d75561f..1fd9ca21a091 100644
--- a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
@@ -606,6 +606,23 @@ static int uvd_v7_0_hw_fini(void *handle)  {
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;

+       cancel_delayed_work_sync(&adev->uvd.idle_work);
+
+       if (!amdgpu_sriov_vf(adev))
+               uvd_v7_0_stop(adev);
+       else {
+               /* full access mode, so don't touch any UVD register */
+               DRM_DEBUG("For SRIOV client, shouldn't do anything.\n");
+       }
+
+       return 0;
+}
+
+static int uvd_v7_0_suspend(void *handle) {
+       int r;
+       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
        /*
         * Proper cleanups before halting the HW engine:
         *   - cancel the delayed idle work
@@ -630,21 +647,6 @@ static int uvd_v7_0_hw_fini(void *handle)
                                                       AMD_CG_STATE_GATE);
        }

-       if (!amdgpu_sriov_vf(adev))
-               uvd_v7_0_stop(adev);
-       else {
-               /* full access mode, so don't touch any UVD register */
-               DRM_DEBUG("For SRIOV client, shouldn't do anything.\n");
-       }
-
-       return 0;
-}
-
-static int uvd_v7_0_suspend(void *handle) -{
-       int r;
-       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-
        r = uvd_v7_0_hw_fini(adev);
        if (r)
                return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c b/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
index 84e488f189f5..67eb01fef789 100644
--- a/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
@@ -481,6 +481,17 @@ static int vce_v2_0_hw_fini(void *handle)  {
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;

+       cancel_delayed_work_sync(&adev->vce.idle_work);
+
+       return 0;
+}
+
+static int vce_v2_0_suspend(void *handle) {
+       int r;
+       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
+
        /*
         * Proper cleanups before halting the HW engine:
         *   - cancel the delayed idle work
@@ -504,14 +515,6 @@ static int vce_v2_0_hw_fini(void *handle)
                                                       AMD_CG_STATE_GATE);
        }

-       return 0;
-}
-
-static int vce_v2_0_suspend(void *handle) -{
-       int r;
-       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-
        r = vce_v2_0_hw_fini(adev);
        if (r)
                return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
index 2a18c1e089fd..142e291983b4 100644
--- a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
@@ -492,6 +492,21 @@ static int vce_v3_0_hw_fini(void *handle)
        int r;
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;

+       cancel_delayed_work_sync(&adev->vce.idle_work);
+
+       r = vce_v3_0_wait_for_idle(handle);
+       if (r)
+               return r;
+
+       vce_v3_0_stop(adev);
+       return vce_v3_0_set_clockgating_state(adev, AMD_CG_STATE_GATE); }
+
+static int vce_v3_0_suspend(void *handle) {
+       int r;
+       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
        /*
         * Proper cleanups before halting the HW engine:
         *   - cancel the delayed idle work
@@ -515,19 +530,6 @@ static int vce_v3_0_hw_fini(void *handle)
                                                       AMD_CG_STATE_GATE);
        }

-       r = vce_v3_0_wait_for_idle(handle);
-       if (r)
-               return r;
-
-       vce_v3_0_stop(adev);
-       return vce_v3_0_set_clockgating_state(adev, AMD_CG_STATE_GATE);
-}
-
-static int vce_v3_0_suspend(void *handle) -{
-       int r;
-       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-
        r = vce_v3_0_hw_fini(adev);
        if (r)
                return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c b/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
index 044cf9d74b85..226b79254db8 100644
--- a/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
@@ -544,29 +544,8 @@ static int vce_v4_0_hw_fini(void *handle)  {
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;

-       /*
-        * Proper cleanups before halting the HW engine:
-        *   - cancel the delayed idle work
-        *   - enable powergating
-        *   - enable clockgating
-        *   - disable dpm
-        *
-        * TODO: to align with the VCN implementation, move the
-        * jobs for clockgating/powergating/dpm setting to
-        * ->set_powergating_state().
-        */
        cancel_delayed_work_sync(&adev->vce.idle_work);

-       if (adev->pm.dpm_enabled) {
-               amdgpu_dpm_enable_vce(adev, false);
-       } else {
-               amdgpu_asic_set_vce_clocks(adev, 0, 0);
-               amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCE,
-                                                      AMD_PG_STATE_GATE);
-               amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_VCE,
-                                                      AMD_CG_STATE_GATE);
-       }
-
        if (!amdgpu_sriov_vf(adev)) {
                /* vce_v4_0_wait_for_idle(handle); */
                vce_v4_0_stop(adev);
@@ -596,6 +575,29 @@ static int vce_v4_0_suspend(void *handle)
                drm_dev_exit(idx);
        }

+       /*
+        * Proper cleanups before halting the HW engine:
+        *   - cancel the delayed idle work
+        *   - enable powergating
+        *   - enable clockgating
+        *   - disable dpm
+        *
+        * TODO: to align with the VCN implementation, move the
+        * jobs for clockgating/powergating/dpm setting to
+        * ->set_powergating_state().
+        */
+       cancel_delayed_work_sync(&adev->vce.idle_work);
+
+       if (adev->pm.dpm_enabled) {
+               amdgpu_dpm_enable_vce(adev, false);
+       } else {
+               amdgpu_asic_set_vce_clocks(adev, 0, 0);
+               amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCE,
+                                                      AMD_PG_STATE_GATE);
+               amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_VCE,
+                                                      AMD_CG_STATE_GATE);
+       }
+
        r = vce_v4_0_hw_fini(adev);
        if (r)
                return r;
--
2.25.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix crash on device remove/driver unload
  2021-09-15 19:41 [PATCH] drm/amdgpu: Fix crash on device remove/driver unload Andrey Grodzovsky
  2021-09-16  7:26 ` Pan, Xinhui
@ 2021-09-16  8:20 ` Lazar, Lijo
  2021-09-16 15:45   ` Andrey Grodzovsky
  1 sibling, 1 reply; 7+ messages in thread
From: Lazar, Lijo @ 2021-09-16  8:20 UTC (permalink / raw)
  To: Andrey Grodzovsky, amd-gfx; +Cc: evan.quan, xinhui.pan, alexander.deucher

A minor comment below.

On 9/16/2021 1:11 AM, Andrey Grodzovsky wrote:
> Crash:
> BUG: unable to handle page fault for address: 00000000000010e1
> RIP: 0010:vega10_power_gate_vce+0x26/0x50 [amdgpu]
> Call Trace:
> pp_set_powergating_by_smu+0x16a/0x2b0 [amdgpu]
> amdgpu_dpm_set_powergating_by_smu+0x92/0xf0 [amdgpu]
> amdgpu_dpm_enable_vce+0x2e/0xc0 [amdgpu]
> vce_v4_0_hw_fini+0x95/0xa0 [amdgpu]
> amdgpu_device_fini_hw+0x232/0x30d [amdgpu]
> amdgpu_driver_unload_kms+0x5c/0x80 [amdgpu]
> amdgpu_pci_remove+0x27/0x40 [amdgpu]
> pci_device_remove+0x3e/0xb0
> device_release_driver_internal+0x103/0x1d0
> device_release_driver+0x12/0x20
> pci_stop_bus_device+0x79/0xa0
> pci_stop_and_remove_bus_device_locked+0x1b/0x30
> remove_store+0x7b/0x90
> dev_attr_store+0x17/0x30
> sysfs_kf_write+0x4b/0x60
> kernfs_fop_write_iter+0x151/0x1e0
> 
> Why:
> VCE/UVD had dependency on SMC block for their suspend but
> SMC block is the first to do HW fini due to some constraints
> 
> How:
> Since the original patch was dealing with suspend issues
> move the SMC block dependency back into suspend hooks as
> was done in V1 of the original patches.
> Keep flushing idle work both in suspend and HW fini seuqnces
> since it's essential in both cases.
> 
> Fixes:
> 2178d3c189b9 drm/amdgpu: add missing cleanups for more ASICs on UVD/VCE suspend
> ee6679aaa61c drm/amdgpu: add missing cleanups for Polaris12 UVD/VCE on suspend
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c | 24 ++++++++-------
>   drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c | 24 ++++++++-------
>   drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c | 24 ++++++++-------
>   drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c | 32 ++++++++++---------
>   drivers/gpu/drm/amd/amdgpu/vce_v2_0.c | 19 +++++++-----
>   drivers/gpu/drm/amd/amdgpu/vce_v3_0.c | 28 +++++++++--------
>   drivers/gpu/drm/amd/amdgpu/vce_v4_0.c | 44 ++++++++++++++-------------
>   7 files changed, 105 insertions(+), 90 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c b/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
> index 7232241e3bfb..0fef925b6602 100644
> --- a/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
> +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
> @@ -698,6 +698,19 @@ static int uvd_v3_1_hw_fini(void *handle)
>   {
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
> +	cancel_delayed_work_sync(&adev->uvd.idle_work);
> +
> +	if (RREG32(mmUVD_STATUS) != 0)
> +		uvd_v3_1_stop(adev);
> +
> +	return 0;
> +}
> +
> +static int uvd_v3_1_suspend(void *handle)
> +{
> +	int r;
> +	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> +
>   	/*
>   	 * Proper cleanups before halting the HW engine:
>   	 *   - cancel the delayed idle work
> @@ -722,17 +735,6 @@ static int uvd_v3_1_hw_fini(void *handle)
>   						       AMD_CG_STATE_GATE);
>   	}
>   
> -	if (RREG32(mmUVD_STATUS) != 0)
> -		uvd_v3_1_stop(adev);
> -
> -	return 0;
> -}
> -
> -static int uvd_v3_1_suspend(void *handle)
> -{
> -	int r;
> -	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> -
>   	r = uvd_v3_1_hw_fini(adev);

"cosmetic change" comment - hw_fini is supposed to be the final tear 
down call. So instead of suspend calling hw_fini, perhaps it makes sense 
to read the other way - hw_fini just suspends the hardware?

Thanks,
Lijo

>   	if (r)
>   		return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c b/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
> index 52d6de969f46..c108b8381795 100644
> --- a/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
> +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
> @@ -212,6 +212,19 @@ static int uvd_v4_2_hw_fini(void *handle)
>   {
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
> +	cancel_delayed_work_sync(&adev->uvd.idle_work);
> +
> +	if (RREG32(mmUVD_STATUS) != 0)
> +		uvd_v4_2_stop(adev);
> +
> +	return 0;
> +}
> +
> +static int uvd_v4_2_suspend(void *handle)
> +{
> +	int r;
> +	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> +
>   	/*
>   	 * Proper cleanups before halting the HW engine:
>   	 *   - cancel the delayed idle work
> @@ -236,17 +249,6 @@ static int uvd_v4_2_hw_fini(void *handle)
>   						       AMD_CG_STATE_GATE);
>   	}
>   
> -	if (RREG32(mmUVD_STATUS) != 0)
> -		uvd_v4_2_stop(adev);
> -
> -	return 0;
> -}
> -
> -static int uvd_v4_2_suspend(void *handle)
> -{
> -	int r;
> -	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> -
>   	r = uvd_v4_2_hw_fini(adev);
>   	if (r)
>   		return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
> index db6d06758e4d..563493d1f830 100644
> --- a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
> @@ -210,6 +210,19 @@ static int uvd_v5_0_hw_fini(void *handle)
>   {
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
> +	cancel_delayed_work_sync(&adev->uvd.idle_work);
> +
> +	if (RREG32(mmUVD_STATUS) != 0)
> +		uvd_v5_0_stop(adev);
> +
> +	return 0;
> +}
> +
> +static int uvd_v5_0_suspend(void *handle)
> +{
> +	int r;
> +	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> +
>   	/*
>   	 * Proper cleanups before halting the HW engine:
>   	 *   - cancel the delayed idle work
> @@ -234,17 +247,6 @@ static int uvd_v5_0_hw_fini(void *handle)
>   						       AMD_CG_STATE_GATE);
>   	}
>   
> -	if (RREG32(mmUVD_STATUS) != 0)
> -		uvd_v5_0_stop(adev);
> -
> -	return 0;
> -}
> -
> -static int uvd_v5_0_suspend(void *handle)
> -{
> -	int r;
> -	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> -
>   	r = uvd_v5_0_hw_fini(adev);
>   	if (r)
>   		return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
> index b6e82d75561f..1fd9ca21a091 100644
> --- a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
> @@ -606,6 +606,23 @@ static int uvd_v7_0_hw_fini(void *handle)
>   {
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
> +	cancel_delayed_work_sync(&adev->uvd.idle_work);
> +
> +	if (!amdgpu_sriov_vf(adev))
> +		uvd_v7_0_stop(adev);
> +	else {
> +		/* full access mode, so don't touch any UVD register */
> +		DRM_DEBUG("For SRIOV client, shouldn't do anything.\n");
> +	}
> +
> +	return 0;
> +}
> +
> +static int uvd_v7_0_suspend(void *handle)
> +{
> +	int r;
> +	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> +
>   	/*
>   	 * Proper cleanups before halting the HW engine:
>   	 *   - cancel the delayed idle work
> @@ -630,21 +647,6 @@ static int uvd_v7_0_hw_fini(void *handle)
>   						       AMD_CG_STATE_GATE);
>   	}
>   
> -	if (!amdgpu_sriov_vf(adev))
> -		uvd_v7_0_stop(adev);
> -	else {
> -		/* full access mode, so don't touch any UVD register */
> -		DRM_DEBUG("For SRIOV client, shouldn't do anything.\n");
> -	}
> -
> -	return 0;
> -}
> -
> -static int uvd_v7_0_suspend(void *handle)
> -{
> -	int r;
> -	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> -
>   	r = uvd_v7_0_hw_fini(adev);
>   	if (r)
>   		return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c b/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
> index 84e488f189f5..67eb01fef789 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
> @@ -481,6 +481,17 @@ static int vce_v2_0_hw_fini(void *handle)
>   {
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
> +	cancel_delayed_work_sync(&adev->vce.idle_work);
> +
> +	return 0;
> +}
> +
> +static int vce_v2_0_suspend(void *handle)
> +{
> +	int r;
> +	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> +
> +
>   	/*
>   	 * Proper cleanups before halting the HW engine:
>   	 *   - cancel the delayed idle work
> @@ -504,14 +515,6 @@ static int vce_v2_0_hw_fini(void *handle)
>   						       AMD_CG_STATE_GATE);
>   	}
>   
> -	return 0;
> -}
> -
> -static int vce_v2_0_suspend(void *handle)
> -{
> -	int r;
> -	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> -
>   	r = vce_v2_0_hw_fini(adev);
>   	if (r)
>   		return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
> index 2a18c1e089fd..142e291983b4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
> @@ -492,6 +492,21 @@ static int vce_v3_0_hw_fini(void *handle)
>   	int r;
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
> +	cancel_delayed_work_sync(&adev->vce.idle_work);
> +
> +	r = vce_v3_0_wait_for_idle(handle);
> +	if (r)
> +		return r;
> +
> +	vce_v3_0_stop(adev);
> +	return vce_v3_0_set_clockgating_state(adev, AMD_CG_STATE_GATE);
> +}
> +
> +static int vce_v3_0_suspend(void *handle)
> +{
> +	int r;
> +	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> +
>   	/*
>   	 * Proper cleanups before halting the HW engine:
>   	 *   - cancel the delayed idle work
> @@ -515,19 +530,6 @@ static int vce_v3_0_hw_fini(void *handle)
>   						       AMD_CG_STATE_GATE);
>   	}
>   
> -	r = vce_v3_0_wait_for_idle(handle);
> -	if (r)
> -		return r;
> -
> -	vce_v3_0_stop(adev);
> -	return vce_v3_0_set_clockgating_state(adev, AMD_CG_STATE_GATE);
> -}
> -
> -static int vce_v3_0_suspend(void *handle)
> -{
> -	int r;
> -	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> -
>   	r = vce_v3_0_hw_fini(adev);
>   	if (r)
>   		return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c b/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
> index 044cf9d74b85..226b79254db8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
> @@ -544,29 +544,8 @@ static int vce_v4_0_hw_fini(void *handle)
>   {
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
> -	/*
> -	 * Proper cleanups before halting the HW engine:
> -	 *   - cancel the delayed idle work
> -	 *   - enable powergating
> -	 *   - enable clockgating
> -	 *   - disable dpm
> -	 *
> -	 * TODO: to align with the VCN implementation, move the
> -	 * jobs for clockgating/powergating/dpm setting to
> -	 * ->set_powergating_state().
> -	 */
>   	cancel_delayed_work_sync(&adev->vce.idle_work);
>   
> -	if (adev->pm.dpm_enabled) {
> -		amdgpu_dpm_enable_vce(adev, false);
> -	} else {
> -		amdgpu_asic_set_vce_clocks(adev, 0, 0);
> -		amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCE,
> -						       AMD_PG_STATE_GATE);
> -		amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_VCE,
> -						       AMD_CG_STATE_GATE);
> -	}
> -
>   	if (!amdgpu_sriov_vf(adev)) {
>   		/* vce_v4_0_wait_for_idle(handle); */
>   		vce_v4_0_stop(adev);
> @@ -596,6 +575,29 @@ static int vce_v4_0_suspend(void *handle)
>   		drm_dev_exit(idx);
>   	}
>   
> +	/*
> +	 * Proper cleanups before halting the HW engine:
> +	 *   - cancel the delayed idle work
> +	 *   - enable powergating
> +	 *   - enable clockgating
> +	 *   - disable dpm
> +	 *
> +	 * TODO: to align with the VCN implementation, move the
> +	 * jobs for clockgating/powergating/dpm setting to
> +	 * ->set_powergating_state().
> +	 */
> +	cancel_delayed_work_sync(&adev->vce.idle_work);
> +
> +	if (adev->pm.dpm_enabled) {
> +		amdgpu_dpm_enable_vce(adev, false);
> +	} else {
> +		amdgpu_asic_set_vce_clocks(adev, 0, 0);
> +		amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCE,
> +						       AMD_PG_STATE_GATE);
> +		amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_VCE,
> +						       AMD_CG_STATE_GATE);
> +	}
> +
>   	r = vce_v4_0_hw_fini(adev);
>   	if (r)
>   		return r;
> 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix crash on device remove/driver unload
  2021-09-16  8:20 ` Lazar, Lijo
@ 2021-09-16 15:45   ` Andrey Grodzovsky
  2021-09-16 15:51     ` Lazar, Lijo
  0 siblings, 1 reply; 7+ messages in thread
From: Andrey Grodzovsky @ 2021-09-16 15:45 UTC (permalink / raw)
  To: Lazar, Lijo, amd-gfx; +Cc: evan.quan, xinhui.pan, alexander.deucher


On 2021-09-16 4:20 a.m., Lazar, Lijo wrote:
> A minor comment below.
>
> On 9/16/2021 1:11 AM, Andrey Grodzovsky wrote:
>> Crash:
>> BUG: unable to handle page fault for address: 00000000000010e1
>> RIP: 0010:vega10_power_gate_vce+0x26/0x50 [amdgpu]
>> Call Trace:
>> pp_set_powergating_by_smu+0x16a/0x2b0 [amdgpu]
>> amdgpu_dpm_set_powergating_by_smu+0x92/0xf0 [amdgpu]
>> amdgpu_dpm_enable_vce+0x2e/0xc0 [amdgpu]
>> vce_v4_0_hw_fini+0x95/0xa0 [amdgpu]
>> amdgpu_device_fini_hw+0x232/0x30d [amdgpu]
>> amdgpu_driver_unload_kms+0x5c/0x80 [amdgpu]
>> amdgpu_pci_remove+0x27/0x40 [amdgpu]
>> pci_device_remove+0x3e/0xb0
>> device_release_driver_internal+0x103/0x1d0
>> device_release_driver+0x12/0x20
>> pci_stop_bus_device+0x79/0xa0
>> pci_stop_and_remove_bus_device_locked+0x1b/0x30
>> remove_store+0x7b/0x90
>> dev_attr_store+0x17/0x30
>> sysfs_kf_write+0x4b/0x60
>> kernfs_fop_write_iter+0x151/0x1e0
>>
>> Why:
>> VCE/UVD had dependency on SMC block for their suspend but
>> SMC block is the first to do HW fini due to some constraints
>>
>> How:
>> Since the original patch was dealing with suspend issues
>> move the SMC block dependency back into suspend hooks as
>> was done in V1 of the original patches.
>> Keep flushing idle work both in suspend and HW fini seuqnces
>> since it's essential in both cases.
>>
>> Fixes:
>> 2178d3c189b9 drm/amdgpu: add missing cleanups for more ASICs on 
>> UVD/VCE suspend
>> ee6679aaa61c drm/amdgpu: add missing cleanups for Polaris12 UVD/VCE 
>> on suspend
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c | 24 ++++++++-------
>>   drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c | 24 ++++++++-------
>>   drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c | 24 ++++++++-------
>>   drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c | 32 ++++++++++---------
>>   drivers/gpu/drm/amd/amdgpu/vce_v2_0.c | 19 +++++++-----
>>   drivers/gpu/drm/amd/amdgpu/vce_v3_0.c | 28 +++++++++--------
>>   drivers/gpu/drm/amd/amdgpu/vce_v4_0.c | 44 ++++++++++++++-------------
>>   7 files changed, 105 insertions(+), 90 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c 
>> b/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
>> index 7232241e3bfb..0fef925b6602 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
>> @@ -698,6 +698,19 @@ static int uvd_v3_1_hw_fini(void *handle)
>>   {
>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>   +    cancel_delayed_work_sync(&adev->uvd.idle_work);
>> +
>> +    if (RREG32(mmUVD_STATUS) != 0)
>> +        uvd_v3_1_stop(adev);
>> +
>> +    return 0;
>> +}
>> +
>> +static int uvd_v3_1_suspend(void *handle)
>> +{
>> +    int r;
>> +    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> +
>>       /*
>>        * Proper cleanups before halting the HW engine:
>>        *   - cancel the delayed idle work
>> @@ -722,17 +735,6 @@ static int uvd_v3_1_hw_fini(void *handle)
>>                                  AMD_CG_STATE_GATE);
>>       }
>>   -    if (RREG32(mmUVD_STATUS) != 0)
>> -        uvd_v3_1_stop(adev);
>> -
>> -    return 0;
>> -}
>> -
>> -static int uvd_v3_1_suspend(void *handle)
>> -{
>> -    int r;
>> -    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> -
>>       r = uvd_v3_1_hw_fini(adev);
>
> "cosmetic change" comment - hw_fini is supposed to be the final tear 
> down call. So instead of suspend calling hw_fini, perhaps it makes 
> sense to read the other way - hw_fini just suspends the hardware?
>
> Thanks,
> Lijo


Not sure what you mean ?

Andrey


>
>>       if (r)
>>           return r;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c 
>> b/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
>> index 52d6de969f46..c108b8381795 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
>> @@ -212,6 +212,19 @@ static int uvd_v4_2_hw_fini(void *handle)
>>   {
>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>   +    cancel_delayed_work_sync(&adev->uvd.idle_work);
>> +
>> +    if (RREG32(mmUVD_STATUS) != 0)
>> +        uvd_v4_2_stop(adev);
>> +
>> +    return 0;
>> +}
>> +
>> +static int uvd_v4_2_suspend(void *handle)
>> +{
>> +    int r;
>> +    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> +
>>       /*
>>        * Proper cleanups before halting the HW engine:
>>        *   - cancel the delayed idle work
>> @@ -236,17 +249,6 @@ static int uvd_v4_2_hw_fini(void *handle)
>>                                  AMD_CG_STATE_GATE);
>>       }
>>   -    if (RREG32(mmUVD_STATUS) != 0)
>> -        uvd_v4_2_stop(adev);
>> -
>> -    return 0;
>> -}
>> -
>> -static int uvd_v4_2_suspend(void *handle)
>> -{
>> -    int r;
>> -    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> -
>>       r = uvd_v4_2_hw_fini(adev);
>>       if (r)
>>           return r;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c 
>> b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
>> index db6d06758e4d..563493d1f830 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
>> @@ -210,6 +210,19 @@ static int uvd_v5_0_hw_fini(void *handle)
>>   {
>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>   +    cancel_delayed_work_sync(&adev->uvd.idle_work);
>> +
>> +    if (RREG32(mmUVD_STATUS) != 0)
>> +        uvd_v5_0_stop(adev);
>> +
>> +    return 0;
>> +}
>> +
>> +static int uvd_v5_0_suspend(void *handle)
>> +{
>> +    int r;
>> +    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> +
>>       /*
>>        * Proper cleanups before halting the HW engine:
>>        *   - cancel the delayed idle work
>> @@ -234,17 +247,6 @@ static int uvd_v5_0_hw_fini(void *handle)
>>                                  AMD_CG_STATE_GATE);
>>       }
>>   -    if (RREG32(mmUVD_STATUS) != 0)
>> -        uvd_v5_0_stop(adev);
>> -
>> -    return 0;
>> -}
>> -
>> -static int uvd_v5_0_suspend(void *handle)
>> -{
>> -    int r;
>> -    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> -
>>       r = uvd_v5_0_hw_fini(adev);
>>       if (r)
>>           return r;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c 
>> b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
>> index b6e82d75561f..1fd9ca21a091 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
>> @@ -606,6 +606,23 @@ static int uvd_v7_0_hw_fini(void *handle)
>>   {
>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>   +    cancel_delayed_work_sync(&adev->uvd.idle_work);
>> +
>> +    if (!amdgpu_sriov_vf(adev))
>> +        uvd_v7_0_stop(adev);
>> +    else {
>> +        /* full access mode, so don't touch any UVD register */
>> +        DRM_DEBUG("For SRIOV client, shouldn't do anything.\n");
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static int uvd_v7_0_suspend(void *handle)
>> +{
>> +    int r;
>> +    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> +
>>       /*
>>        * Proper cleanups before halting the HW engine:
>>        *   - cancel the delayed idle work
>> @@ -630,21 +647,6 @@ static int uvd_v7_0_hw_fini(void *handle)
>>                                  AMD_CG_STATE_GATE);
>>       }
>>   -    if (!amdgpu_sriov_vf(adev))
>> -        uvd_v7_0_stop(adev);
>> -    else {
>> -        /* full access mode, so don't touch any UVD register */
>> -        DRM_DEBUG("For SRIOV client, shouldn't do anything.\n");
>> -    }
>> -
>> -    return 0;
>> -}
>> -
>> -static int uvd_v7_0_suspend(void *handle)
>> -{
>> -    int r;
>> -    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> -
>>       r = uvd_v7_0_hw_fini(adev);
>>       if (r)
>>           return r;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c 
>> b/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
>> index 84e488f189f5..67eb01fef789 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
>> @@ -481,6 +481,17 @@ static int vce_v2_0_hw_fini(void *handle)
>>   {
>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>   +    cancel_delayed_work_sync(&adev->vce.idle_work);
>> +
>> +    return 0;
>> +}
>> +
>> +static int vce_v2_0_suspend(void *handle)
>> +{
>> +    int r;
>> +    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> +
>> +
>>       /*
>>        * Proper cleanups before halting the HW engine:
>>        *   - cancel the delayed idle work
>> @@ -504,14 +515,6 @@ static int vce_v2_0_hw_fini(void *handle)
>>                                  AMD_CG_STATE_GATE);
>>       }
>>   -    return 0;
>> -}
>> -
>> -static int vce_v2_0_suspend(void *handle)
>> -{
>> -    int r;
>> -    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> -
>>       r = vce_v2_0_hw_fini(adev);
>>       if (r)
>>           return r;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c 
>> b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
>> index 2a18c1e089fd..142e291983b4 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
>> @@ -492,6 +492,21 @@ static int vce_v3_0_hw_fini(void *handle)
>>       int r;
>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>   +    cancel_delayed_work_sync(&adev->vce.idle_work);
>> +
>> +    r = vce_v3_0_wait_for_idle(handle);
>> +    if (r)
>> +        return r;
>> +
>> +    vce_v3_0_stop(adev);
>> +    return vce_v3_0_set_clockgating_state(adev, AMD_CG_STATE_GATE);
>> +}
>> +
>> +static int vce_v3_0_suspend(void *handle)
>> +{
>> +    int r;
>> +    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> +
>>       /*
>>        * Proper cleanups before halting the HW engine:
>>        *   - cancel the delayed idle work
>> @@ -515,19 +530,6 @@ static int vce_v3_0_hw_fini(void *handle)
>>                                  AMD_CG_STATE_GATE);
>>       }
>>   -    r = vce_v3_0_wait_for_idle(handle);
>> -    if (r)
>> -        return r;
>> -
>> -    vce_v3_0_stop(adev);
>> -    return vce_v3_0_set_clockgating_state(adev, AMD_CG_STATE_GATE);
>> -}
>> -
>> -static int vce_v3_0_suspend(void *handle)
>> -{
>> -    int r;
>> -    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> -
>>       r = vce_v3_0_hw_fini(adev);
>>       if (r)
>>           return r;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c 
>> b/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
>> index 044cf9d74b85..226b79254db8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
>> @@ -544,29 +544,8 @@ static int vce_v4_0_hw_fini(void *handle)
>>   {
>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>   -    /*
>> -     * Proper cleanups before halting the HW engine:
>> -     *   - cancel the delayed idle work
>> -     *   - enable powergating
>> -     *   - enable clockgating
>> -     *   - disable dpm
>> -     *
>> -     * TODO: to align with the VCN implementation, move the
>> -     * jobs for clockgating/powergating/dpm setting to
>> -     * ->set_powergating_state().
>> -     */
>>       cancel_delayed_work_sync(&adev->vce.idle_work);
>>   -    if (adev->pm.dpm_enabled) {
>> -        amdgpu_dpm_enable_vce(adev, false);
>> -    } else {
>> -        amdgpu_asic_set_vce_clocks(adev, 0, 0);
>> -        amdgpu_device_ip_set_powergating_state(adev, 
>> AMD_IP_BLOCK_TYPE_VCE,
>> -                               AMD_PG_STATE_GATE);
>> -        amdgpu_device_ip_set_clockgating_state(adev, 
>> AMD_IP_BLOCK_TYPE_VCE,
>> -                               AMD_CG_STATE_GATE);
>> -    }
>> -
>>       if (!amdgpu_sriov_vf(adev)) {
>>           /* vce_v4_0_wait_for_idle(handle); */
>>           vce_v4_0_stop(adev);
>> @@ -596,6 +575,29 @@ static int vce_v4_0_suspend(void *handle)
>>           drm_dev_exit(idx);
>>       }
>>   +    /*
>> +     * Proper cleanups before halting the HW engine:
>> +     *   - cancel the delayed idle work
>> +     *   - enable powergating
>> +     *   - enable clockgating
>> +     *   - disable dpm
>> +     *
>> +     * TODO: to align with the VCN implementation, move the
>> +     * jobs for clockgating/powergating/dpm setting to
>> +     * ->set_powergating_state().
>> +     */
>> +    cancel_delayed_work_sync(&adev->vce.idle_work);
>> +
>> +    if (adev->pm.dpm_enabled) {
>> +        amdgpu_dpm_enable_vce(adev, false);
>> +    } else {
>> +        amdgpu_asic_set_vce_clocks(adev, 0, 0);
>> +        amdgpu_device_ip_set_powergating_state(adev, 
>> AMD_IP_BLOCK_TYPE_VCE,
>> +                               AMD_PG_STATE_GATE);
>> +        amdgpu_device_ip_set_clockgating_state(adev, 
>> AMD_IP_BLOCK_TYPE_VCE,
>> +                               AMD_CG_STATE_GATE);
>> +    }
>> +
>>       r = vce_v4_0_hw_fini(adev);
>>       if (r)
>>           return r;
>>

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix crash on device remove/driver unload
  2021-09-16 15:45   ` Andrey Grodzovsky
@ 2021-09-16 15:51     ` Lazar, Lijo
  2021-09-16 16:17       ` Andrey Grodzovsky
  0 siblings, 1 reply; 7+ messages in thread
From: Lazar, Lijo @ 2021-09-16 15:51 UTC (permalink / raw)
  To: Andrey Grodzovsky, amd-gfx; +Cc: evan.quan, xinhui.pan, alexander.deucher



On 9/16/2021 9:15 PM, Andrey Grodzovsky wrote:
> 
> On 2021-09-16 4:20 a.m., Lazar, Lijo wrote:
>> A minor comment below.
>>
>> On 9/16/2021 1:11 AM, Andrey Grodzovsky wrote:
>>> Crash:
>>> BUG: unable to handle page fault for address: 00000000000010e1
>>> RIP: 0010:vega10_power_gate_vce+0x26/0x50 [amdgpu]
>>> Call Trace:
>>> pp_set_powergating_by_smu+0x16a/0x2b0 [amdgpu]
>>> amdgpu_dpm_set_powergating_by_smu+0x92/0xf0 [amdgpu]
>>> amdgpu_dpm_enable_vce+0x2e/0xc0 [amdgpu]
>>> vce_v4_0_hw_fini+0x95/0xa0 [amdgpu]
>>> amdgpu_device_fini_hw+0x232/0x30d [amdgpu]
>>> amdgpu_driver_unload_kms+0x5c/0x80 [amdgpu]
>>> amdgpu_pci_remove+0x27/0x40 [amdgpu]
>>> pci_device_remove+0x3e/0xb0
>>> device_release_driver_internal+0x103/0x1d0
>>> device_release_driver+0x12/0x20
>>> pci_stop_bus_device+0x79/0xa0
>>> pci_stop_and_remove_bus_device_locked+0x1b/0x30
>>> remove_store+0x7b/0x90
>>> dev_attr_store+0x17/0x30
>>> sysfs_kf_write+0x4b/0x60
>>> kernfs_fop_write_iter+0x151/0x1e0
>>>
>>> Why:
>>> VCE/UVD had dependency on SMC block for their suspend but
>>> SMC block is the first to do HW fini due to some constraints
>>>
>>> How:
>>> Since the original patch was dealing with suspend issues
>>> move the SMC block dependency back into suspend hooks as
>>> was done in V1 of the original patches.
>>> Keep flushing idle work both in suspend and HW fini seuqnces
>>> since it's essential in both cases.
>>>
>>> Fixes:
>>> 2178d3c189b9 drm/amdgpu: add missing cleanups for more ASICs on 
>>> UVD/VCE suspend
>>> ee6679aaa61c drm/amdgpu: add missing cleanups for Polaris12 UVD/VCE 
>>> on suspend
>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c | 24 ++++++++-------
>>>   drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c | 24 ++++++++-------
>>>   drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c | 24 ++++++++-------
>>>   drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c | 32 ++++++++++---------
>>>   drivers/gpu/drm/amd/amdgpu/vce_v2_0.c | 19 +++++++-----
>>>   drivers/gpu/drm/amd/amdgpu/vce_v3_0.c | 28 +++++++++--------
>>>   drivers/gpu/drm/amd/amdgpu/vce_v4_0.c | 44 ++++++++++++++-------------
>>>   7 files changed, 105 insertions(+), 90 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c 
>>> b/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
>>> index 7232241e3bfb..0fef925b6602 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
>>> @@ -698,6 +698,19 @@ static int uvd_v3_1_hw_fini(void *handle)
>>>   {
>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>   +    cancel_delayed_work_sync(&adev->uvd.idle_work);
>>> +
>>> +    if (RREG32(mmUVD_STATUS) != 0)
>>> +        uvd_v3_1_stop(adev);
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static int uvd_v3_1_suspend(void *handle)
>>> +{
>>> +    int r;
>>> +    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>> +
>>>       /*
>>>        * Proper cleanups before halting the HW engine:
>>>        *   - cancel the delayed idle work
>>> @@ -722,17 +735,6 @@ static int uvd_v3_1_hw_fini(void *handle)
>>>                                  AMD_CG_STATE_GATE);
>>>       }
>>>   -    if (RREG32(mmUVD_STATUS) != 0)
>>> -        uvd_v3_1_stop(adev);
>>> -
>>> -    return 0;
>>> -}
>>> -
>>> -static int uvd_v3_1_suspend(void *handle)
>>> -{
>>> -    int r;
>>> -    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>> -
>>>       r = uvd_v3_1_hw_fini(adev);
>>
>> "cosmetic change" comment - hw_fini is supposed to be the final tear 
>> down call. So instead of suspend calling hw_fini, perhaps it makes 
>> sense to read the other way - hw_fini just suspends the hardware?
>>
>> Thanks,
>> Lijo
> 
> 
> Not sure what you mean ?

Now it is - suspend()-> calls hw_fini()

What I meant is hw_fini() -> calls suspend() and that is read as "to do 
hw_fini() only suspend the hardware and nothing extra is needed".

In short implementation stays in suspend() and hw_fini() calls suspend().

Thanks,
Lijo

> 
> Andrey
> 
> 
>>
>>>       if (r)
>>>           return r;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c 
>>> b/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
>>> index 52d6de969f46..c108b8381795 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
>>> @@ -212,6 +212,19 @@ static int uvd_v4_2_hw_fini(void *handle)
>>>   {
>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>   +    cancel_delayed_work_sync(&adev->uvd.idle_work);
>>> +
>>> +    if (RREG32(mmUVD_STATUS) != 0)
>>> +        uvd_v4_2_stop(adev);
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static int uvd_v4_2_suspend(void *handle)
>>> +{
>>> +    int r;
>>> +    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>> +
>>>       /*
>>>        * Proper cleanups before halting the HW engine:
>>>        *   - cancel the delayed idle work
>>> @@ -236,17 +249,6 @@ static int uvd_v4_2_hw_fini(void *handle)
>>>                                  AMD_CG_STATE_GATE);
>>>       }
>>>   -    if (RREG32(mmUVD_STATUS) != 0)
>>> -        uvd_v4_2_stop(adev);
>>> -
>>> -    return 0;
>>> -}
>>> -
>>> -static int uvd_v4_2_suspend(void *handle)
>>> -{
>>> -    int r;
>>> -    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>> -
>>>       r = uvd_v4_2_hw_fini(adev);
>>>       if (r)
>>>           return r;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c 
>>> b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
>>> index db6d06758e4d..563493d1f830 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
>>> @@ -210,6 +210,19 @@ static int uvd_v5_0_hw_fini(void *handle)
>>>   {
>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>   +    cancel_delayed_work_sync(&adev->uvd.idle_work);
>>> +
>>> +    if (RREG32(mmUVD_STATUS) != 0)
>>> +        uvd_v5_0_stop(adev);
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static int uvd_v5_0_suspend(void *handle)
>>> +{
>>> +    int r;
>>> +    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>> +
>>>       /*
>>>        * Proper cleanups before halting the HW engine:
>>>        *   - cancel the delayed idle work
>>> @@ -234,17 +247,6 @@ static int uvd_v5_0_hw_fini(void *handle)
>>>                                  AMD_CG_STATE_GATE);
>>>       }
>>>   -    if (RREG32(mmUVD_STATUS) != 0)
>>> -        uvd_v5_0_stop(adev);
>>> -
>>> -    return 0;
>>> -}
>>> -
>>> -static int uvd_v5_0_suspend(void *handle)
>>> -{
>>> -    int r;
>>> -    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>> -
>>>       r = uvd_v5_0_hw_fini(adev);
>>>       if (r)
>>>           return r;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c 
>>> b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
>>> index b6e82d75561f..1fd9ca21a091 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
>>> @@ -606,6 +606,23 @@ static int uvd_v7_0_hw_fini(void *handle)
>>>   {
>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>   +    cancel_delayed_work_sync(&adev->uvd.idle_work);
>>> +
>>> +    if (!amdgpu_sriov_vf(adev))
>>> +        uvd_v7_0_stop(adev);
>>> +    else {
>>> +        /* full access mode, so don't touch any UVD register */
>>> +        DRM_DEBUG("For SRIOV client, shouldn't do anything.\n");
>>> +    }
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static int uvd_v7_0_suspend(void *handle)
>>> +{
>>> +    int r;
>>> +    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>> +
>>>       /*
>>>        * Proper cleanups before halting the HW engine:
>>>        *   - cancel the delayed idle work
>>> @@ -630,21 +647,6 @@ static int uvd_v7_0_hw_fini(void *handle)
>>>                                  AMD_CG_STATE_GATE);
>>>       }
>>>   -    if (!amdgpu_sriov_vf(adev))
>>> -        uvd_v7_0_stop(adev);
>>> -    else {
>>> -        /* full access mode, so don't touch any UVD register */
>>> -        DRM_DEBUG("For SRIOV client, shouldn't do anything.\n");
>>> -    }
>>> -
>>> -    return 0;
>>> -}
>>> -
>>> -static int uvd_v7_0_suspend(void *handle)
>>> -{
>>> -    int r;
>>> -    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>> -
>>>       r = uvd_v7_0_hw_fini(adev);
>>>       if (r)
>>>           return r;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c 
>>> b/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
>>> index 84e488f189f5..67eb01fef789 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
>>> @@ -481,6 +481,17 @@ static int vce_v2_0_hw_fini(void *handle)
>>>   {
>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>   +    cancel_delayed_work_sync(&adev->vce.idle_work);
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static int vce_v2_0_suspend(void *handle)
>>> +{
>>> +    int r;
>>> +    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>> +
>>> +
>>>       /*
>>>        * Proper cleanups before halting the HW engine:
>>>        *   - cancel the delayed idle work
>>> @@ -504,14 +515,6 @@ static int vce_v2_0_hw_fini(void *handle)
>>>                                  AMD_CG_STATE_GATE);
>>>       }
>>>   -    return 0;
>>> -}
>>> -
>>> -static int vce_v2_0_suspend(void *handle)
>>> -{
>>> -    int r;
>>> -    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>> -
>>>       r = vce_v2_0_hw_fini(adev);
>>>       if (r)
>>>           return r;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c 
>>> b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
>>> index 2a18c1e089fd..142e291983b4 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
>>> @@ -492,6 +492,21 @@ static int vce_v3_0_hw_fini(void *handle)
>>>       int r;
>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>   +    cancel_delayed_work_sync(&adev->vce.idle_work);
>>> +
>>> +    r = vce_v3_0_wait_for_idle(handle);
>>> +    if (r)
>>> +        return r;
>>> +
>>> +    vce_v3_0_stop(adev);
>>> +    return vce_v3_0_set_clockgating_state(adev, AMD_CG_STATE_GATE);
>>> +}
>>> +
>>> +static int vce_v3_0_suspend(void *handle)
>>> +{
>>> +    int r;
>>> +    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>> +
>>>       /*
>>>        * Proper cleanups before halting the HW engine:
>>>        *   - cancel the delayed idle work
>>> @@ -515,19 +530,6 @@ static int vce_v3_0_hw_fini(void *handle)
>>>                                  AMD_CG_STATE_GATE);
>>>       }
>>>   -    r = vce_v3_0_wait_for_idle(handle);
>>> -    if (r)
>>> -        return r;
>>> -
>>> -    vce_v3_0_stop(adev);
>>> -    return vce_v3_0_set_clockgating_state(adev, AMD_CG_STATE_GATE);
>>> -}
>>> -
>>> -static int vce_v3_0_suspend(void *handle)
>>> -{
>>> -    int r;
>>> -    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>> -
>>>       r = vce_v3_0_hw_fini(adev);
>>>       if (r)
>>>           return r;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c 
>>> b/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
>>> index 044cf9d74b85..226b79254db8 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
>>> @@ -544,29 +544,8 @@ static int vce_v4_0_hw_fini(void *handle)
>>>   {
>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>   -    /*
>>> -     * Proper cleanups before halting the HW engine:
>>> -     *   - cancel the delayed idle work
>>> -     *   - enable powergating
>>> -     *   - enable clockgating
>>> -     *   - disable dpm
>>> -     *
>>> -     * TODO: to align with the VCN implementation, move the
>>> -     * jobs for clockgating/powergating/dpm setting to
>>> -     * ->set_powergating_state().
>>> -     */
>>>       cancel_delayed_work_sync(&adev->vce.idle_work);
>>>   -    if (adev->pm.dpm_enabled) {
>>> -        amdgpu_dpm_enable_vce(adev, false);
>>> -    } else {
>>> -        amdgpu_asic_set_vce_clocks(adev, 0, 0);
>>> -        amdgpu_device_ip_set_powergating_state(adev, 
>>> AMD_IP_BLOCK_TYPE_VCE,
>>> -                               AMD_PG_STATE_GATE);
>>> -        amdgpu_device_ip_set_clockgating_state(adev, 
>>> AMD_IP_BLOCK_TYPE_VCE,
>>> -                               AMD_CG_STATE_GATE);
>>> -    }
>>> -
>>>       if (!amdgpu_sriov_vf(adev)) {
>>>           /* vce_v4_0_wait_for_idle(handle); */
>>>           vce_v4_0_stop(adev);
>>> @@ -596,6 +575,29 @@ static int vce_v4_0_suspend(void *handle)
>>>           drm_dev_exit(idx);
>>>       }
>>>   +    /*
>>> +     * Proper cleanups before halting the HW engine:
>>> +     *   - cancel the delayed idle work
>>> +     *   - enable powergating
>>> +     *   - enable clockgating
>>> +     *   - disable dpm
>>> +     *
>>> +     * TODO: to align with the VCN implementation, move the
>>> +     * jobs for clockgating/powergating/dpm setting to
>>> +     * ->set_powergating_state().
>>> +     */
>>> +    cancel_delayed_work_sync(&adev->vce.idle_work);
>>> +
>>> +    if (adev->pm.dpm_enabled) {
>>> +        amdgpu_dpm_enable_vce(adev, false);
>>> +    } else {
>>> +        amdgpu_asic_set_vce_clocks(adev, 0, 0);
>>> +        amdgpu_device_ip_set_powergating_state(adev, 
>>> AMD_IP_BLOCK_TYPE_VCE,
>>> +                               AMD_PG_STATE_GATE);
>>> +        amdgpu_device_ip_set_clockgating_state(adev, 
>>> AMD_IP_BLOCK_TYPE_VCE,
>>> +                               AMD_CG_STATE_GATE);
>>> +    }
>>> +
>>>       r = vce_v4_0_hw_fini(adev);
>>>       if (r)
>>>           return r;
>>>

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix crash on device remove/driver unload
  2021-09-16 15:51     ` Lazar, Lijo
@ 2021-09-16 16:17       ` Andrey Grodzovsky
  2021-09-16 16:40         ` Lazar, Lijo
  0 siblings, 1 reply; 7+ messages in thread
From: Andrey Grodzovsky @ 2021-09-16 16:17 UTC (permalink / raw)
  To: Lazar, Lijo, amd-gfx; +Cc: evan.quan, xinhui.pan, alexander.deucher


On 2021-09-16 11:51 a.m., Lazar, Lijo wrote:
>
>
> On 9/16/2021 9:15 PM, Andrey Grodzovsky wrote:
>>
>> On 2021-09-16 4:20 a.m., Lazar, Lijo wrote:
>>> A minor comment below.
>>>
>>> On 9/16/2021 1:11 AM, Andrey Grodzovsky wrote:
>>>> Crash:
>>>> BUG: unable to handle page fault for address: 00000000000010e1
>>>> RIP: 0010:vega10_power_gate_vce+0x26/0x50 [amdgpu]
>>>> Call Trace:
>>>> pp_set_powergating_by_smu+0x16a/0x2b0 [amdgpu]
>>>> amdgpu_dpm_set_powergating_by_smu+0x92/0xf0 [amdgpu]
>>>> amdgpu_dpm_enable_vce+0x2e/0xc0 [amdgpu]
>>>> vce_v4_0_hw_fini+0x95/0xa0 [amdgpu]
>>>> amdgpu_device_fini_hw+0x232/0x30d [amdgpu]
>>>> amdgpu_driver_unload_kms+0x5c/0x80 [amdgpu]
>>>> amdgpu_pci_remove+0x27/0x40 [amdgpu]
>>>> pci_device_remove+0x3e/0xb0
>>>> device_release_driver_internal+0x103/0x1d0
>>>> device_release_driver+0x12/0x20
>>>> pci_stop_bus_device+0x79/0xa0
>>>> pci_stop_and_remove_bus_device_locked+0x1b/0x30
>>>> remove_store+0x7b/0x90
>>>> dev_attr_store+0x17/0x30
>>>> sysfs_kf_write+0x4b/0x60
>>>> kernfs_fop_write_iter+0x151/0x1e0
>>>>
>>>> Why:
>>>> VCE/UVD had dependency on SMC block for their suspend but
>>>> SMC block is the first to do HW fini due to some constraints
>>>>
>>>> How:
>>>> Since the original patch was dealing with suspend issues
>>>> move the SMC block dependency back into suspend hooks as
>>>> was done in V1 of the original patches.
>>>> Keep flushing idle work both in suspend and HW fini seuqnces
>>>> since it's essential in both cases.
>>>>
>>>> Fixes:
>>>> 2178d3c189b9 drm/amdgpu: add missing cleanups for more ASICs on 
>>>> UVD/VCE suspend
>>>> ee6679aaa61c drm/amdgpu: add missing cleanups for Polaris12 UVD/VCE 
>>>> on suspend
>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c | 24 ++++++++-------
>>>>   drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c | 24 ++++++++-------
>>>>   drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c | 24 ++++++++-------
>>>>   drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c | 32 ++++++++++---------
>>>>   drivers/gpu/drm/amd/amdgpu/vce_v2_0.c | 19 +++++++-----
>>>>   drivers/gpu/drm/amd/amdgpu/vce_v3_0.c | 28 +++++++++--------
>>>>   drivers/gpu/drm/amd/amdgpu/vce_v4_0.c | 44 
>>>> ++++++++++++++-------------
>>>>   7 files changed, 105 insertions(+), 90 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
>>>> index 7232241e3bfb..0fef925b6602 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
>>>> @@ -698,6 +698,19 @@ static int uvd_v3_1_hw_fini(void *handle)
>>>>   {
>>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>>   + cancel_delayed_work_sync(&adev->uvd.idle_work);
>>>> +
>>>> +    if (RREG32(mmUVD_STATUS) != 0)
>>>> +        uvd_v3_1_stop(adev);
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static int uvd_v3_1_suspend(void *handle)
>>>> +{
>>>> +    int r;
>>>> +    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>> +
>>>>       /*
>>>>        * Proper cleanups before halting the HW engine:
>>>>        *   - cancel the delayed idle work
>>>> @@ -722,17 +735,6 @@ static int uvd_v3_1_hw_fini(void *handle)
>>>>                                  AMD_CG_STATE_GATE);
>>>>       }
>>>>   -    if (RREG32(mmUVD_STATUS) != 0)
>>>> -        uvd_v3_1_stop(adev);
>>>> -
>>>> -    return 0;
>>>> -}
>>>> -
>>>> -static int uvd_v3_1_suspend(void *handle)
>>>> -{
>>>> -    int r;
>>>> -    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>> -
>>>>       r = uvd_v3_1_hw_fini(adev);
>>>
>>> "cosmetic change" comment - hw_fini is supposed to be the final tear 
>>> down call. So instead of suspend calling hw_fini, perhaps it makes 
>>> sense to read the other way - hw_fini just suspends the hardware?
>>>
>>> Thanks,
>>> Lijo
>>
>>
>> Not sure what you mean ?
>
> Now it is - suspend()-> calls hw_fini()
>
> What I meant is hw_fini() -> calls suspend() and that is read as "to 
> do hw_fini() only suspend the hardware and nothing extra is needed".
>
> In short implementation stays in suspend() and hw_fini() calls suspend().


Sorry, still confused, what about amdgpu_vce_suspend being called from 
vce_v4_0_suspend for example - we don't want that to be called from hw_fini.
Can you maybe show draft change of what you mean for one specific UVD or 
VCE version ?

Andrey


>
> Thanks,
> Lijo
>
>>
>> Andrey
>>
>>
>>>
>>>>       if (r)
>>>>           return r;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
>>>> index 52d6de969f46..c108b8381795 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
>>>> @@ -212,6 +212,19 @@ static int uvd_v4_2_hw_fini(void *handle)
>>>>   {
>>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>>   + cancel_delayed_work_sync(&adev->uvd.idle_work);
>>>> +
>>>> +    if (RREG32(mmUVD_STATUS) != 0)
>>>> +        uvd_v4_2_stop(adev);
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static int uvd_v4_2_suspend(void *handle)
>>>> +{
>>>> +    int r;
>>>> +    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>> +
>>>>       /*
>>>>        * Proper cleanups before halting the HW engine:
>>>>        *   - cancel the delayed idle work
>>>> @@ -236,17 +249,6 @@ static int uvd_v4_2_hw_fini(void *handle)
>>>>                                  AMD_CG_STATE_GATE);
>>>>       }
>>>>   -    if (RREG32(mmUVD_STATUS) != 0)
>>>> -        uvd_v4_2_stop(adev);
>>>> -
>>>> -    return 0;
>>>> -}
>>>> -
>>>> -static int uvd_v4_2_suspend(void *handle)
>>>> -{
>>>> -    int r;
>>>> -    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>> -
>>>>       r = uvd_v4_2_hw_fini(adev);
>>>>       if (r)
>>>>           return r;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
>>>> index db6d06758e4d..563493d1f830 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
>>>> @@ -210,6 +210,19 @@ static int uvd_v5_0_hw_fini(void *handle)
>>>>   {
>>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>>   + cancel_delayed_work_sync(&adev->uvd.idle_work);
>>>> +
>>>> +    if (RREG32(mmUVD_STATUS) != 0)
>>>> +        uvd_v5_0_stop(adev);
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static int uvd_v5_0_suspend(void *handle)
>>>> +{
>>>> +    int r;
>>>> +    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>> +
>>>>       /*
>>>>        * Proper cleanups before halting the HW engine:
>>>>        *   - cancel the delayed idle work
>>>> @@ -234,17 +247,6 @@ static int uvd_v5_0_hw_fini(void *handle)
>>>>                                  AMD_CG_STATE_GATE);
>>>>       }
>>>>   -    if (RREG32(mmUVD_STATUS) != 0)
>>>> -        uvd_v5_0_stop(adev);
>>>> -
>>>> -    return 0;
>>>> -}
>>>> -
>>>> -static int uvd_v5_0_suspend(void *handle)
>>>> -{
>>>> -    int r;
>>>> -    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>> -
>>>>       r = uvd_v5_0_hw_fini(adev);
>>>>       if (r)
>>>>           return r;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
>>>> index b6e82d75561f..1fd9ca21a091 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
>>>> @@ -606,6 +606,23 @@ static int uvd_v7_0_hw_fini(void *handle)
>>>>   {
>>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>>   + cancel_delayed_work_sync(&adev->uvd.idle_work);
>>>> +
>>>> +    if (!amdgpu_sriov_vf(adev))
>>>> +        uvd_v7_0_stop(adev);
>>>> +    else {
>>>> +        /* full access mode, so don't touch any UVD register */
>>>> +        DRM_DEBUG("For SRIOV client, shouldn't do anything.\n");
>>>> +    }
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static int uvd_v7_0_suspend(void *handle)
>>>> +{
>>>> +    int r;
>>>> +    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>> +
>>>>       /*
>>>>        * Proper cleanups before halting the HW engine:
>>>>        *   - cancel the delayed idle work
>>>> @@ -630,21 +647,6 @@ static int uvd_v7_0_hw_fini(void *handle)
>>>>                                  AMD_CG_STATE_GATE);
>>>>       }
>>>>   -    if (!amdgpu_sriov_vf(adev))
>>>> -        uvd_v7_0_stop(adev);
>>>> -    else {
>>>> -        /* full access mode, so don't touch any UVD register */
>>>> -        DRM_DEBUG("For SRIOV client, shouldn't do anything.\n");
>>>> -    }
>>>> -
>>>> -    return 0;
>>>> -}
>>>> -
>>>> -static int uvd_v7_0_suspend(void *handle)
>>>> -{
>>>> -    int r;
>>>> -    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>> -
>>>>       r = uvd_v7_0_hw_fini(adev);
>>>>       if (r)
>>>>           return r;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
>>>> index 84e488f189f5..67eb01fef789 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
>>>> @@ -481,6 +481,17 @@ static int vce_v2_0_hw_fini(void *handle)
>>>>   {
>>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>>   + cancel_delayed_work_sync(&adev->vce.idle_work);
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static int vce_v2_0_suspend(void *handle)
>>>> +{
>>>> +    int r;
>>>> +    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>> +
>>>> +
>>>>       /*
>>>>        * Proper cleanups before halting the HW engine:
>>>>        *   - cancel the delayed idle work
>>>> @@ -504,14 +515,6 @@ static int vce_v2_0_hw_fini(void *handle)
>>>>                                  AMD_CG_STATE_GATE);
>>>>       }
>>>>   -    return 0;
>>>> -}
>>>> -
>>>> -static int vce_v2_0_suspend(void *handle)
>>>> -{
>>>> -    int r;
>>>> -    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>> -
>>>>       r = vce_v2_0_hw_fini(adev);
>>>>       if (r)
>>>>           return r;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
>>>> index 2a18c1e089fd..142e291983b4 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
>>>> @@ -492,6 +492,21 @@ static int vce_v3_0_hw_fini(void *handle)
>>>>       int r;
>>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>>   + cancel_delayed_work_sync(&adev->vce.idle_work);
>>>> +
>>>> +    r = vce_v3_0_wait_for_idle(handle);
>>>> +    if (r)
>>>> +        return r;
>>>> +
>>>> +    vce_v3_0_stop(adev);
>>>> +    return vce_v3_0_set_clockgating_state(adev, AMD_CG_STATE_GATE);
>>>> +}
>>>> +
>>>> +static int vce_v3_0_suspend(void *handle)
>>>> +{
>>>> +    int r;
>>>> +    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>> +
>>>>       /*
>>>>        * Proper cleanups before halting the HW engine:
>>>>        *   - cancel the delayed idle work
>>>> @@ -515,19 +530,6 @@ static int vce_v3_0_hw_fini(void *handle)
>>>>                                  AMD_CG_STATE_GATE);
>>>>       }
>>>>   -    r = vce_v3_0_wait_for_idle(handle);
>>>> -    if (r)
>>>> -        return r;
>>>> -
>>>> -    vce_v3_0_stop(adev);
>>>> -    return vce_v3_0_set_clockgating_state(adev, AMD_CG_STATE_GATE);
>>>> -}
>>>> -
>>>> -static int vce_v3_0_suspend(void *handle)
>>>> -{
>>>> -    int r;
>>>> -    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>> -
>>>>       r = vce_v3_0_hw_fini(adev);
>>>>       if (r)
>>>>           return r;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
>>>> index 044cf9d74b85..226b79254db8 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
>>>> @@ -544,29 +544,8 @@ static int vce_v4_0_hw_fini(void *handle)
>>>>   {
>>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>>   -    /*
>>>> -     * Proper cleanups before halting the HW engine:
>>>> -     *   - cancel the delayed idle work
>>>> -     *   - enable powergating
>>>> -     *   - enable clockgating
>>>> -     *   - disable dpm
>>>> -     *
>>>> -     * TODO: to align with the VCN implementation, move the
>>>> -     * jobs for clockgating/powergating/dpm setting to
>>>> -     * ->set_powergating_state().
>>>> -     */
>>>>       cancel_delayed_work_sync(&adev->vce.idle_work);
>>>>   -    if (adev->pm.dpm_enabled) {
>>>> -        amdgpu_dpm_enable_vce(adev, false);
>>>> -    } else {
>>>> -        amdgpu_asic_set_vce_clocks(adev, 0, 0);
>>>> -        amdgpu_device_ip_set_powergating_state(adev, 
>>>> AMD_IP_BLOCK_TYPE_VCE,
>>>> -                               AMD_PG_STATE_GATE);
>>>> -        amdgpu_device_ip_set_clockgating_state(adev, 
>>>> AMD_IP_BLOCK_TYPE_VCE,
>>>> -                               AMD_CG_STATE_GATE);
>>>> -    }
>>>> -
>>>>       if (!amdgpu_sriov_vf(adev)) {
>>>>           /* vce_v4_0_wait_for_idle(handle); */
>>>>           vce_v4_0_stop(adev);
>>>> @@ -596,6 +575,29 @@ static int vce_v4_0_suspend(void *handle)
>>>>           drm_dev_exit(idx);
>>>>       }
>>>>   +    /*
>>>> +     * Proper cleanups before halting the HW engine:
>>>> +     *   - cancel the delayed idle work
>>>> +     *   - enable powergating
>>>> +     *   - enable clockgating
>>>> +     *   - disable dpm
>>>> +     *
>>>> +     * TODO: to align with the VCN implementation, move the
>>>> +     * jobs for clockgating/powergating/dpm setting to
>>>> +     * ->set_powergating_state().
>>>> +     */
>>>> +    cancel_delayed_work_sync(&adev->vce.idle_work);
>>>> +
>>>> +    if (adev->pm.dpm_enabled) {
>>>> +        amdgpu_dpm_enable_vce(adev, false);
>>>> +    } else {
>>>> +        amdgpu_asic_set_vce_clocks(adev, 0, 0);
>>>> +        amdgpu_device_ip_set_powergating_state(adev, 
>>>> AMD_IP_BLOCK_TYPE_VCE,
>>>> +                               AMD_PG_STATE_GATE);
>>>> +        amdgpu_device_ip_set_clockgating_state(adev, 
>>>> AMD_IP_BLOCK_TYPE_VCE,
>>>> +                               AMD_CG_STATE_GATE);
>>>> +    }
>>>> +
>>>>       r = vce_v4_0_hw_fini(adev);
>>>>       if (r)
>>>>           return r;
>>>>

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix crash on device remove/driver unload
  2021-09-16 16:17       ` Andrey Grodzovsky
@ 2021-09-16 16:40         ` Lazar, Lijo
  0 siblings, 0 replies; 7+ messages in thread
From: Lazar, Lijo @ 2021-09-16 16:40 UTC (permalink / raw)
  To: Grodzovsky, Andrey, amd-gfx; +Cc: Quan, Evan, Pan, Xinhui, Deucher, Alexander

[-- Attachment #1: Type: text/plain, Size: 15536 bytes --]

[Public]

Ah.. didn't notice that. As I mentioned, it's mainly a cosmetic/semantic thing. AFAIU, hw_fini() is expected to be called once when hw is removed/reset or driver is removed. Suspend is temporary and may be called multiple times during the lifetime. So calling hw_fini() from suspend() appears a bit odd (probably just for me).

If it's worth, something like this -

vce_4_2_suspend() {

__vce_4_2_suspend();
....
// Anything extra
}

vce_4_2_hw_fini() {

__vce_4_2_suspend();

}

Thanks,
Lijo
________________________________
From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Sent: Thursday, September 16, 2021 9:47:41 PM
To: Lazar, Lijo <Lijo.Lazar@amd.com>; amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
Cc: Quan, Evan <Evan.Quan@amd.com>; Pan, Xinhui <Xinhui.Pan@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>
Subject: Re: [PATCH] drm/amdgpu: Fix crash on device remove/driver unload


On 2021-09-16 11:51 a.m., Lazar, Lijo wrote:
>
>
> On 9/16/2021 9:15 PM, Andrey Grodzovsky wrote:
>>
>> On 2021-09-16 4:20 a.m., Lazar, Lijo wrote:
>>> A minor comment below.
>>>
>>> On 9/16/2021 1:11 AM, Andrey Grodzovsky wrote:
>>>> Crash:
>>>> BUG: unable to handle page fault for address: 00000000000010e1
>>>> RIP: 0010:vega10_power_gate_vce+0x26/0x50 [amdgpu]
>>>> Call Trace:
>>>> pp_set_powergating_by_smu+0x16a/0x2b0 [amdgpu]
>>>> amdgpu_dpm_set_powergating_by_smu+0x92/0xf0 [amdgpu]
>>>> amdgpu_dpm_enable_vce+0x2e/0xc0 [amdgpu]
>>>> vce_v4_0_hw_fini+0x95/0xa0 [amdgpu]
>>>> amdgpu_device_fini_hw+0x232/0x30d [amdgpu]
>>>> amdgpu_driver_unload_kms+0x5c/0x80 [amdgpu]
>>>> amdgpu_pci_remove+0x27/0x40 [amdgpu]
>>>> pci_device_remove+0x3e/0xb0
>>>> device_release_driver_internal+0x103/0x1d0
>>>> device_release_driver+0x12/0x20
>>>> pci_stop_bus_device+0x79/0xa0
>>>> pci_stop_and_remove_bus_device_locked+0x1b/0x30
>>>> remove_store+0x7b/0x90
>>>> dev_attr_store+0x17/0x30
>>>> sysfs_kf_write+0x4b/0x60
>>>> kernfs_fop_write_iter+0x151/0x1e0
>>>>
>>>> Why:
>>>> VCE/UVD had dependency on SMC block for their suspend but
>>>> SMC block is the first to do HW fini due to some constraints
>>>>
>>>> How:
>>>> Since the original patch was dealing with suspend issues
>>>> move the SMC block dependency back into suspend hooks as
>>>> was done in V1 of the original patches.
>>>> Keep flushing idle work both in suspend and HW fini seuqnces
>>>> since it's essential in both cases.
>>>>
>>>> Fixes:
>>>> 2178d3c189b9 drm/amdgpu: add missing cleanups for more ASICs on
>>>> UVD/VCE suspend
>>>> ee6679aaa61c drm/amdgpu: add missing cleanups for Polaris12 UVD/VCE
>>>> on suspend
>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c | 24 ++++++++-------
>>>>   drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c | 24 ++++++++-------
>>>>   drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c | 24 ++++++++-------
>>>>   drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c | 32 ++++++++++---------
>>>>   drivers/gpu/drm/amd/amdgpu/vce_v2_0.c | 19 +++++++-----
>>>>   drivers/gpu/drm/amd/amdgpu/vce_v3_0.c | 28 +++++++++--------
>>>>   drivers/gpu/drm/amd/amdgpu/vce_v4_0.c | 44
>>>> ++++++++++++++-------------
>>>>   7 files changed, 105 insertions(+), 90 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
>>>> b/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
>>>> index 7232241e3bfb..0fef925b6602 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
>>>> @@ -698,6 +698,19 @@ static int uvd_v3_1_hw_fini(void *handle)
>>>>   {
>>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>>   + cancel_delayed_work_sync(&adev->uvd.idle_work);
>>>> +
>>>> +    if (RREG32(mmUVD_STATUS) != 0)
>>>> +        uvd_v3_1_stop(adev);
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static int uvd_v3_1_suspend(void *handle)
>>>> +{
>>>> +    int r;
>>>> +    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>> +
>>>>       /*
>>>>        * Proper cleanups before halting the HW engine:
>>>>        *   - cancel the delayed idle work
>>>> @@ -722,17 +735,6 @@ static int uvd_v3_1_hw_fini(void *handle)
>>>>                                  AMD_CG_STATE_GATE);
>>>>       }
>>>>   -    if (RREG32(mmUVD_STATUS) != 0)
>>>> -        uvd_v3_1_stop(adev);
>>>> -
>>>> -    return 0;
>>>> -}
>>>> -
>>>> -static int uvd_v3_1_suspend(void *handle)
>>>> -{
>>>> -    int r;
>>>> -    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>> -
>>>>       r = uvd_v3_1_hw_fini(adev);
>>>
>>> "cosmetic change" comment - hw_fini is supposed to be the final tear
>>> down call. So instead of suspend calling hw_fini, perhaps it makes
>>> sense to read the other way - hw_fini just suspends the hardware?
>>>
>>> Thanks,
>>> Lijo
>>
>>
>> Not sure what you mean ?
>
> Now it is - suspend()-> calls hw_fini()
>
> What I meant is hw_fini() -> calls suspend() and that is read as "to
> do hw_fini() only suspend the hardware and nothing extra is needed".
>
> In short implementation stays in suspend() and hw_fini() calls suspend().


Sorry, still confused, what about amdgpu_vce_suspend being called from
vce_v4_0_suspend for example - we don't want that to be called from hw_fini.
Can you maybe show draft change of what you mean for one specific UVD or
VCE version ?

Andrey


>
> Thanks,
> Lijo
>
>>
>> Andrey
>>
>>
>>>
>>>>       if (r)
>>>>           return r;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
>>>> b/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
>>>> index 52d6de969f46..c108b8381795 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
>>>> @@ -212,6 +212,19 @@ static int uvd_v4_2_hw_fini(void *handle)
>>>>   {
>>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>>   + cancel_delayed_work_sync(&adev->uvd.idle_work);
>>>> +
>>>> +    if (RREG32(mmUVD_STATUS) != 0)
>>>> +        uvd_v4_2_stop(adev);
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static int uvd_v4_2_suspend(void *handle)
>>>> +{
>>>> +    int r;
>>>> +    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>> +
>>>>       /*
>>>>        * Proper cleanups before halting the HW engine:
>>>>        *   - cancel the delayed idle work
>>>> @@ -236,17 +249,6 @@ static int uvd_v4_2_hw_fini(void *handle)
>>>>                                  AMD_CG_STATE_GATE);
>>>>       }
>>>>   -    if (RREG32(mmUVD_STATUS) != 0)
>>>> -        uvd_v4_2_stop(adev);
>>>> -
>>>> -    return 0;
>>>> -}
>>>> -
>>>> -static int uvd_v4_2_suspend(void *handle)
>>>> -{
>>>> -    int r;
>>>> -    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>> -
>>>>       r = uvd_v4_2_hw_fini(adev);
>>>>       if (r)
>>>>           return r;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
>>>> index db6d06758e4d..563493d1f830 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
>>>> @@ -210,6 +210,19 @@ static int uvd_v5_0_hw_fini(void *handle)
>>>>   {
>>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>>   + cancel_delayed_work_sync(&adev->uvd.idle_work);
>>>> +
>>>> +    if (RREG32(mmUVD_STATUS) != 0)
>>>> +        uvd_v5_0_stop(adev);
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static int uvd_v5_0_suspend(void *handle)
>>>> +{
>>>> +    int r;
>>>> +    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>> +
>>>>       /*
>>>>        * Proper cleanups before halting the HW engine:
>>>>        *   - cancel the delayed idle work
>>>> @@ -234,17 +247,6 @@ static int uvd_v5_0_hw_fini(void *handle)
>>>>                                  AMD_CG_STATE_GATE);
>>>>       }
>>>>   -    if (RREG32(mmUVD_STATUS) != 0)
>>>> -        uvd_v5_0_stop(adev);
>>>> -
>>>> -    return 0;
>>>> -}
>>>> -
>>>> -static int uvd_v5_0_suspend(void *handle)
>>>> -{
>>>> -    int r;
>>>> -    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>> -
>>>>       r = uvd_v5_0_hw_fini(adev);
>>>>       if (r)
>>>>           return r;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
>>>> index b6e82d75561f..1fd9ca21a091 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
>>>> @@ -606,6 +606,23 @@ static int uvd_v7_0_hw_fini(void *handle)
>>>>   {
>>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>>   + cancel_delayed_work_sync(&adev->uvd.idle_work);
>>>> +
>>>> +    if (!amdgpu_sriov_vf(adev))
>>>> +        uvd_v7_0_stop(adev);
>>>> +    else {
>>>> +        /* full access mode, so don't touch any UVD register */
>>>> +        DRM_DEBUG("For SRIOV client, shouldn't do anything.\n");
>>>> +    }
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static int uvd_v7_0_suspend(void *handle)
>>>> +{
>>>> +    int r;
>>>> +    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>> +
>>>>       /*
>>>>        * Proper cleanups before halting the HW engine:
>>>>        *   - cancel the delayed idle work
>>>> @@ -630,21 +647,6 @@ static int uvd_v7_0_hw_fini(void *handle)
>>>>                                  AMD_CG_STATE_GATE);
>>>>       }
>>>>   -    if (!amdgpu_sriov_vf(adev))
>>>> -        uvd_v7_0_stop(adev);
>>>> -    else {
>>>> -        /* full access mode, so don't touch any UVD register */
>>>> -        DRM_DEBUG("For SRIOV client, shouldn't do anything.\n");
>>>> -    }
>>>> -
>>>> -    return 0;
>>>> -}
>>>> -
>>>> -static int uvd_v7_0_suspend(void *handle)
>>>> -{
>>>> -    int r;
>>>> -    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>> -
>>>>       r = uvd_v7_0_hw_fini(adev);
>>>>       if (r)
>>>>           return r;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
>>>> index 84e488f189f5..67eb01fef789 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
>>>> @@ -481,6 +481,17 @@ static int vce_v2_0_hw_fini(void *handle)
>>>>   {
>>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>>   + cancel_delayed_work_sync(&adev->vce.idle_work);
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static int vce_v2_0_suspend(void *handle)
>>>> +{
>>>> +    int r;
>>>> +    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>> +
>>>> +
>>>>       /*
>>>>        * Proper cleanups before halting the HW engine:
>>>>        *   - cancel the delayed idle work
>>>> @@ -504,14 +515,6 @@ static int vce_v2_0_hw_fini(void *handle)
>>>>                                  AMD_CG_STATE_GATE);
>>>>       }
>>>>   -    return 0;
>>>> -}
>>>> -
>>>> -static int vce_v2_0_suspend(void *handle)
>>>> -{
>>>> -    int r;
>>>> -    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>> -
>>>>       r = vce_v2_0_hw_fini(adev);
>>>>       if (r)
>>>>           return r;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
>>>> index 2a18c1e089fd..142e291983b4 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
>>>> @@ -492,6 +492,21 @@ static int vce_v3_0_hw_fini(void *handle)
>>>>       int r;
>>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>>   + cancel_delayed_work_sync(&adev->vce.idle_work);
>>>> +
>>>> +    r = vce_v3_0_wait_for_idle(handle);
>>>> +    if (r)
>>>> +        return r;
>>>> +
>>>> +    vce_v3_0_stop(adev);
>>>> +    return vce_v3_0_set_clockgating_state(adev, AMD_CG_STATE_GATE);
>>>> +}
>>>> +
>>>> +static int vce_v3_0_suspend(void *handle)
>>>> +{
>>>> +    int r;
>>>> +    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>> +
>>>>       /*
>>>>        * Proper cleanups before halting the HW engine:
>>>>        *   - cancel the delayed idle work
>>>> @@ -515,19 +530,6 @@ static int vce_v3_0_hw_fini(void *handle)
>>>>                                  AMD_CG_STATE_GATE);
>>>>       }
>>>>   -    r = vce_v3_0_wait_for_idle(handle);
>>>> -    if (r)
>>>> -        return r;
>>>> -
>>>> -    vce_v3_0_stop(adev);
>>>> -    return vce_v3_0_set_clockgating_state(adev, AMD_CG_STATE_GATE);
>>>> -}
>>>> -
>>>> -static int vce_v3_0_suspend(void *handle)
>>>> -{
>>>> -    int r;
>>>> -    struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>> -
>>>>       r = vce_v3_0_hw_fini(adev);
>>>>       if (r)
>>>>           return r;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
>>>> index 044cf9d74b85..226b79254db8 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/vce_v4_0.c
>>>> @@ -544,29 +544,8 @@ static int vce_v4_0_hw_fini(void *handle)
>>>>   {
>>>>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>>   -    /*
>>>> -     * Proper cleanups before halting the HW engine:
>>>> -     *   - cancel the delayed idle work
>>>> -     *   - enable powergating
>>>> -     *   - enable clockgating
>>>> -     *   - disable dpm
>>>> -     *
>>>> -     * TODO: to align with the VCN implementation, move the
>>>> -     * jobs for clockgating/powergating/dpm setting to
>>>> -     * ->set_powergating_state().
>>>> -     */
>>>>       cancel_delayed_work_sync(&adev->vce.idle_work);
>>>>   -    if (adev->pm.dpm_enabled) {
>>>> -        amdgpu_dpm_enable_vce(adev, false);
>>>> -    } else {
>>>> -        amdgpu_asic_set_vce_clocks(adev, 0, 0);
>>>> -        amdgpu_device_ip_set_powergating_state(adev,
>>>> AMD_IP_BLOCK_TYPE_VCE,
>>>> -                               AMD_PG_STATE_GATE);
>>>> -        amdgpu_device_ip_set_clockgating_state(adev,
>>>> AMD_IP_BLOCK_TYPE_VCE,
>>>> -                               AMD_CG_STATE_GATE);
>>>> -    }
>>>> -
>>>>       if (!amdgpu_sriov_vf(adev)) {
>>>>           /* vce_v4_0_wait_for_idle(handle); */
>>>>           vce_v4_0_stop(adev);
>>>> @@ -596,6 +575,29 @@ static int vce_v4_0_suspend(void *handle)
>>>>           drm_dev_exit(idx);
>>>>       }
>>>>   +    /*
>>>> +     * Proper cleanups before halting the HW engine:
>>>> +     *   - cancel the delayed idle work
>>>> +     *   - enable powergating
>>>> +     *   - enable clockgating
>>>> +     *   - disable dpm
>>>> +     *
>>>> +     * TODO: to align with the VCN implementation, move the
>>>> +     * jobs for clockgating/powergating/dpm setting to
>>>> +     * ->set_powergating_state().
>>>> +     */
>>>> +    cancel_delayed_work_sync(&adev->vce.idle_work);
>>>> +
>>>> +    if (adev->pm.dpm_enabled) {
>>>> +        amdgpu_dpm_enable_vce(adev, false);
>>>> +    } else {
>>>> +        amdgpu_asic_set_vce_clocks(adev, 0, 0);
>>>> +        amdgpu_device_ip_set_powergating_state(adev,
>>>> AMD_IP_BLOCK_TYPE_VCE,
>>>> +                               AMD_PG_STATE_GATE);
>>>> +        amdgpu_device_ip_set_clockgating_state(adev,
>>>> AMD_IP_BLOCK_TYPE_VCE,
>>>> +                               AMD_CG_STATE_GATE);
>>>> +    }
>>>> +
>>>>       r = vce_v4_0_hw_fini(adev);
>>>>       if (r)
>>>>           return r;
>>>>

[-- Attachment #2: Type: text/html, Size: 29898 bytes --]

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2021-09-16 16:41 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-09-15 19:41 [PATCH] drm/amdgpu: Fix crash on device remove/driver unload Andrey Grodzovsky
2021-09-16  7:26 ` Pan, Xinhui
2021-09-16  8:20 ` Lazar, Lijo
2021-09-16 15:45   ` Andrey Grodzovsky
2021-09-16 15:51     ` Lazar, Lijo
2021-09-16 16:17       ` Andrey Grodzovsky
2021-09-16 16:40         ` Lazar, Lijo

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.