dri-devel.lists.freedesktop.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] accel/ivpu: Fix DevTLB errors on suspend/resume
@ 2024-02-06 15:19 Jacek Lawrynowicz
  2024-02-07 10:24 ` [PATCH v2] accel/ivpu: Fix DevTLB errors on suspend/resume and recovery Jacek Lawrynowicz
  2024-02-12  8:02 ` [PATCH] accel/ivpu: Fix DevTLB errors on suspend/resume Jacek Lawrynowicz
  0 siblings, 2 replies; 5+ messages in thread
From: Jacek Lawrynowicz @ 2024-02-06 15:19 UTC (permalink / raw)
  To: dri-devel; +Cc: oded.gabbay, quic_jhugo, Jacek Lawrynowicz

Issue IP reset before shutdown in order to
complete all upstream requests to the SOC.
Without this DevTLB is complaining about
incomplete transactions and NPU cannot resume from
suspend.
This problem is only happening on recent IFWI
releases.

IP reset in rare corner cases can mess up PCI
configuration, so save it before the reset.
After this happens it is also impossible to
issue PLL requests and D0->D3->D0 cycle is needed
to recover the NPU. Add WP 0 request on power up,
so the PUNIT is always notified about NPU reset.

Fixes: 3f7c0634926d ("accel/ivpu/37xx: Fix hangs related to MMIO reset")
Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
---
 drivers/accel/ivpu/ivpu_hw_37xx.c | 44 ++++++++++++++++++++++---------
 drivers/accel/ivpu/ivpu_pm.c      | 12 ++++-----
 2 files changed, 38 insertions(+), 18 deletions(-)

diff --git a/drivers/accel/ivpu/ivpu_hw_37xx.c b/drivers/accel/ivpu/ivpu_hw_37xx.c
index 77accd029c4a..89af1006df55 100644
--- a/drivers/accel/ivpu/ivpu_hw_37xx.c
+++ b/drivers/accel/ivpu/ivpu_hw_37xx.c
@@ -510,16 +510,6 @@ static int ivpu_boot_pwr_domain_enable(struct ivpu_device *vdev)
 	return ret;
 }
 
-static int ivpu_boot_pwr_domain_disable(struct ivpu_device *vdev)
-{
-	ivpu_boot_dpu_active_drive(vdev, false);
-	ivpu_boot_pwr_island_isolation_drive(vdev, true);
-	ivpu_boot_pwr_island_trickle_drive(vdev, false);
-	ivpu_boot_pwr_island_drive(vdev, false);
-
-	return ivpu_boot_wait_for_pwr_island_status(vdev, 0x0);
-}
-
 static void ivpu_boot_no_snoop_enable(struct ivpu_device *vdev)
 {
 	u32 val = REGV_RD32(VPU_37XX_HOST_IF_TCU_PTW_OVERRIDES);
@@ -616,12 +606,37 @@ static int ivpu_hw_37xx_info_init(struct ivpu_device *vdev)
 	return 0;
 }
 
+static int ivpu_hw_37xx_ip_reset(struct ivpu_device *vdev)
+{
+	int ret;
+	u32 val;
+
+	if (IVPU_WA(punit_disabled))
+		return 0;
+
+	ret = REGB_POLL_FLD(VPU_37XX_BUTTRESS_VPU_IP_RESET, TRIGGER, 0, TIMEOUT_US);
+	if (ret) {
+		ivpu_err(vdev, "Timed out waiting for TRIGGER bit\n");
+		return ret;
+	}
+
+	val = REGB_RD32(VPU_37XX_BUTTRESS_VPU_IP_RESET);
+	val = REG_SET_FLD(VPU_37XX_BUTTRESS_VPU_IP_RESET, TRIGGER, val);
+	REGB_WR32(VPU_37XX_BUTTRESS_VPU_IP_RESET, val);
+
+	ret = REGB_POLL_FLD(VPU_37XX_BUTTRESS_VPU_IP_RESET, TRIGGER, 0, TIMEOUT_US);
+	if (ret)
+		ivpu_err(vdev, "Timed out waiting for RESET completion\n");
+
+	return ret;
+}
+
 static int ivpu_hw_37xx_reset(struct ivpu_device *vdev)
 {
 	int ret = 0;
 
-	if (ivpu_boot_pwr_domain_disable(vdev)) {
-		ivpu_err(vdev, "Failed to disable power domain\n");
+	if (ivpu_hw_37xx_ip_reset(vdev)) {
+		ivpu_err(vdev, "Failed to reset NPU\n");
 		ret = -EIO;
 	}
 
@@ -661,6 +676,11 @@ static int ivpu_hw_37xx_power_up(struct ivpu_device *vdev)
 {
 	int ret;
 
+	/* PLL requests may fail when powering down, so issue WP 0 here */
+	ret = ivpu_pll_disable(vdev);
+	if (ret)
+		ivpu_warn(vdev, "Failed to disable PLL: %d\n", ret);
+
 	ret = ivpu_hw_37xx_d0i3_disable(vdev);
 	if (ret)
 		ivpu_warn(vdev, "Failed to disable D0I3: %d\n", ret);
diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c
index f501f27ebafd..fcc319ee0018 100644
--- a/drivers/accel/ivpu/ivpu_pm.c
+++ b/drivers/accel/ivpu/ivpu_pm.c
@@ -58,11 +58,14 @@ static int ivpu_suspend(struct ivpu_device *vdev)
 {
 	int ret;
 
+	/* Save PCI state before powering down as it sometimes gets corrupted if NPU hangs */
+	pci_save_state(to_pci_dev(vdev->drm.dev));
+
 	ret = ivpu_shutdown(vdev);
-	if (ret) {
+	if (ret)
 		ivpu_err(vdev, "Failed to shutdown VPU: %d\n", ret);
-		return ret;
-	}
+
+	pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot);
 
 	return ret;
 }
@@ -200,9 +203,6 @@ int ivpu_pm_suspend_cb(struct device *dev)
 	ivpu_suspend(vdev);
 	ivpu_pm_prepare_warm_boot(vdev);
 
-	pci_save_state(to_pci_dev(dev));
-	pci_set_power_state(to_pci_dev(dev), PCI_D3hot);
-
 	ivpu_dbg(vdev, PM, "Suspend done.\n");
 
 	return 0;
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH v2] accel/ivpu: Fix DevTLB errors on suspend/resume and recovery
  2024-02-06 15:19 [PATCH] accel/ivpu: Fix DevTLB errors on suspend/resume Jacek Lawrynowicz
@ 2024-02-07 10:24 ` Jacek Lawrynowicz
  2024-02-09 15:39   ` Jeffrey Hugo
  2024-02-12  8:02 ` [PATCH] accel/ivpu: Fix DevTLB errors on suspend/resume Jacek Lawrynowicz
  1 sibling, 1 reply; 5+ messages in thread
From: Jacek Lawrynowicz @ 2024-02-07 10:24 UTC (permalink / raw)
  To: dri-devel; +Cc: oded.gabbay, quic_jhugo, Jacek Lawrynowicz

Issue IP reset before shutdown in order to
complete all upstream requests to the SOC.
Without this DevTLB is complaining about
incomplete transactions and NPU cannot resume from
suspend.
This problem is only happening on recent IFWI
releases.

IP reset in rare corner cases can mess up PCI
configuration, so save it before the reset.
After this happens it is also impossible to
issue PLL requests and D0->D3->D0 cycle is needed
to recover the NPU. Add WP 0 request on power up,
so the PUNIT is always notified about NPU reset.

Use D0/D3 cycle for recovery as it can recover
from failed IP reset and FLR cannot.

Fixes: 3f7c0634926d ("accel/ivpu/37xx: Fix hangs related to MMIO reset")
Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
---
 drivers/accel/ivpu/ivpu_hw_37xx.c | 44 ++++++++++++++++++++++---------
 drivers/accel/ivpu/ivpu_pm.c      | 39 +++++++++++++++------------
 2 files changed, 54 insertions(+), 29 deletions(-)

diff --git a/drivers/accel/ivpu/ivpu_hw_37xx.c b/drivers/accel/ivpu/ivpu_hw_37xx.c
index 77accd029c4a..89af1006df55 100644
--- a/drivers/accel/ivpu/ivpu_hw_37xx.c
+++ b/drivers/accel/ivpu/ivpu_hw_37xx.c
@@ -510,16 +510,6 @@ static int ivpu_boot_pwr_domain_enable(struct ivpu_device *vdev)
 	return ret;
 }
 
-static int ivpu_boot_pwr_domain_disable(struct ivpu_device *vdev)
-{
-	ivpu_boot_dpu_active_drive(vdev, false);
-	ivpu_boot_pwr_island_isolation_drive(vdev, true);
-	ivpu_boot_pwr_island_trickle_drive(vdev, false);
-	ivpu_boot_pwr_island_drive(vdev, false);
-
-	return ivpu_boot_wait_for_pwr_island_status(vdev, 0x0);
-}
-
 static void ivpu_boot_no_snoop_enable(struct ivpu_device *vdev)
 {
 	u32 val = REGV_RD32(VPU_37XX_HOST_IF_TCU_PTW_OVERRIDES);
@@ -616,12 +606,37 @@ static int ivpu_hw_37xx_info_init(struct ivpu_device *vdev)
 	return 0;
 }
 
+static int ivpu_hw_37xx_ip_reset(struct ivpu_device *vdev)
+{
+	int ret;
+	u32 val;
+
+	if (IVPU_WA(punit_disabled))
+		return 0;
+
+	ret = REGB_POLL_FLD(VPU_37XX_BUTTRESS_VPU_IP_RESET, TRIGGER, 0, TIMEOUT_US);
+	if (ret) {
+		ivpu_err(vdev, "Timed out waiting for TRIGGER bit\n");
+		return ret;
+	}
+
+	val = REGB_RD32(VPU_37XX_BUTTRESS_VPU_IP_RESET);
+	val = REG_SET_FLD(VPU_37XX_BUTTRESS_VPU_IP_RESET, TRIGGER, val);
+	REGB_WR32(VPU_37XX_BUTTRESS_VPU_IP_RESET, val);
+
+	ret = REGB_POLL_FLD(VPU_37XX_BUTTRESS_VPU_IP_RESET, TRIGGER, 0, TIMEOUT_US);
+	if (ret)
+		ivpu_err(vdev, "Timed out waiting for RESET completion\n");
+
+	return ret;
+}
+
 static int ivpu_hw_37xx_reset(struct ivpu_device *vdev)
 {
 	int ret = 0;
 
-	if (ivpu_boot_pwr_domain_disable(vdev)) {
-		ivpu_err(vdev, "Failed to disable power domain\n");
+	if (ivpu_hw_37xx_ip_reset(vdev)) {
+		ivpu_err(vdev, "Failed to reset NPU\n");
 		ret = -EIO;
 	}
 
@@ -661,6 +676,11 @@ static int ivpu_hw_37xx_power_up(struct ivpu_device *vdev)
 {
 	int ret;
 
+	/* PLL requests may fail when powering down, so issue WP 0 here */
+	ret = ivpu_pll_disable(vdev);
+	if (ret)
+		ivpu_warn(vdev, "Failed to disable PLL: %d\n", ret);
+
 	ret = ivpu_hw_37xx_d0i3_disable(vdev);
 	if (ret)
 		ivpu_warn(vdev, "Failed to disable D0I3: %d\n", ret);
diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c
index f501f27ebafd..5f73854234ba 100644
--- a/drivers/accel/ivpu/ivpu_pm.c
+++ b/drivers/accel/ivpu/ivpu_pm.c
@@ -58,11 +58,14 @@ static int ivpu_suspend(struct ivpu_device *vdev)
 {
 	int ret;
 
+	/* Save PCI state before powering down as it sometimes gets corrupted if NPU hangs */
+	pci_save_state(to_pci_dev(vdev->drm.dev));
+
 	ret = ivpu_shutdown(vdev);
-	if (ret) {
+	if (ret)
 		ivpu_err(vdev, "Failed to shutdown VPU: %d\n", ret);
-		return ret;
-	}
+
+	pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot);
 
 	return ret;
 }
@@ -71,6 +74,9 @@ static int ivpu_resume(struct ivpu_device *vdev)
 {
 	int ret;
 
+	pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D0);
+	pci_restore_state(to_pci_dev(vdev->drm.dev));
+
 retry:
 	ret = ivpu_hw_power_up(vdev);
 	if (ret) {
@@ -120,15 +126,20 @@ static void ivpu_pm_recovery_work(struct work_struct *work)
 
 	ivpu_fw_log_dump(vdev);
 
-retry:
-	ret = pci_try_reset_function(to_pci_dev(vdev->drm.dev));
-	if (ret == -EAGAIN && !drm_dev_is_unplugged(&vdev->drm)) {
-		cond_resched();
-		goto retry;
-	}
+	atomic_inc(&vdev->pm->reset_counter);
+	atomic_set(&vdev->pm->reset_pending, 1);
+	down_write(&vdev->pm->reset_lock);
+
+	ivpu_suspend(vdev);
+	ivpu_pm_prepare_cold_boot(vdev);
+	ivpu_jobs_abort_all(vdev);
+
+	ret = ivpu_resume(vdev);
+	if (ret)
+		ivpu_err(vdev, "Failed to resume NPU: %d\n", ret);
 
-	if (ret && ret != -EAGAIN)
-		ivpu_err(vdev, "Failed to reset VPU: %d\n", ret);
+	up_write(&vdev->pm->reset_lock);
+	atomic_set(&vdev->pm->reset_pending, 0);
 
 	kobject_uevent_env(&vdev->drm.dev->kobj, KOBJ_CHANGE, evt);
 	pm_runtime_mark_last_busy(vdev->drm.dev);
@@ -200,9 +211,6 @@ int ivpu_pm_suspend_cb(struct device *dev)
 	ivpu_suspend(vdev);
 	ivpu_pm_prepare_warm_boot(vdev);
 
-	pci_save_state(to_pci_dev(dev));
-	pci_set_power_state(to_pci_dev(dev), PCI_D3hot);
-
 	ivpu_dbg(vdev, PM, "Suspend done.\n");
 
 	return 0;
@@ -216,9 +224,6 @@ int ivpu_pm_resume_cb(struct device *dev)
 
 	ivpu_dbg(vdev, PM, "Resume..\n");
 
-	pci_set_power_state(to_pci_dev(dev), PCI_D0);
-	pci_restore_state(to_pci_dev(dev));
-
 	ret = ivpu_resume(vdev);
 	if (ret)
 		ivpu_err(vdev, "Failed to resume: %d\n", ret);
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH v2] accel/ivpu: Fix DevTLB errors on suspend/resume and recovery
  2024-02-07 10:24 ` [PATCH v2] accel/ivpu: Fix DevTLB errors on suspend/resume and recovery Jacek Lawrynowicz
@ 2024-02-09 15:39   ` Jeffrey Hugo
  2024-02-12  7:59     ` Jacek Lawrynowicz
  0 siblings, 1 reply; 5+ messages in thread
From: Jeffrey Hugo @ 2024-02-09 15:39 UTC (permalink / raw)
  To: Jacek Lawrynowicz, dri-devel; +Cc: oded.gabbay

On 2/7/2024 3:24 AM, Jacek Lawrynowicz wrote:
> Issue IP reset before shutdown in order to
> complete all upstream requests to the SOC.
> Without this DevTLB is complaining about
> incomplete transactions and NPU cannot resume from
> suspend.
> This problem is only happening on recent IFWI
> releases.
> 
> IP reset in rare corner cases can mess up PCI
> configuration, so save it before the reset.
> After this happens it is also impossible to
> issue PLL requests and D0->D3->D0 cycle is needed
> to recover the NPU. Add WP 0 request on power up,
> so the PUNIT is always notified about NPU reset.
> 
> Use D0/D3 cycle for recovery as it can recover
> from failed IP reset and FLR cannot.
> 
> Fixes: 3f7c0634926d ("accel/ivpu/37xx: Fix hangs related to MMIO reset")
> Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
> ---

Reviewed-by: Jeffrey Hugo <quic_jhugo@quicinc.com>

Nit below

>   drivers/accel/ivpu/ivpu_hw_37xx.c | 44 ++++++++++++++++++++++---------
>   drivers/accel/ivpu/ivpu_pm.c      | 39 +++++++++++++++------------
>   2 files changed, 54 insertions(+), 29 deletions(-)
> 
> diff --git a/drivers/accel/ivpu/ivpu_hw_37xx.c b/drivers/accel/ivpu/ivpu_hw_37xx.c
> index 77accd029c4a..89af1006df55 100644
> --- a/drivers/accel/ivpu/ivpu_hw_37xx.c
> +++ b/drivers/accel/ivpu/ivpu_hw_37xx.c
> @@ -510,16 +510,6 @@ static int ivpu_boot_pwr_domain_enable(struct ivpu_device *vdev)
>   	return ret;
>   }
>   
> -static int ivpu_boot_pwr_domain_disable(struct ivpu_device *vdev)
> -{
> -	ivpu_boot_dpu_active_drive(vdev, false);
> -	ivpu_boot_pwr_island_isolation_drive(vdev, true);
> -	ivpu_boot_pwr_island_trickle_drive(vdev, false);
> -	ivpu_boot_pwr_island_drive(vdev, false);
> -
> -	return ivpu_boot_wait_for_pwr_island_status(vdev, 0x0);
> -}
> -
>   static void ivpu_boot_no_snoop_enable(struct ivpu_device *vdev)
>   {
>   	u32 val = REGV_RD32(VPU_37XX_HOST_IF_TCU_PTW_OVERRIDES);
> @@ -616,12 +606,37 @@ static int ivpu_hw_37xx_info_init(struct ivpu_device *vdev)
>   	return 0;
>   }
>   
> +static int ivpu_hw_37xx_ip_reset(struct ivpu_device *vdev)
> +{
> +	int ret;
> +	u32 val;
> +
> +	if (IVPU_WA(punit_disabled))
> +		return 0;
> +
> +	ret = REGB_POLL_FLD(VPU_37XX_BUTTRESS_VPU_IP_RESET, TRIGGER, 0, TIMEOUT_US);
> +	if (ret) {
> +		ivpu_err(vdev, "Timed out waiting for TRIGGER bit\n");
> +		return ret;
> +	}
> +
> +	val = REGB_RD32(VPU_37XX_BUTTRESS_VPU_IP_RESET);
> +	val = REG_SET_FLD(VPU_37XX_BUTTRESS_VPU_IP_RESET, TRIGGER, val);
> +	REGB_WR32(VPU_37XX_BUTTRESS_VPU_IP_RESET, val);
> +
> +	ret = REGB_POLL_FLD(VPU_37XX_BUTTRESS_VPU_IP_RESET, TRIGGER, 0, TIMEOUT_US);
> +	if (ret)
> +		ivpu_err(vdev, "Timed out waiting for RESET completion\n");
> +
> +	return ret;
> +}
> +
>   static int ivpu_hw_37xx_reset(struct ivpu_device *vdev)
>   {
>   	int ret = 0;
>   
> -	if (ivpu_boot_pwr_domain_disable(vdev)) {
> -		ivpu_err(vdev, "Failed to disable power domain\n");
> +	if (ivpu_hw_37xx_ip_reset(vdev)) {
> +		ivpu_err(vdev, "Failed to reset NPU\n");
>   		ret = -EIO;
>   	}
>   
> @@ -661,6 +676,11 @@ static int ivpu_hw_37xx_power_up(struct ivpu_device *vdev)
>   {
>   	int ret;
>   
> +	/* PLL requests may fail when powering down, so issue WP 0 here */
> +	ret = ivpu_pll_disable(vdev);
> +	if (ret)
> +		ivpu_warn(vdev, "Failed to disable PLL: %d\n", ret);
> +
>   	ret = ivpu_hw_37xx_d0i3_disable(vdev);
>   	if (ret)
>   		ivpu_warn(vdev, "Failed to disable D0I3: %d\n", ret);
> diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c
> index f501f27ebafd..5f73854234ba 100644
> --- a/drivers/accel/ivpu/ivpu_pm.c
> +++ b/drivers/accel/ivpu/ivpu_pm.c
> @@ -58,11 +58,14 @@ static int ivpu_suspend(struct ivpu_device *vdev)
>   {
>   	int ret;
>   
> +	/* Save PCI state before powering down as it sometimes gets corrupted if NPU hangs */
> +	pci_save_state(to_pci_dev(vdev->drm.dev));
> +
>   	ret = ivpu_shutdown(vdev);
> -	if (ret) {
> +	if (ret)
>   		ivpu_err(vdev, "Failed to shutdown VPU: %d\n", ret);

In the two logs you add in this change, the log has "NPU".  Here, there 
is "VPU".  As far as I understand VPU is the old term and NPU is the new 
term therefore it seems like all the logs should be updated to use the 
new term for consistency.  Outside of scope for this change though.

> -		return ret;
> -	}
> +
> +	pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot);
>   
>   	return ret;
>   }
> @@ -71,6 +74,9 @@ static int ivpu_resume(struct ivpu_device *vdev)
>   {
>   	int ret;
>   
> +	pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D0);
> +	pci_restore_state(to_pci_dev(vdev->drm.dev));
> +
>   retry:
>   	ret = ivpu_hw_power_up(vdev);
>   	if (ret) {
> @@ -120,15 +126,20 @@ static void ivpu_pm_recovery_work(struct work_struct *work)
>   
>   	ivpu_fw_log_dump(vdev);
>   
> -retry:
> -	ret = pci_try_reset_function(to_pci_dev(vdev->drm.dev));
> -	if (ret == -EAGAIN && !drm_dev_is_unplugged(&vdev->drm)) {
> -		cond_resched();
> -		goto retry;
> -	}
> +	atomic_inc(&vdev->pm->reset_counter);
> +	atomic_set(&vdev->pm->reset_pending, 1);
> +	down_write(&vdev->pm->reset_lock);
> +
> +	ivpu_suspend(vdev);
> +	ivpu_pm_prepare_cold_boot(vdev);
> +	ivpu_jobs_abort_all(vdev);
> +
> +	ret = ivpu_resume(vdev);
> +	if (ret)
> +		ivpu_err(vdev, "Failed to resume NPU: %d\n", ret);
>   
> -	if (ret && ret != -EAGAIN)
> -		ivpu_err(vdev, "Failed to reset VPU: %d\n", ret);
> +	up_write(&vdev->pm->reset_lock);
> +	atomic_set(&vdev->pm->reset_pending, 0);
>   
>   	kobject_uevent_env(&vdev->drm.dev->kobj, KOBJ_CHANGE, evt);
>   	pm_runtime_mark_last_busy(vdev->drm.dev);
> @@ -200,9 +211,6 @@ int ivpu_pm_suspend_cb(struct device *dev)
>   	ivpu_suspend(vdev);
>   	ivpu_pm_prepare_warm_boot(vdev);
>   
> -	pci_save_state(to_pci_dev(dev));
> -	pci_set_power_state(to_pci_dev(dev), PCI_D3hot);
> -
>   	ivpu_dbg(vdev, PM, "Suspend done.\n");
>   
>   	return 0;
> @@ -216,9 +224,6 @@ int ivpu_pm_resume_cb(struct device *dev)
>   
>   	ivpu_dbg(vdev, PM, "Resume..\n");
>   
> -	pci_set_power_state(to_pci_dev(dev), PCI_D0);
> -	pci_restore_state(to_pci_dev(dev));
> -
>   	ret = ivpu_resume(vdev);
>   	if (ret)
>   		ivpu_err(vdev, "Failed to resume: %d\n", ret);


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v2] accel/ivpu: Fix DevTLB errors on suspend/resume and recovery
  2024-02-09 15:39   ` Jeffrey Hugo
@ 2024-02-12  7:59     ` Jacek Lawrynowicz
  0 siblings, 0 replies; 5+ messages in thread
From: Jacek Lawrynowicz @ 2024-02-12  7:59 UTC (permalink / raw)
  To: Jeffrey Hugo, dri-devel; +Cc: oded.gabbay

Hi,

On 09.02.2024 16:39, Jeffrey Hugo wrote:
> On 2/7/2024 3:24 AM, Jacek Lawrynowicz wrote:
>> Issue IP reset before shutdown in order to
>> complete all upstream requests to the SOC.
>> Without this DevTLB is complaining about
>> incomplete transactions and NPU cannot resume from
>> suspend.
>> This problem is only happening on recent IFWI
>> releases.
>>
>> IP reset in rare corner cases can mess up PCI
>> configuration, so save it before the reset.
>> After this happens it is also impossible to
>> issue PLL requests and D0->D3->D0 cycle is needed
>> to recover the NPU. Add WP 0 request on power up,
>> so the PUNIT is always notified about NPU reset.
>>
>> Use D0/D3 cycle for recovery as it can recover
>> from failed IP reset and FLR cannot.
>>
>> Fixes: 3f7c0634926d ("accel/ivpu/37xx: Fix hangs related to MMIO reset")
>> Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
>> ---
> 
> Reviewed-by: Jeffrey Hugo <quic_jhugo@quicinc.com>
> 
> Nit below
> 

>>       ret = ivpu_shutdown(vdev);
>> -    if (ret) {
>> +    if (ret)
>>           ivpu_err(vdev, "Failed to shutdown VPU: %d\n", ret);
> 
> In the two logs you add in this change, the log has "NPU".  Here, there is "VPU".  As far as I understand VPU is the old term and NPU is the new term therefore it seems like all the logs should be updated to use the new term for consistency.  Outside of scope for this change though.

Ok, I will fix this in next patchset.

Thanks,
Jacek

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] accel/ivpu: Fix DevTLB errors on suspend/resume
  2024-02-06 15:19 [PATCH] accel/ivpu: Fix DevTLB errors on suspend/resume Jacek Lawrynowicz
  2024-02-07 10:24 ` [PATCH v2] accel/ivpu: Fix DevTLB errors on suspend/resume and recovery Jacek Lawrynowicz
@ 2024-02-12  8:02 ` Jacek Lawrynowicz
  1 sibling, 0 replies; 5+ messages in thread
From: Jacek Lawrynowicz @ 2024-02-12  8:02 UTC (permalink / raw)
  To: dri-devel; +Cc: oded.gabbay, quic_jhugo

Applied to drm-misc-fixes

On 06.02.2024 16:19, Jacek Lawrynowicz wrote:
> Issue IP reset before shutdown in order to
> complete all upstream requests to the SOC.
> Without this DevTLB is complaining about
> incomplete transactions and NPU cannot resume from
> suspend.
> This problem is only happening on recent IFWI
> releases.
> 
> IP reset in rare corner cases can mess up PCI
> configuration, so save it before the reset.
> After this happens it is also impossible to
> issue PLL requests and D0->D3->D0 cycle is needed
> to recover the NPU. Add WP 0 request on power up,
> so the PUNIT is always notified about NPU reset.
> 
> Fixes: 3f7c0634926d ("accel/ivpu/37xx: Fix hangs related to MMIO reset")
> Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
> ---
>  drivers/accel/ivpu/ivpu_hw_37xx.c | 44 ++++++++++++++++++++++---------
>  drivers/accel/ivpu/ivpu_pm.c      | 12 ++++-----
>  2 files changed, 38 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/accel/ivpu/ivpu_hw_37xx.c b/drivers/accel/ivpu/ivpu_hw_37xx.c
> index 77accd029c4a..89af1006df55 100644
> --- a/drivers/accel/ivpu/ivpu_hw_37xx.c
> +++ b/drivers/accel/ivpu/ivpu_hw_37xx.c
> @@ -510,16 +510,6 @@ static int ivpu_boot_pwr_domain_enable(struct ivpu_device *vdev)
>  	return ret;
>  }
>  
> -static int ivpu_boot_pwr_domain_disable(struct ivpu_device *vdev)
> -{
> -	ivpu_boot_dpu_active_drive(vdev, false);
> -	ivpu_boot_pwr_island_isolation_drive(vdev, true);
> -	ivpu_boot_pwr_island_trickle_drive(vdev, false);
> -	ivpu_boot_pwr_island_drive(vdev, false);
> -
> -	return ivpu_boot_wait_for_pwr_island_status(vdev, 0x0);
> -}
> -
>  static void ivpu_boot_no_snoop_enable(struct ivpu_device *vdev)
>  {
>  	u32 val = REGV_RD32(VPU_37XX_HOST_IF_TCU_PTW_OVERRIDES);
> @@ -616,12 +606,37 @@ static int ivpu_hw_37xx_info_init(struct ivpu_device *vdev)
>  	return 0;
>  }
>  
> +static int ivpu_hw_37xx_ip_reset(struct ivpu_device *vdev)
> +{
> +	int ret;
> +	u32 val;
> +
> +	if (IVPU_WA(punit_disabled))
> +		return 0;
> +
> +	ret = REGB_POLL_FLD(VPU_37XX_BUTTRESS_VPU_IP_RESET, TRIGGER, 0, TIMEOUT_US);
> +	if (ret) {
> +		ivpu_err(vdev, "Timed out waiting for TRIGGER bit\n");
> +		return ret;
> +	}
> +
> +	val = REGB_RD32(VPU_37XX_BUTTRESS_VPU_IP_RESET);
> +	val = REG_SET_FLD(VPU_37XX_BUTTRESS_VPU_IP_RESET, TRIGGER, val);
> +	REGB_WR32(VPU_37XX_BUTTRESS_VPU_IP_RESET, val);
> +
> +	ret = REGB_POLL_FLD(VPU_37XX_BUTTRESS_VPU_IP_RESET, TRIGGER, 0, TIMEOUT_US);
> +	if (ret)
> +		ivpu_err(vdev, "Timed out waiting for RESET completion\n");
> +
> +	return ret;
> +}
> +
>  static int ivpu_hw_37xx_reset(struct ivpu_device *vdev)
>  {
>  	int ret = 0;
>  
> -	if (ivpu_boot_pwr_domain_disable(vdev)) {
> -		ivpu_err(vdev, "Failed to disable power domain\n");
> +	if (ivpu_hw_37xx_ip_reset(vdev)) {
> +		ivpu_err(vdev, "Failed to reset NPU\n");
>  		ret = -EIO;
>  	}
>  
> @@ -661,6 +676,11 @@ static int ivpu_hw_37xx_power_up(struct ivpu_device *vdev)
>  {
>  	int ret;
>  
> +	/* PLL requests may fail when powering down, so issue WP 0 here */
> +	ret = ivpu_pll_disable(vdev);
> +	if (ret)
> +		ivpu_warn(vdev, "Failed to disable PLL: %d\n", ret);
> +
>  	ret = ivpu_hw_37xx_d0i3_disable(vdev);
>  	if (ret)
>  		ivpu_warn(vdev, "Failed to disable D0I3: %d\n", ret);
> diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c
> index f501f27ebafd..fcc319ee0018 100644
> --- a/drivers/accel/ivpu/ivpu_pm.c
> +++ b/drivers/accel/ivpu/ivpu_pm.c
> @@ -58,11 +58,14 @@ static int ivpu_suspend(struct ivpu_device *vdev)
>  {
>  	int ret;
>  
> +	/* Save PCI state before powering down as it sometimes gets corrupted if NPU hangs */
> +	pci_save_state(to_pci_dev(vdev->drm.dev));
> +
>  	ret = ivpu_shutdown(vdev);
> -	if (ret) {
> +	if (ret)
>  		ivpu_err(vdev, "Failed to shutdown VPU: %d\n", ret);
> -		return ret;
> -	}
> +
> +	pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot);
>  
>  	return ret;
>  }
> @@ -200,9 +203,6 @@ int ivpu_pm_suspend_cb(struct device *dev)
>  	ivpu_suspend(vdev);
>  	ivpu_pm_prepare_warm_boot(vdev);
>  
> -	pci_save_state(to_pci_dev(dev));
> -	pci_set_power_state(to_pci_dev(dev), PCI_D3hot);
> -
>  	ivpu_dbg(vdev, PM, "Suspend done.\n");
>  
>  	return 0;

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2024-02-12  8:04 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-02-06 15:19 [PATCH] accel/ivpu: Fix DevTLB errors on suspend/resume Jacek Lawrynowicz
2024-02-07 10:24 ` [PATCH v2] accel/ivpu: Fix DevTLB errors on suspend/resume and recovery Jacek Lawrynowicz
2024-02-09 15:39   ` Jeffrey Hugo
2024-02-12  7:59     ` Jacek Lawrynowicz
2024-02-12  8:02 ` [PATCH] accel/ivpu: Fix DevTLB errors on suspend/resume Jacek Lawrynowicz

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).