linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v3 1/9] scsi: ufs: Differentiate status between hba pm ops and wl pm ops
       [not found] <1623300218-9454-1-git-send-email-cang@codeaurora.org>
@ 2021-06-10  4:43 ` Can Guo
  2021-06-10 11:15   ` Adrian Hunter
                     ` (2 more replies)
  2021-06-10  4:43 ` [PATCH v3 2/9] scsi: ufs: Update the return value of supplier " Can Guo
                   ` (7 subsequent siblings)
  8 siblings, 3 replies; 43+ messages in thread
From: Can Guo @ 2021-06-10  4:43 UTC (permalink / raw)
  To: asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team, cang
  Cc: Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	Adrian Hunter, Kiwoong Kim, Satya Tangirala, Bart Van Assche,
	open list

Put the pm_op_in_progress and is_sys_suspended flags back to the ufshcd hba
pm ops, and add two new flags, namely wl_pm_op_in_progress and
is_wl_sys_suspended, to track the UFS device W-LU pm ops. This helps us
differentiate the status of hba and wl pm ops when troubleshooting.

Signed-off-by: Can Guo <cang@codeaurora.org>
---
 drivers/scsi/ufs/ufshcd.c | 42 ++++++++++++++++++++++++++++--------------
 drivers/scsi/ufs/ufshcd.h |  4 +++-
 2 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index 25fe18a..47b2a9a 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -549,7 +549,9 @@ static void ufshcd_print_host_state(struct ufs_hba *hba)
 		hba->saved_err, hba->saved_uic_err);
 	dev_err(hba->dev, "Device power mode=%d, UIC link state=%d\n",
 		hba->curr_dev_pwr_mode, hba->uic_link_state);
-	dev_err(hba->dev, "PM in progress=%d, sys. suspended=%d\n",
+	dev_err(hba->dev, "wl_pm_op_in_progress=%d, is_wl_sys_suspended=%d\n",
+		hba->wl_pm_op_in_progress, hba->is_wl_sys_suspended);
+	dev_err(hba->dev, "pm_op_in_progress=%d, is_sys_suspended=%d\n",
 		hba->pm_op_in_progress, hba->is_sys_suspended);
 	dev_err(hba->dev, "Auto BKOPS=%d, Host self-block=%d\n",
 		hba->auto_bkops_enabled, hba->host->host_self_blocked);
@@ -1999,7 +2001,7 @@ static void ufshcd_clk_scaling_start_busy(struct ufs_hba *hba)
 	if (!hba->clk_scaling.active_reqs++)
 		queue_resume_work = true;
 
-	if (!hba->clk_scaling.is_enabled || hba->pm_op_in_progress) {
+	if (!hba->clk_scaling.is_enabled || hba->wl_pm_op_in_progress) {
 		spin_unlock_irqrestore(hba->host->host_lock, flags);
 		return;
 	}
@@ -2734,7 +2736,7 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
 		 * err handler blocked for too long. So, just fail the scsi cmd
 		 * sent from PM ops, err handler can recover PM error anyways.
 		 */
-		if (hba->pm_op_in_progress) {
+		if (hba->wl_pm_op_in_progress) {
 			hba->force_reset = true;
 			set_host_byte(cmd, DID_BAD_TARGET);
 			cmd->scsi_done(cmd);
@@ -2767,7 +2769,7 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
 		(hba->clk_gating.state != CLKS_ON));
 
 	if (unlikely(test_bit(tag, &hba->outstanding_reqs))) {
-		if (hba->pm_op_in_progress)
+		if (hba->wl_pm_op_in_progress)
 			set_host_byte(cmd, DID_BAD_TARGET);
 		else
 			err = SCSI_MLQUEUE_HOST_BUSY;
@@ -5116,7 +5118,7 @@ ufshcd_transfer_rsp_status(struct ufs_hba *hba, struct ufshcd_lrb *lrbp)
 			 * solution could be to abort the system suspend if
 			 * UFS device needs urgent BKOPs.
 			 */
-			if (!hba->pm_op_in_progress &&
+			if (!hba->wl_pm_op_in_progress &&
 			    !ufshcd_eh_in_progress(hba) &&
 			    ufshcd_is_exception_event(lrbp->ucd_rsp_ptr))
 				/* Flushed in suspend */
@@ -5916,7 +5918,7 @@ static void ufshcd_err_handling_prepare(struct ufs_hba *hba)
 {
 	ufshcd_rpm_get_sync(hba);
 	if (pm_runtime_status_suspended(&hba->sdev_ufs_device->sdev_gendev) ||
-	    hba->is_sys_suspended) {
+	    hba->is_wl_sys_suspended) {
 		enum ufs_pm_op pm_op;
 
 		/*
@@ -5933,7 +5935,7 @@ static void ufshcd_err_handling_prepare(struct ufs_hba *hba)
 		if (!ufshcd_is_clkgating_allowed(hba))
 			ufshcd_setup_clocks(hba, true);
 		ufshcd_release(hba);
-		pm_op = hba->is_sys_suspended ? UFS_SYSTEM_PM : UFS_RUNTIME_PM;
+		pm_op = hba->is_wl_sys_suspended ? UFS_SYSTEM_PM : UFS_RUNTIME_PM;
 		ufshcd_vops_resume(hba, pm_op);
 	} else {
 		ufshcd_hold(hba, false);
@@ -5976,7 +5978,7 @@ static void ufshcd_recover_pm_error(struct ufs_hba *hba)
 	struct request_queue *q;
 	int ret;
 
-	hba->is_sys_suspended = false;
+	hba->is_wl_sys_suspended = false;
 	/*
 	 * Set RPM status of wlun device to RPM_ACTIVE,
 	 * this also clears its runtime error.
@@ -8784,7 +8786,7 @@ static int __ufshcd_wl_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op)
 	enum ufs_dev_pwr_mode req_dev_pwr_mode;
 	enum uic_link_state req_link_state;
 
-	hba->pm_op_in_progress = true;
+	hba->wl_pm_op_in_progress = true;
 	if (pm_op != UFS_SHUTDOWN_PM) {
 		pm_lvl = pm_op == UFS_RUNTIME_PM ?
 			 hba->rpm_lvl : hba->spm_lvl;
@@ -8919,7 +8921,7 @@ static int __ufshcd_wl_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op)
 		hba->clk_gating.is_suspended = false;
 		ufshcd_release(hba);
 	}
-	hba->pm_op_in_progress = false;
+	hba->wl_pm_op_in_progress = false;
 	return ret;
 }
 
@@ -8928,7 +8930,7 @@ static int __ufshcd_wl_resume(struct ufs_hba *hba, enum ufs_pm_op pm_op)
 	int ret;
 	enum uic_link_state old_link_state = hba->uic_link_state;
 
-	hba->pm_op_in_progress = true;
+	hba->wl_pm_op_in_progress = true;
 
 	/*
 	 * Call vendor specific resume callback. As these callbacks may access
@@ -9006,7 +9008,7 @@ static int __ufshcd_wl_resume(struct ufs_hba *hba, enum ufs_pm_op pm_op)
 		ufshcd_update_evt_hist(hba, UFS_EVT_WL_RES_ERR, (u32)ret);
 	hba->clk_gating.is_suspended = false;
 	ufshcd_release(hba);
-	hba->pm_op_in_progress = false;
+	hba->wl_pm_op_in_progress = false;
 	return ret;
 }
 
@@ -9072,7 +9074,7 @@ static int ufshcd_wl_suspend(struct device *dev)
 
 out:
 	if (!ret)
-		hba->is_sys_suspended = true;
+		hba->is_wl_sys_suspended = true;
 	trace_ufshcd_wl_suspend(dev_name(dev), ret,
 		ktime_to_us(ktime_sub(ktime_get(), start)),
 		hba->curr_dev_pwr_mode, hba->uic_link_state);
@@ -9100,7 +9102,7 @@ static int ufshcd_wl_resume(struct device *dev)
 		ktime_to_us(ktime_sub(ktime_get(), start)),
 		hba->curr_dev_pwr_mode, hba->uic_link_state);
 	if (!ret)
-		hba->is_sys_suspended = false;
+		hba->is_wl_sys_suspended = false;
 	up(&hba->host_sem);
 	return ret;
 }
@@ -9141,6 +9143,8 @@ static int ufshcd_suspend(struct ufs_hba *hba)
 
 	if (!hba->is_powered)
 		return 0;
+
+	hba->pm_op_in_progress = true;
 	/*
 	 * Disable the host irq as host controller as there won't be any
 	 * host controller transaction expected till resume.
@@ -9160,6 +9164,7 @@ static int ufshcd_suspend(struct ufs_hba *hba)
 	ufshcd_vreg_set_lpm(hba);
 	/* Put the host controller in low power mode if possible */
 	ufshcd_hba_vreg_set_lpm(hba);
+	hba->pm_op_in_progress = false;
 	return ret;
 }
 
@@ -9179,6 +9184,7 @@ static int ufshcd_resume(struct ufs_hba *hba)
 	if (!hba->is_powered)
 		return 0;
 
+	hba->pm_op_in_progress = true;
 	ufshcd_hba_vreg_set_hpm(hba);
 	ret = ufshcd_vreg_set_hpm(hba);
 	if (ret)
@@ -9198,6 +9204,7 @@ static int ufshcd_resume(struct ufs_hba *hba)
 out:
 	if (ret)
 		ufshcd_update_evt_hist(hba, UFS_EVT_RESUME_ERR, (u32)ret);
+	hba->pm_op_in_progress = false;
 	return ret;
 }
 
@@ -9222,6 +9229,10 @@ int ufshcd_system_suspend(struct ufs_hba *hba)
 	trace_ufshcd_system_suspend(dev_name(hba->dev), ret,
 		ktime_to_us(ktime_sub(ktime_get(), start)),
 		hba->curr_dev_pwr_mode, hba->uic_link_state);
+
+	if (!ret)
+		hba->is_sys_suspended = true;
+
 	return ret;
 }
 EXPORT_SYMBOL(ufshcd_system_suspend);
@@ -9248,6 +9259,9 @@ int ufshcd_system_resume(struct ufs_hba *hba)
 		ktime_to_us(ktime_sub(ktime_get(), start)),
 		hba->curr_dev_pwr_mode, hba->uic_link_state);
 
+	if (!ret)
+		hba->is_sys_suspended = false;
+
 	return ret;
 }
 EXPORT_SYMBOL(ufshcd_system_resume);
diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
index c98d540..eaebb4e 100644
--- a/drivers/scsi/ufs/ufshcd.h
+++ b/drivers/scsi/ufs/ufshcd.h
@@ -752,7 +752,8 @@ struct ufs_hba {
 	enum ufs_pm_level spm_lvl;
 	struct device_attribute rpm_lvl_attr;
 	struct device_attribute spm_lvl_attr;
-	int pm_op_in_progress;
+	bool pm_op_in_progress;
+	bool wl_pm_op_in_progress;
 
 	/* Auto-Hibernate Idle Timer register value */
 	u32 ahit;
@@ -839,6 +840,7 @@ struct ufs_hba {
 	struct devfreq *devfreq;
 	struct ufs_clk_scaling clk_scaling;
 	bool is_sys_suspended;
+	bool is_wl_sys_suspended;
 
 	enum bkops_status urgent_bkops_lvl;
 	bool is_urgent_bkops_lvl_checked;
-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.


^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [PATCH v3 2/9] scsi: ufs: Update the return value of supplier pm ops
       [not found] <1623300218-9454-1-git-send-email-cang@codeaurora.org>
  2021-06-10  4:43 ` [PATCH v3 1/9] scsi: ufs: Differentiate status between hba pm ops and wl pm ops Can Guo
@ 2021-06-10  4:43 ` Can Guo
  2021-06-10  4:43 ` [PATCH v3 3/9] scsi: ufs: Enable IRQ after enabling clocks in error handling preparation Can Guo
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 43+ messages in thread
From: Can Guo @ 2021-06-10  4:43 UTC (permalink / raw)
  To: asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team, cang
  Cc: Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	open list

rpm_get_suppliers() reports a failure only if the supplier's resume returns
a negative value. However, ufshcd_wl_resume() may return a positive error
code, e.g., when hibern8 or SSU cmd fails. Convert a positive return value
into a negative error code so that consumers are aware of any resume
failure from their supplier. Make the same change to ufshcd_wl_suspend()
just to keep symmetry.

Signed-off-by: Can Guo <cang@codeaurora.org>
---
 drivers/scsi/ufs/ufshcd.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index 47b2a9a..fed893e 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -8922,7 +8922,7 @@ static int __ufshcd_wl_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op)
 		ufshcd_release(hba);
 	}
 	hba->wl_pm_op_in_progress = false;
-	return ret;
+	return ret <= 0 ? ret : -EINVAL;
 }
 
 static int __ufshcd_wl_resume(struct ufs_hba *hba, enum ufs_pm_op pm_op)
@@ -9009,7 +9009,7 @@ static int __ufshcd_wl_resume(struct ufs_hba *hba, enum ufs_pm_op pm_op)
 	hba->clk_gating.is_suspended = false;
 	ufshcd_release(hba);
 	hba->wl_pm_op_in_progress = false;
-	return ret;
+	return ret <= 0 ? ret : -EINVAL;
 }
 
 static int ufshcd_wl_runtime_suspend(struct device *dev)
-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.


^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [PATCH v3 3/9] scsi: ufs: Enable IRQ after enabling clocks in error handling preparation
       [not found] <1623300218-9454-1-git-send-email-cang@codeaurora.org>
  2021-06-10  4:43 ` [PATCH v3 1/9] scsi: ufs: Differentiate status between hba pm ops and wl pm ops Can Guo
  2021-06-10  4:43 ` [PATCH v3 2/9] scsi: ufs: Update the return value of supplier " Can Guo
@ 2021-06-10  4:43 ` Can Guo
  2021-06-10  4:43 ` [PATCH v3 4/9] scsi: ufs: Complete the cmd before returning in queuecommand Can Guo
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 43+ messages in thread
From: Can Guo @ 2021-06-10  4:43 UTC (permalink / raw)
  To: asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team, cang
  Cc: Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	open list

In error handling preparation, enable the IRQ only after enabling clocks,
to prevent unclocked register access.

Fixes: c72e79c0ad2bd ("scsi: ufs: Recover HBA runtime PM error in error handler")
Signed-off-by: Can Guo <cang@codeaurora.org>
---
 drivers/scsi/ufs/ufshcd.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index fed893e..0c9d2ee 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -5927,13 +5927,14 @@ static void ufshcd_err_handling_prepare(struct ufs_hba *hba)
 		 * can be OFF or in LPM.
 		 */
 		ufshcd_setup_hba_vreg(hba, true);
-		ufshcd_enable_irq(hba);
 		ufshcd_setup_vreg(hba, true);
 		ufshcd_config_vreg_hpm(hba, hba->vreg_info.vccq);
 		ufshcd_config_vreg_hpm(hba, hba->vreg_info.vccq2);
 		ufshcd_hold(hba, false);
-		if (!ufshcd_is_clkgating_allowed(hba))
+		if (!ufshcd_is_clkgating_allowed(hba)) {
 			ufshcd_setup_clocks(hba, true);
+			ufshcd_enable_irq(hba);
+		}
 		ufshcd_release(hba);
 		pm_op = hba->is_wl_sys_suspended ? UFS_SYSTEM_PM : UFS_RUNTIME_PM;
 		ufshcd_vops_resume(hba, pm_op);
-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.


^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [PATCH v3 4/9] scsi: ufs: Complete the cmd before returning in queuecommand
       [not found] <1623300218-9454-1-git-send-email-cang@codeaurora.org>
                   ` (2 preceding siblings ...)
  2021-06-10  4:43 ` [PATCH v3 3/9] scsi: ufs: Enable IRQ after enabling clocks in error handling preparation Can Guo
@ 2021-06-10  4:43 ` Can Guo
  2021-06-11 20:52   ` Bart Van Assche
  2021-06-10  4:43 ` [PATCH v3 5/9] scsi: ufs: Simplify error handling preparation Can Guo
                   ` (4 subsequent siblings)
  8 siblings, 1 reply; 43+ messages in thread
From: Can Guo @ 2021-06-10  4:43 UTC (permalink / raw)
  To: asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team, cang
  Cc: Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	open list

Commit 7a7e66c65d4148fc3f23b058405bc9f102414fcb ("scsi: ufs: Fix a race
condition between ufshcd_abort() and eh_work()") forgot to complete the
cmd, which takes an occupied lrb, before returning in queuecommand. This
change adds the missing cmd->scsi_done() call.

Fixes: 7a7e66c65d414 ("scsi: ufs: Fix a race condition between ufshcd_abort() and eh_work()")
Signed-off-by: Can Guo <cang@codeaurora.org>
---
 drivers/scsi/ufs/ufshcd.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index 0c9d2ee..7dc0fda 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -2758,6 +2758,16 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
 		goto out;
 	}
 
+	if (unlikely(test_bit(tag, &hba->outstanding_reqs))) {
+		if (hba->wl_pm_op_in_progress) {
+			set_host_byte(cmd, DID_BAD_TARGET);
+			cmd->scsi_done(cmd);
+		} else {
+			err = SCSI_MLQUEUE_HOST_BUSY;
+		}
+		goto out;
+	}
+
 	hba->req_abort_count = 0;
 
 	err = ufshcd_hold(hba, true);
@@ -2768,15 +2778,6 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
 	WARN_ON(ufshcd_is_clkgating_allowed(hba) &&
 		(hba->clk_gating.state != CLKS_ON));
 
-	if (unlikely(test_bit(tag, &hba->outstanding_reqs))) {
-		if (hba->wl_pm_op_in_progress)
-			set_host_byte(cmd, DID_BAD_TARGET);
-		else
-			err = SCSI_MLQUEUE_HOST_BUSY;
-		ufshcd_release(hba);
-		goto out;
-	}
-
 	lrbp = &hba->lrb[tag];
 	WARN_ON(lrbp->cmd);
 	lrbp->cmd = cmd;
-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.


^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [PATCH v3 5/9] scsi: ufs: Simplify error handling preparation
       [not found] <1623300218-9454-1-git-send-email-cang@codeaurora.org>
                   ` (3 preceding siblings ...)
  2021-06-10  4:43 ` [PATCH v3 4/9] scsi: ufs: Complete the cmd before returning in queuecommand Can Guo
@ 2021-06-10  4:43 ` Can Guo
  2021-06-10 12:30   ` Adrian Hunter
  2021-06-10  4:43 ` [PATCH v3 6/9] scsi: ufs: Update ufshcd_recover_pm_error() Can Guo
                   ` (3 subsequent siblings)
  8 siblings, 1 reply; 43+ messages in thread
From: Can Guo @ 2021-06-10  4:43 UTC (permalink / raw)
  To: asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team, cang
  Cc: Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	open list

Commit cb7e6f05fce67c965194ac04467e1ba7bc70b069 ("scsi: ufs: core: Enable
power management for wlun") moved UFS operations out of ufshcd_resume(), so
in error handling preparation, if the ufshcd hba has failed to resume,
there is no point in re-enabling IRQ/clk/pwr.

Signed-off-by: Can Guo <cang@codeaurora.org>
---
 drivers/scsi/ufs/ufshcd.c | 58 +++++++++++++++++++++++++----------------------
 1 file changed, 31 insertions(+), 27 deletions(-)

diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index 7dc0fda..0afad6b 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -2727,8 +2727,8 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
 		break;
 	case UFSHCD_STATE_EH_SCHEDULED_FATAL:
 		/*
-		 * pm_runtime_get_sync() is used at error handling preparation
-		 * stage. If a scsi cmd, e.g. the SSU cmd, is sent from hba's
+		 * ufshcd_rpm_get_sync() is used at error handling preparation
+		 * stage. If a scsi cmd, e.g., the SSU cmd, is sent from the
 		 * PM ops, it can never be finished if we let SCSI layer keep
 		 * retrying it, which gets err handler stuck forever. Neither
 		 * can we let the scsi cmd pass through, because UFS is in bad
@@ -5915,29 +5915,26 @@ static void ufshcd_clk_scaling_suspend(struct ufs_hba *hba, bool suspend)
 	}
 }
 
-static void ufshcd_err_handling_prepare(struct ufs_hba *hba)
+static int ufshcd_err_handling_prepare(struct ufs_hba *hba)
 {
+	/*
+	 * Exclusively call pm_runtime_get_sync(hba->dev) once, in case
+	 * following ufshcd_rpm_get_sync() fails.
+	 */
+	pm_runtime_get_sync(hba->dev);
+	/* End of the world. */
+	if (pm_runtime_suspended(hba->dev)) {
+		pm_runtime_put(hba->dev);
+		return -EINVAL;
+	}
+
+	ufshcd_set_eh_in_progress(hba);
 	ufshcd_rpm_get_sync(hba);
-	if (pm_runtime_status_suspended(&hba->sdev_ufs_device->sdev_gendev) ||
+	if (pm_runtime_suspended(&hba->sdev_ufs_device->sdev_gendev) ||
 	    hba->is_wl_sys_suspended) {
-		enum ufs_pm_op pm_op;
+		enum ufs_pm_op pm_op = hba->is_wl_sys_suspended ?
+				       UFS_SYSTEM_PM : UFS_RUNTIME_PM;
 
-		/*
-		 * Don't assume anything of resume, if
-		 * resume fails, irq and clocks can be OFF, and powers
-		 * can be OFF or in LPM.
-		 */
-		ufshcd_setup_hba_vreg(hba, true);
-		ufshcd_setup_vreg(hba, true);
-		ufshcd_config_vreg_hpm(hba, hba->vreg_info.vccq);
-		ufshcd_config_vreg_hpm(hba, hba->vreg_info.vccq2);
-		ufshcd_hold(hba, false);
-		if (!ufshcd_is_clkgating_allowed(hba)) {
-			ufshcd_setup_clocks(hba, true);
-			ufshcd_enable_irq(hba);
-		}
-		ufshcd_release(hba);
-		pm_op = hba->is_wl_sys_suspended ? UFS_SYSTEM_PM : UFS_RUNTIME_PM;
 		ufshcd_vops_resume(hba, pm_op);
 	} else {
 		ufshcd_hold(hba, false);
@@ -5951,22 +5948,25 @@ static void ufshcd_err_handling_prepare(struct ufs_hba *hba)
 	down_write(&hba->clk_scaling_lock);
 	up_write(&hba->clk_scaling_lock);
 	cancel_work_sync(&hba->eeh_work);
+	return 0;
 }
 
 static void ufshcd_err_handling_unprepare(struct ufs_hba *hba)
 {
+	ufshcd_clear_eh_in_progress(hba);
 	ufshcd_scsi_unblock_requests(hba);
 	ufshcd_release(hba);
 	if (ufshcd_is_clkscaling_supported(hba))
 		ufshcd_clk_scaling_suspend(hba, false);
 	ufshcd_clear_ua_wluns(hba);
 	ufshcd_rpm_put(hba);
+	pm_runtime_put(hba->dev);
 }
 
 static inline bool ufshcd_err_handling_should_stop(struct ufs_hba *hba)
 {
 	return (!hba->is_powered || hba->shutting_down ||
-		!hba->sdev_ufs_device ||
+		!hba->sdev_ufs_device || hba->is_sys_suspended ||
 		hba->ufshcd_state == UFSHCD_STATE_ERROR ||
 		(!(hba->saved_err || hba->saved_uic_err || hba->force_reset ||
 		   ufshcd_is_link_broken(hba))));
@@ -6052,9 +6052,13 @@ static void ufshcd_err_handler(struct work_struct *work)
 		up(&hba->host_sem);
 		return;
 	}
-	ufshcd_set_eh_in_progress(hba);
 	spin_unlock_irqrestore(hba->host->host_lock, flags);
-	ufshcd_err_handling_prepare(hba);
+	if (ufshcd_err_handling_prepare(hba)) {
+		dev_err(hba->dev, "%s: error handling preparation failed\n",
+				__func__);
+		up(&hba->host_sem);
+		return;
+	}
 	/* Complete requests that have door-bell cleared by h/w */
 	ufshcd_complete_requests(hba);
 	spin_lock_irqsave(hba->host->host_lock, flags);
@@ -6198,7 +6202,6 @@ static void ufshcd_err_handler(struct work_struct *work)
 			dev_err_ratelimited(hba->dev, "%s: exit: saved_err 0x%x saved_uic_err 0x%x",
 			    __func__, hba->saved_err, hba->saved_uic_err);
 	}
-	ufshcd_clear_eh_in_progress(hba);
 	spin_unlock_irqrestore(hba->host->host_lock, flags);
 	ufshcd_err_handling_unprepare(hba);
 	up(&hba->host_sem);
@@ -8999,6 +9002,9 @@ static int __ufshcd_wl_resume(struct ufs_hba *hba, enum ufs_pm_op pm_op)
 
 	/* Enable Auto-Hibernate if configured */
 	ufshcd_auto_hibern8_enable(hba);
+
+	hba->clk_gating.is_suspended = false;
+	ufshcd_release(hba);
 	goto out;
 
 set_old_link_state:
@@ -9008,8 +9014,6 @@ static int __ufshcd_wl_resume(struct ufs_hba *hba, enum ufs_pm_op pm_op)
 out:
 	if (ret)
 		ufshcd_update_evt_hist(hba, UFS_EVT_WL_RES_ERR, (u32)ret);
-	hba->clk_gating.is_suspended = false;
-	ufshcd_release(hba);
 	hba->wl_pm_op_in_progress = false;
 	return ret <= 0 ? ret : -EINVAL;
 }
-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.


^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [PATCH v3 6/9] scsi: ufs: Update ufshcd_recover_pm_error()
       [not found] <1623300218-9454-1-git-send-email-cang@codeaurora.org>
                   ` (4 preceding siblings ...)
  2021-06-10  4:43 ` [PATCH v3 5/9] scsi: ufs: Simplify error handling preparation Can Guo
@ 2021-06-10  4:43 ` Can Guo
  2021-06-10  4:43 ` [PATCH v3 7/9] scsi: ufs: Let host_sem cover the entire system suspend/resume Can Guo
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 43+ messages in thread
From: Can Guo @ 2021-06-10  4:43 UTC (permalink / raw)
  To: asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team, cang
  Cc: Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	open list

After the error handler performs a successful reset and restore, all the
LUs become active. Forcibly set the runtime PM status of the scsi devices
(and their request queues) underneath hba to ACTIVE to reflect the change.
By doing so, dev->power.runtime_error (if any) is also cleared, so that
runtime PM can get back to work on them. Otherwise, the device(s) may be
left either runtime active or runtime suspended permanently.

Signed-off-by: Can Guo <cang@codeaurora.org>
---
 drivers/scsi/ufs/ufshcd.c | 49 ++++++++++++++++++++---------------------------
 1 file changed, 21 insertions(+), 28 deletions(-)

diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index 0afad6b..c418a19 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -243,6 +243,7 @@ static irqreturn_t ufshcd_intr(int irq, void *__hba);
 static int ufshcd_change_power_mode(struct ufs_hba *hba,
 			     struct ufs_pa_layer_attr *pwr_mode);
 static void ufshcd_schedule_eh_work(struct ufs_hba *hba);
+static void ufshcd_recover_pm_error(struct ufs_hba *hba);
 static int ufshcd_setup_hba_vreg(struct ufs_hba *hba, bool on);
 static int ufshcd_setup_vreg(struct ufs_hba *hba, bool on);
 static inline int ufshcd_config_vreg_hpm(struct ufs_hba *hba,
@@ -5951,13 +5952,15 @@ static int ufshcd_err_handling_prepare(struct ufs_hba *hba)
 	return 0;
 }
 
-static void ufshcd_err_handling_unprepare(struct ufs_hba *hba)
+static void ufshcd_err_handling_unprepare(struct ufs_hba *hba, int reset_err)
 {
 	ufshcd_clear_eh_in_progress(hba);
 	ufshcd_scsi_unblock_requests(hba);
 	ufshcd_release(hba);
 	if (ufshcd_is_clkscaling_supported(hba))
 		ufshcd_clk_scaling_suspend(hba, false);
+	if (!reset_err)
+		ufshcd_recover_pm_error(hba);
 	ufshcd_clear_ua_wluns(hba);
 	ufshcd_rpm_put(hba);
 	pm_runtime_put(hba->dev);
@@ -5976,34 +5979,26 @@ static inline bool ufshcd_err_handling_should_stop(struct ufs_hba *hba)
 static void ufshcd_recover_pm_error(struct ufs_hba *hba)
 {
 	struct Scsi_Host *shost = hba->host;
-	struct scsi_device *sdev;
-	struct request_queue *q;
+	struct scsi_device *sdev = hba->sdev_ufs_device;
+	struct scsi_target *starget = sdev->sdev_target;
 	int ret;
 
 	hba->is_wl_sys_suspended = false;
-	/*
-	 * Set RPM status of wlun device to RPM_ACTIVE,
-	 * this also clears its runtime error.
-	 */
-	ret = pm_runtime_set_active(&hba->sdev_ufs_device->sdev_gendev);
 
-	/* hba device might have a runtime error otherwise */
-	if (ret)
-		ret = pm_runtime_set_active(hba->dev);
-	/*
-	 * If wlun device had runtime error, we also need to resume those
-	 * consumer scsi devices in case any of them has failed to be
-	 * resumed due to supplier runtime resume failure. This is to unblock
-	 * blk_queue_enter in case there are bios waiting inside it.
-	 */
-	if (!ret) {
-		shost_for_each_device(sdev, shost) {
-			q = sdev->request_queue;
-			if (q->dev && (q->rpm_status == RPM_SUSPENDED ||
-				       q->rpm_status == RPM_SUSPENDING))
-				pm_request_resume(q->dev);
-		}
+	/* Resume parent/target to clear path for pm_runtime_set_active() */
+	pm_runtime_get_sync(&starget->dev);
+	shost_for_each_device(sdev, shost) {
+		struct device *dev = &sdev->sdev_gendev;
+
+		pm_runtime_get_sync(dev);
+		/* Clear dev->power.runtime_error */
+		ret = pm_runtime_set_active(dev);
+		if (!ret)
+			/* runtime_error cleared, kick blk_queue_enter() */
+			blk_set_runtime_active(sdev->request_queue);
+		pm_runtime_put(dev);
 	}
+	pm_runtime_put(&starget->dev);
 }
 #else
 static inline void ufshcd_recover_pm_error(struct ufs_hba *hba)
@@ -6037,7 +6032,7 @@ static void ufshcd_err_handler(struct work_struct *work)
 	unsigned long flags;
 	bool err_xfer = false;
 	bool err_tm = false;
-	int err = 0, pmc_err;
+	int err = -1, pmc_err;
 	int tag;
 	bool needs_reset = false, needs_restore = false;
 
@@ -6189,8 +6184,6 @@ static void ufshcd_err_handler(struct work_struct *work)
 		if (err)
 			dev_err(hba->dev, "%s: reset and restore failed with err %d\n",
 					__func__, err);
-		else
-			ufshcd_recover_pm_error(hba);
 		spin_lock_irqsave(hba->host->host_lock, flags);
 	}
 
@@ -6203,7 +6196,7 @@ static void ufshcd_err_handler(struct work_struct *work)
 			    __func__, hba->saved_err, hba->saved_uic_err);
 	}
 	spin_unlock_irqrestore(hba->host->host_lock, flags);
-	ufshcd_err_handling_unprepare(hba);
+	ufshcd_err_handling_unprepare(hba, err);
 	up(&hba->host_sem);
 }
 
-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.


^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [PATCH v3 7/9] scsi: ufs: Let host_sem cover the entire system suspend/resume
       [not found] <1623300218-9454-1-git-send-email-cang@codeaurora.org>
                   ` (5 preceding siblings ...)
  2021-06-10  4:43 ` [PATCH v3 6/9] scsi: ufs: Update ufshcd_recover_pm_error() Can Guo
@ 2021-06-10  4:43 ` Can Guo
  2021-06-10 13:32   ` Adrian Hunter
  2021-06-11 21:00   ` Bart Van Assche
  2021-06-10  4:43 ` [PATCH v3 8/9] scsi: ufs: Update the fast abort path in ufshcd_abort() for PM requests Can Guo
  2021-06-10  4:43 ` [PATCH v3 9/9] scsi: ufs: Apply more limitations to user access Can Guo
  8 siblings, 2 replies; 43+ messages in thread
From: Can Guo @ 2021-06-10  4:43 UTC (permalink / raw)
  To: asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team, cang
  Cc: Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	Adrian Hunter, Kiwoong Kim, Satya Tangirala, Bart Van Assche,
	open list

UFS error handling now does more than just re-probing: it also sends scsi
cmds, e.g., for clearing UACs, and recovers runtime PM errors, which may
change the runtime status of scsi devices. To protect system suspend/resume
from being disturbed by error handling, move the host_sem from the wl pm
ops to ufshcd_suspend_prepare() and ufshcd_resume_complete().

Signed-off-by: Can Guo <cang@codeaurora.org>
---
 drivers/scsi/ufs/ufshcd.c | 8 +++-----
 drivers/scsi/ufs/ufshcd.h | 2 +-
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index c418a19..861942b 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -9060,16 +9060,13 @@ static int ufshcd_wl_suspend(struct device *dev)
 	ktime_t start = ktime_get();
 
 	hba = shost_priv(sdev->host);
-	down(&hba->host_sem);
 
 	if (pm_runtime_suspended(dev))
 		goto out;
 
 	ret = __ufshcd_wl_suspend(hba, UFS_SYSTEM_PM);
-	if (ret) {
+	if (ret)
 		dev_err(&sdev->sdev_gendev, "%s failed: %d\n", __func__,  ret);
-		up(&hba->host_sem);
-	}
 
 out:
 	if (!ret)
@@ -9102,7 +9099,6 @@ static int ufshcd_wl_resume(struct device *dev)
 		hba->curr_dev_pwr_mode, hba->uic_link_state);
 	if (!ret)
 		hba->is_wl_sys_suspended = false;
-	up(&hba->host_sem);
 	return ret;
 }
 #endif
@@ -9665,6 +9661,7 @@ void ufshcd_resume_complete(struct device *dev)
 		ufshcd_rpmb_rpm_put(hba);
 		hba->rpmb_complete_put = false;
 	}
+	up(&hba->host_sem);
 }
 EXPORT_SYMBOL_GPL(ufshcd_resume_complete);
 
@@ -9691,6 +9688,7 @@ int ufshcd_suspend_prepare(struct device *dev)
 		ufshcd_rpmb_rpm_get_sync(hba);
 		hba->rpmb_complete_put = true;
 	}
+	down(&hba->host_sem);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(ufshcd_suspend_prepare);
diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
index eaebb4e..47da47c 100644
--- a/drivers/scsi/ufs/ufshcd.h
+++ b/drivers/scsi/ufs/ufshcd.h
@@ -693,7 +693,7 @@ struct ufs_hba_monitor {
  * @ee_ctrl_mask: Exception event control mask
  * @is_powered: flag to check if HBA is powered
  * @shutting_down: flag to check if shutdown has been invoked
- * @host_sem: semaphore used to serialize concurrent contexts
+ * @host_sem: semaphore used to avoid concurrency of contexts
  * @eh_wq: Workqueue that eh_work works on
  * @eh_work: Worker to handle UFS errors that require s/w attention
  * @eeh_work: Worker to handle exception events
-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.


^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [PATCH v3 8/9] scsi: ufs: Update the fast abort path in ufshcd_abort() for PM requests
       [not found] <1623300218-9454-1-git-send-email-cang@codeaurora.org>
                   ` (6 preceding siblings ...)
  2021-06-10  4:43 ` [PATCH v3 7/9] scsi: ufs: Let host_sem cover the entire system suspend/resume Can Guo
@ 2021-06-10  4:43 ` Can Guo
  2021-06-11 21:02   ` Bart Van Assche
  2021-06-10  4:43 ` [PATCH v3 9/9] scsi: ufs: Apply more limitations to user access Can Guo
  8 siblings, 1 reply; 43+ messages in thread
From: Can Guo @ 2021-06-10  4:43 UTC (permalink / raw)
  To: asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team, cang
  Cc: Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	open list

If PM requests fail during runtime suspend/resume, the RPM framework saves
the error to dev->power.runtime_error. Until the runtime_error is cleared,
runtime PM on this specific device won't work again, leaving the device
either runtime active or runtime suspended permanently.

When a task abort happens to a PM request sent during runtime
suspend/resume, even if it can be successfully aborted, the RPM framework
still saves the (TIMEOUT) error. In this situation, we can leverage error
handling to recover and clear the runtime_error. So, let PM requests take
the fast abort path in ufshcd_abort().

Signed-off-by: Can Guo <cang@codeaurora.org>
---
 drivers/scsi/ufs/ufshcd.c | 38 ++++++++++++++++++++++----------------
 1 file changed, 22 insertions(+), 16 deletions(-)

diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index 861942b..cf24ec2 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -2737,7 +2737,7 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
 		 * err handler blocked for too long. So, just fail the scsi cmd
 		 * sent from PM ops, err handler can recover PM error anyways.
 		 */
-		if (hba->wl_pm_op_in_progress) {
+		if (cmd->request->rq_flags & RQF_PM) {
 			hba->force_reset = true;
 			set_host_byte(cmd, DID_BAD_TARGET);
 			cmd->scsi_done(cmd);
@@ -2760,7 +2760,7 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
 	}
 
 	if (unlikely(test_bit(tag, &hba->outstanding_reqs))) {
-		if (hba->wl_pm_op_in_progress) {
+		if (cmd->request->rq_flags & RQF_PM) {
 			set_host_byte(cmd, DID_BAD_TARGET);
 			cmd->scsi_done(cmd);
 		} else {
@@ -6985,11 +6985,14 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
 	int err = 0;
 	struct ufshcd_lrb *lrbp;
 	u32 reg;
+	bool need_eh = false;
 
 	host = cmd->device->host;
 	hba = shost_priv(host);
 	tag = cmd->request->tag;
 	lrbp = &hba->lrb[tag];
+
+	dev_info(hba->dev, "%s: Device abort task at tag %d\n", __func__, tag);
 	if (!ufshcd_valid_tag(hba, tag)) {
 		dev_err(hba->dev,
 			"%s: invalid command tag %d: cmd=0x%p, cmd->request=0x%p",
@@ -7007,9 +7010,6 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
 		goto out;
 	}
 
-	/* Print Transfer Request of aborted task */
-	dev_info(hba->dev, "%s: Device abort task at tag %d\n", __func__, tag);
-
 	/*
 	 * Print detailed info about aborted request.
 	 * As more than one request might get aborted at the same time,
@@ -7037,21 +7037,21 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
 	}
 
 	/*
-	 * Task abort to the device W-LUN is illegal. When this command
-	 * will fail, due to spec violation, scsi err handling next step
-	 * will be to send LU reset which, again, is a spec violation.
-	 * To avoid these unnecessary/illegal steps, first we clean up
-	 * the lrb taken by this cmd and re-set it in outstanding_reqs,
-	 * then queue the eh_work and bail.
+	 * This fast path guarantees the cmd always gets aborted successfully,
+	 * meanwhile it invokes the error handler. It allows contexts, which
+	 * are blocked by this cmd, to fail fast. It serves multiple purposes:
+	 * #1 To avoid unnecessary/illegal abort attempts to the W-LU.
+	 * #2 To avoid live lock between eh_work and specific contexts, i.e.,
+	 *    suspend/resume and eh_work itself.
+	 * #3 To let eh_work recover runtime PM error in case abort happens
+	 *    to cmds sent from runtime suspend/resume ops.
 	 */
-	if (lrbp->lun == UFS_UPIU_UFS_DEVICE_WLUN) {
+	if (lrbp->lun == UFS_UPIU_UFS_DEVICE_WLUN ||
+	    (cmd->request->rq_flags & RQF_PM)) {
 		ufshcd_update_evt_hist(hba, UFS_EVT_ABORT, lrbp->lun);
 		__ufshcd_transfer_req_compl(hba, (1UL << tag));
 		set_bit(tag, &hba->outstanding_reqs);
-		spin_lock_irqsave(host->host_lock, flags);
-		hba->force_reset = true;
-		ufshcd_schedule_eh_work(hba);
-		spin_unlock_irqrestore(host->host_lock, flags);
+		need_eh = true;
 		goto out;
 	}
 
@@ -7065,6 +7065,12 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
 cleanup:
 		__ufshcd_transfer_req_compl(hba, (1UL << tag));
 out:
+		if (cmd->request->rq_flags & RQF_PM || need_eh) {
+			spin_lock_irqsave(host->host_lock, flags);
+			hba->force_reset = true;
+			ufshcd_schedule_eh_work(hba);
+			spin_unlock_irqrestore(host->host_lock, flags);
+		}
 		err = SUCCESS;
 	} else {
 		dev_err(hba->dev, "%s: failed with err %d\n", __func__, err);
-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.


^ permalink raw reply related	[flat|nested] 43+ messages in thread

* [PATCH v3 9/9] scsi: ufs: Apply more limitations to user access
       [not found] <1623300218-9454-1-git-send-email-cang@codeaurora.org>
                   ` (7 preceding siblings ...)
  2021-06-10  4:43 ` [PATCH v3 8/9] scsi: ufs: Update the fast abort path in ufshcd_abort() for PM requests Can Guo
@ 2021-06-10  4:43 ` Can Guo
  2021-06-11 21:03   ` Bart Van Assche
  8 siblings, 1 reply; 43+ messages in thread
From: Can Guo @ 2021-06-10  4:43 UTC (permalink / raw)
  To: asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team, cang
  Cc: Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Adrian Hunter, Bean Huo, Stanley Chu,
	Keoseong Park, Jaegeuk Kim, Bart van Assche, Dinghao Liu,
	Gustavo A. R. Silva, Kiwoong Kim, Satya Tangirala, open list

Do not let users access the HW if hba resume fails or the hba is not in a
good state, otherwise it may lead to various stability issues.

Signed-off-by: Can Guo <cang@codeaurora.org>
---
 drivers/scsi/ufs/ufs-debugfs.c |  27 ++---------
 drivers/scsi/ufs/ufs-sysfs.c   | 105 ++++++++++++++---------------------------
 drivers/scsi/ufs/ufs_bsg.c     |  16 +++----
 drivers/scsi/ufs/ufshcd.c      |  63 +++++++++++++++----------
 drivers/scsi/ufs/ufshcd.h      |  17 ++++++-
 5 files changed, 101 insertions(+), 127 deletions(-)

diff --git a/drivers/scsi/ufs/ufs-debugfs.c b/drivers/scsi/ufs/ufs-debugfs.c
index 4e1ff20..42c1c8b 100644
--- a/drivers/scsi/ufs/ufs-debugfs.c
+++ b/drivers/scsi/ufs/ufs-debugfs.c
@@ -52,25 +52,6 @@ static int ee_usr_mask_get(void *data, u64 *val)
 	return 0;
 }
 
-static int ufs_debugfs_get_user_access(struct ufs_hba *hba)
-__acquires(&hba->host_sem)
-{
-	down(&hba->host_sem);
-	if (!ufshcd_is_user_access_allowed(hba)) {
-		up(&hba->host_sem);
-		return -EBUSY;
-	}
-	ufshcd_rpm_get_sync(hba);
-	return 0;
-}
-
-static void ufs_debugfs_put_user_access(struct ufs_hba *hba)
-__releases(&hba->host_sem)
-{
-	ufshcd_rpm_put_sync(hba);
-	up(&hba->host_sem);
-}
-
 static int ee_usr_mask_set(void *data, u64 val)
 {
 	struct ufs_hba *hba = data;
@@ -78,11 +59,11 @@ static int ee_usr_mask_set(void *data, u64 val)
 
 	if (val & ~(u64)MASK_EE_STATUS)
 		return -EINVAL;
-	err = ufs_debugfs_get_user_access(hba);
+	err = ufshcd_get_user_access(hba);
 	if (err)
 		return err;
 	err = ufshcd_update_ee_usr_mask(hba, val, MASK_EE_STATUS);
-	ufs_debugfs_put_user_access(hba);
+	ufshcd_put_user_access(hba);
 	return err;
 }
 
@@ -120,10 +101,10 @@ static void ufs_debugfs_restart_ee(struct work_struct *work)
 	struct ufs_hba *hba = container_of(work, struct ufs_hba, debugfs_ee_work.work);
 
 	if (!hba->ee_usr_mask || pm_runtime_suspended(hba->dev) ||
-	    ufs_debugfs_get_user_access(hba))
+	    ufshcd_get_user_access(hba))
 		return;
 	ufshcd_write_ee_control(hba);
-	ufs_debugfs_put_user_access(hba);
+	ufshcd_put_user_access(hba);
 }
 
 void ufs_debugfs_hba_init(struct ufs_hba *hba)
diff --git a/drivers/scsi/ufs/ufs-sysfs.c b/drivers/scsi/ufs/ufs-sysfs.c
index 52bd807..b8732b9 100644
--- a/drivers/scsi/ufs/ufs-sysfs.c
+++ b/drivers/scsi/ufs/ufs-sysfs.c
@@ -160,22 +160,14 @@ static ssize_t auto_hibern8_show(struct device *dev,
 	if (!ufshcd_is_auto_hibern8_supported(hba))
 		return -EOPNOTSUPP;
 
-	down(&hba->host_sem);
-	if (!ufshcd_is_user_access_allowed(hba)) {
-		ret = -EBUSY;
-		goto out;
-	}
-
-	pm_runtime_get_sync(hba->dev);
+	ret = ufshcd_get_user_access(hba);
+	if (ret)
+		return ret;
 	ufshcd_hold(hba, false);
 	ahit = ufshcd_readl(hba, REG_AUTO_HIBERNATE_IDLE_TIMER);
 	ufshcd_release(hba);
-	pm_runtime_put_sync(hba->dev);
-
 	ret = sysfs_emit(buf, "%d\n", ufshcd_ahit_to_us(ahit));
-
-out:
-	up(&hba->host_sem);
+	ufshcd_put_user_access(hba);
 	return ret;
 }
 
@@ -202,7 +194,7 @@ static ssize_t auto_hibern8_store(struct device *dev,
 		goto out;
 	}
 
-	ufshcd_auto_hibern8_update(hba, ufshcd_us_to_ahit(timer));
+	ret = ufshcd_auto_hibern8_update(hba, ufshcd_us_to_ahit(timer));
 
 out:
 	up(&hba->host_sem);
@@ -239,17 +231,11 @@ static ssize_t wb_on_store(struct device *dev, struct device_attribute *attr,
 	if (wb_enable != 0 && wb_enable != 1)
 		return -EINVAL;
 
-	down(&hba->host_sem);
-	if (!ufshcd_is_user_access_allowed(hba)) {
-		res = -EBUSY;
-		goto out;
-	}
-
-	ufshcd_rpm_get_sync(hba);
+	res = ufshcd_get_user_access(hba);
+	if (res)
+		return res;
 	res = ufshcd_wb_toggle(hba, wb_enable);
-	ufshcd_rpm_put_sync(hba);
-out:
-	up(&hba->host_sem);
+	ufshcd_put_user_access(hba);
 	return res < 0 ? res : count;
 }
 
@@ -527,16 +513,11 @@ static ssize_t ufs_sysfs_read_desc_param(struct ufs_hba *hba,
 	if (param_size > 8)
 		return -EINVAL;
 
-	down(&hba->host_sem);
-	if (!ufshcd_is_user_access_allowed(hba)) {
-		ret = -EBUSY;
-		goto out;
-	}
-
-	ufshcd_rpm_get_sync(hba);
+	ret = ufshcd_get_user_access(hba);
+	if (ret)
+		return ret;
 	ret = ufshcd_read_desc_param(hba, desc_id, desc_index,
 				param_offset, desc_buf, param_size);
-	ufshcd_rpm_put_sync(hba);
 	if (ret) {
 		ret = -EINVAL;
 		goto out;
@@ -561,7 +542,7 @@ static ssize_t ufs_sysfs_read_desc_param(struct ufs_hba *hba,
 	}
 
 out:
-	up(&hba->host_sem);
+	ufshcd_put_user_access(hba);
 	return ret;
 }
 
@@ -904,23 +885,20 @@ static ssize_t _name##_show(struct device *dev,				\
 	int desc_len = QUERY_DESC_MAX_SIZE;				\
 	u8 *desc_buf;							\
 									\
-	down(&hba->host_sem);						\
-	if (!ufshcd_is_user_access_allowed(hba)) {			\
-		up(&hba->host_sem);					\
-		return -EBUSY;						\
-	}								\
+	ret = ufshcd_get_user_access(hba);				\
+	if (ret)							\
+		return ret;						\
 	desc_buf = kzalloc(QUERY_DESC_MAX_SIZE, GFP_ATOMIC);		\
 	if (!desc_buf) {						\
-		up(&hba->host_sem);					\
-		return -ENOMEM;						\
+		ret = -ENOMEM;						\
+		goto out;						\
 	}								\
-	ufshcd_rpm_get_sync(hba);					\
 	ret = ufshcd_query_descriptor_retry(hba,			\
 		UPIU_QUERY_OPCODE_READ_DESC, QUERY_DESC_IDN_DEVICE,	\
 		0, 0, desc_buf, &desc_len);				\
 	if (ret) {							\
 		ret = -EINVAL;						\
-		goto out;						\
+		goto out_free;						\
 	}								\
 	index = desc_buf[DEVICE_DESC_PARAM##_pname];			\
 	kfree(desc_buf);						\
@@ -928,12 +906,12 @@ static ssize_t _name##_show(struct device *dev,				\
 	ret = ufshcd_read_string_desc(hba, index, &desc_buf,		\
 				      SD_ASCII_STD);			\
 	if (ret < 0)							\
-		goto out;						\
+		goto out_free;						\
 	ret = sysfs_emit(buf, "%s\n", desc_buf);			\
-out:									\
-	ufshcd_rpm_put_sync(hba);					\
+out_free:								\
 	kfree(desc_buf);						\
-	up(&hba->host_sem);						\
+out:									\
+	ufshcd_put_user_access(hba);					\
 	return ret;							\
 }									\
 static DEVICE_ATTR_RO(_name)
@@ -973,24 +951,20 @@ static ssize_t _name##_show(struct device *dev,				\
 	int ret;							\
 	struct ufs_hba *hba = dev_get_drvdata(dev);			\
 									\
-	down(&hba->host_sem);						\
-	if (!ufshcd_is_user_access_allowed(hba)) {			\
-		up(&hba->host_sem);					\
-		return -EBUSY;						\
-	}								\
+	ret = ufshcd_get_user_access(hba);				\
+	if (ret)							\
+		return ret;						\
 	if (ufshcd_is_wb_flags(QUERY_FLAG_IDN##_uname))			\
 		index = ufshcd_wb_get_query_index(hba);			\
-	ufshcd_rpm_get_sync(hba);					\
 	ret = ufshcd_query_flag(hba, UPIU_QUERY_OPCODE_READ_FLAG,	\
 		QUERY_FLAG_IDN##_uname, index, &flag);			\
-	ufshcd_rpm_put_sync(hba);					\
 	if (ret) {							\
 		ret = -EINVAL;						\
 		goto out;						\
 	}								\
 	ret = sysfs_emit(buf, "%s\n", flag ? "true" : "false");		\
 out:									\
-	up(&hba->host_sem);						\
+	ufshcd_put_user_access(hba);					\
 	return ret;							\
 }									\
 static DEVICE_ATTR_RO(_name)
@@ -1042,24 +1016,20 @@ static ssize_t _name##_show(struct device *dev,				\
 	int ret;							\
 	u8 index = 0;							\
 									\
-	down(&hba->host_sem);						\
-	if (!ufshcd_is_user_access_allowed(hba)) {			\
-		up(&hba->host_sem);					\
-		return -EBUSY;						\
-	}								\
+	ret = ufshcd_get_user_access(hba);				\
+	if (ret)							\
+		return ret;						\
 	if (ufshcd_is_wb_attrs(QUERY_ATTR_IDN##_uname))			\
 		index = ufshcd_wb_get_query_index(hba);			\
-	ufshcd_rpm_get_sync(hba);					\
 	ret = ufshcd_query_attr(hba, UPIU_QUERY_OPCODE_READ_ATTR,	\
 		QUERY_ATTR_IDN##_uname, index, 0, &value);		\
-	ufshcd_rpm_put_sync(hba);					\
 	if (ret) {							\
 		ret = -EINVAL;						\
 		goto out;						\
 	}								\
 	ret = sysfs_emit(buf, "0x%08X\n", value);			\
 out:									\
-	up(&hba->host_sem);						\
+	ufshcd_put_user_access(hba);					\
 	return ret;							\
 }									\
 static DEVICE_ATTR_RO(_name)
@@ -1195,16 +1165,11 @@ static ssize_t dyn_cap_needed_attribute_show(struct device *dev,
 	u8 lun = ufshcd_scsi_to_upiu_lun(sdev->lun);
 	int ret;
 
-	down(&hba->host_sem);
-	if (!ufshcd_is_user_access_allowed(hba)) {
-		ret = -EBUSY;
-		goto out;
-	}
-
-	ufshcd_rpm_get_sync(hba);
+	ret = ufshcd_get_user_access(hba);
+	if (ret)
+		return ret;
 	ret = ufshcd_query_attr(hba, UPIU_QUERY_OPCODE_READ_ATTR,
 		QUERY_ATTR_IDN_DYN_CAP_NEEDED, lun, 0, &value);
-	ufshcd_rpm_put_sync(hba);
 	if (ret) {
 		ret = -EINVAL;
 		goto out;
@@ -1213,7 +1178,7 @@ static ssize_t dyn_cap_needed_attribute_show(struct device *dev,
 	ret = sysfs_emit(buf, "0x%08X\n", value);
 
 out:
-	up(&hba->host_sem);
+	ufshcd_put_user_access(hba);
 	return ret;
 }
 static DEVICE_ATTR_RO(dyn_cap_needed_attribute);
diff --git a/drivers/scsi/ufs/ufs_bsg.c b/drivers/scsi/ufs/ufs_bsg.c
index 39bf204..c5b3eb8 100644
--- a/drivers/scsi/ufs/ufs_bsg.c
+++ b/drivers/scsi/ufs/ufs_bsg.c
@@ -97,7 +97,9 @@ static int ufs_bsg_request(struct bsg_job *job)
 
 	bsg_reply->reply_payload_rcv_len = 0;
 
-	ufshcd_rpm_get_sync(hba);
+	ret = ufshcd_get_user_access(hba);
+	if (ret)
+		goto out;
 
 	msgcode = bsg_request->msgcode;
 	switch (msgcode) {
@@ -105,10 +107,8 @@ static int ufs_bsg_request(struct bsg_job *job)
 		desc_op = bsg_request->upiu_req.qr.opcode;
 		ret = ufs_bsg_alloc_desc_buffer(hba, job, &desc_buff,
 						&desc_len, desc_op);
-		if (ret) {
-			ufshcd_rpm_put_sync(hba);
-			goto out;
-		}
+		if (ret)
+			goto out_put_access;
 
 		fallthrough;
 	case UPIU_TRANSACTION_NOP_OUT:
@@ -138,10 +138,8 @@ static int ufs_bsg_request(struct bsg_job *job)
 		break;
 	}
 
-	ufshcd_rpm_put_sync(hba);
-
 	if (!desc_buff)
-		goto out;
+		goto out_put_access;
 
 	if (desc_op == UPIU_QUERY_OPCODE_READ_DESC && desc_len)
 		bsg_reply->reply_payload_rcv_len =
@@ -151,6 +149,8 @@ static int ufs_bsg_request(struct bsg_job *job)
 
 	kfree(desc_buff);
 
+out_put_access:
+	ufshcd_put_user_access(hba);
 out:
 	bsg_reply->result = ret;
 	job->reply_len = sizeof(struct ufs_bsg_reply);
diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index cf24ec2..5ec829c 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -128,15 +128,6 @@ enum {
 	UFSHCD_CAN_QUEUE	= 32,
 };
 
-/* UFSHCD states */
-enum {
-	UFSHCD_STATE_RESET,
-	UFSHCD_STATE_ERROR,
-	UFSHCD_STATE_OPERATIONAL,
-	UFSHCD_STATE_EH_SCHEDULED_FATAL,
-	UFSHCD_STATE_EH_SCHEDULED_NON_FATAL,
-};
-
 /* UFSHCD error handling flags */
 enum {
 	UFSHCD_EH_IN_PROGRESS = (1 << 0),
@@ -254,6 +245,31 @@ static inline void ufshcd_wb_toggle_flush(struct ufs_hba *hba, bool enable);
 static void ufshcd_hba_vreg_set_lpm(struct ufs_hba *hba);
 static void ufshcd_hba_vreg_set_hpm(struct ufs_hba *hba);
 
+int ufshcd_get_user_access(struct ufs_hba *hba)
+__acquires(&hba->host_sem)
+{
+	down(&hba->host_sem);
+	if (!ufshcd_is_user_access_allowed(hba)) {
+		up(&hba->host_sem);
+		return -EBUSY;
+	}
+	if (ufshcd_rpm_get_sync(hba)) {
+		ufshcd_rpm_put_sync(hba);
+		up(&hba->host_sem);
+		return -EBUSY;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ufshcd_get_user_access);
+
+void ufshcd_put_user_access(struct ufs_hba *hba)
+__releases(&hba->host_sem)
+{
+	ufshcd_rpm_put_sync(hba);
+	up(&hba->host_sem);
+}
+EXPORT_SYMBOL_GPL(ufshcd_put_user_access);
+
 static inline bool ufshcd_valid_tag(struct ufs_hba *hba, int tag)
 {
 	return tag >= 0 && tag < hba->nutrs;
@@ -1553,19 +1569,14 @@ static ssize_t ufshcd_clkscale_enable_store(struct device *dev,
 	if (kstrtou32(buf, 0, &value))
 		return -EINVAL;
 
-	down(&hba->host_sem);
-	if (!ufshcd_is_user_access_allowed(hba)) {
-		err = -EBUSY;
-		goto out;
-	}
+	err = ufshcd_get_user_access(hba);
+	if (err)
+		return err;
+	ufshcd_hold(hba, false);
 
 	value = !!value;
 	if (value == hba->clk_scaling.is_enabled)
 		goto out;
-
-	ufshcd_rpm_get_sync(hba);
-	ufshcd_hold(hba, false);
-
 	hba->clk_scaling.is_enabled = value;
 
 	if (value) {
@@ -1578,10 +1589,9 @@ static ssize_t ufshcd_clkscale_enable_store(struct device *dev,
 					__func__, err);
 	}
 
-	ufshcd_release(hba);
-	ufshcd_rpm_put_sync(hba);
 out:
-	up(&hba->host_sem);
+	ufshcd_release(hba);
+	ufshcd_put_user_access(hba);
 	return err ? err : count;
 }
 
@@ -4180,13 +4190,13 @@ int ufshcd_uic_hibern8_exit(struct ufs_hba *hba)
 }
 EXPORT_SYMBOL_GPL(ufshcd_uic_hibern8_exit);
 
-void ufshcd_auto_hibern8_update(struct ufs_hba *hba, u32 ahit)
+int ufshcd_auto_hibern8_update(struct ufs_hba *hba, u32 ahit)
 {
 	unsigned long flags;
 	bool update = false;
 
 	if (!ufshcd_is_auto_hibern8_supported(hba))
-		return;
+		return 0;
 
 	spin_lock_irqsave(hba->host->host_lock, flags);
 	if (hba->ahit != ahit) {
@@ -4197,12 +4207,17 @@ void ufshcd_auto_hibern8_update(struct ufs_hba *hba, u32 ahit)
 
 	if (update &&
 	    !pm_runtime_suspended(&hba->sdev_ufs_device->sdev_gendev)) {
-		ufshcd_rpm_get_sync(hba);
+		if (ufshcd_rpm_get_sync(hba)) {
+			ufshcd_rpm_put_sync(hba);
+			return -EBUSY;
+		}
 		ufshcd_hold(hba, false);
 		ufshcd_auto_hibern8_enable(hba);
 		ufshcd_release(hba);
 		ufshcd_rpm_put_sync(hba);
 	}
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(ufshcd_auto_hibern8_update);
 
diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
index 47da47c..5cd1484 100644
--- a/drivers/scsi/ufs/ufshcd.h
+++ b/drivers/scsi/ufs/ufshcd.h
@@ -101,6 +101,15 @@ struct uic_command {
 	struct completion done;
 };
 
+/* UFSHCD states */
+enum {
+	UFSHCD_STATE_RESET,
+	UFSHCD_STATE_ERROR,
+	UFSHCD_STATE_OPERATIONAL,
+	UFSHCD_STATE_EH_SCHEDULED_FATAL,
+	UFSHCD_STATE_EH_SCHEDULED_NON_FATAL,
+};
+
 /* Used to differentiate the power management options */
 enum ufs_pm_op {
 	UFS_RUNTIME_PM,
@@ -931,7 +940,9 @@ static inline bool ufshcd_is_wb_allowed(struct ufs_hba *hba)
 
 static inline bool ufshcd_is_user_access_allowed(struct ufs_hba *hba)
 {
-	return !hba->shutting_down;
+	return !hba->shutting_down && !hba->is_sys_suspended &&
+		!hba->is_wl_sys_suspended &&
+		hba->ufshcd_state == UFSHCD_STATE_OPERATIONAL;
 }
 
 #define ufshcd_writel(hba, val, reg)	\
@@ -1104,7 +1115,7 @@ int ufshcd_query_flag(struct ufs_hba *hba, enum query_opcode opcode,
 	enum flag_idn idn, u8 index, bool *flag_res);
 
 void ufshcd_auto_hibern8_enable(struct ufs_hba *hba);
-void ufshcd_auto_hibern8_update(struct ufs_hba *hba, u32 ahit);
+int ufshcd_auto_hibern8_update(struct ufs_hba *hba, u32 ahit);
 void ufshcd_fixup_dev_quirks(struct ufs_hba *hba, struct ufs_dev_fix *fixups);
 #define SD_ASCII_STD true
 #define SD_RAW false
@@ -1131,6 +1142,8 @@ int ufshcd_exec_raw_upiu_cmd(struct ufs_hba *hba,
 int ufshcd_wb_toggle(struct ufs_hba *hba, bool enable);
 int ufshcd_suspend_prepare(struct device *dev);
 void ufshcd_resume_complete(struct device *dev);
+int ufshcd_get_user_access(struct ufs_hba *hba);
+void ufshcd_put_user_access(struct ufs_hba *hba);
 
 /* Wrapper functions for safely calling variant operations */
 static inline const char *ufshcd_get_var_name(struct ufs_hba *hba)
-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.


^ permalink raw reply related	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 1/9] scsi: ufs: Differentiate status between hba pm ops and wl pm ops
  2021-06-10  4:43 ` [PATCH v3 1/9] scsi: ufs: Differentiate status between hba pm ops and wl pm ops Can Guo
@ 2021-06-10 11:15   ` Adrian Hunter
  2021-06-11  0:53     ` Can Guo
  2021-06-11 20:40   ` Bart Van Assche
  2021-06-16 17:50   ` Bart Van Assche
  2 siblings, 1 reply; 43+ messages in thread
From: Adrian Hunter @ 2021-06-10 11:15 UTC (permalink / raw)
  To: Can Guo, asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team
  Cc: Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	Kiwoong Kim, Satya Tangirala, Bart Van Assche, open list

On 10/06/21 7:43 am, Can Guo wrote:
> Put pm_op_in_progress and is_sys_suspend flags back to ufshcd hba pm ops,
> add two new flags, namely wl_pm_op_in_progress and is_wl_sys_suspended, to
> track the UFS device W-LU pm ops. This helps us differentiate the status of
> hba and wl pm ops when we need to do troubleshooting.

Really you have 2 changes here:
1. Renaming pm_op_in_progress / is_sys_suspended to wl_pm_op_in_progress / is_wl_sys_suspended
2. Introducing flags for the status of hba

So it should really be 2 patches.

That would show up things like:
- did you intend not to change hba->is_sys_suspended in ufs_qcom_resume() ?

> 
> Signed-off-by: Can Guo <cang@codeaurora.org>
> ---
>  drivers/scsi/ufs/ufshcd.c | 42 ++++++++++++++++++++++++++++--------------
>  drivers/scsi/ufs/ufshcd.h |  4 +++-
>  2 files changed, 31 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
> index 25fe18a..47b2a9a 100644
> --- a/drivers/scsi/ufs/ufshcd.c
> +++ b/drivers/scsi/ufs/ufshcd.c
> @@ -549,7 +549,9 @@ static void ufshcd_print_host_state(struct ufs_hba *hba)
>  		hba->saved_err, hba->saved_uic_err);
>  	dev_err(hba->dev, "Device power mode=%d, UIC link state=%d\n",
>  		hba->curr_dev_pwr_mode, hba->uic_link_state);
> -	dev_err(hba->dev, "PM in progress=%d, sys. suspended=%d\n",
> +	dev_err(hba->dev, "wl_pm_op_in_progress=%d, is_wl_sys_suspended=%d\n",
> +		hba->wl_pm_op_in_progress, hba->is_wl_sys_suspended);
> +	dev_err(hba->dev, "pm_op_in_progress=%d, is_sys_suspended=%d\n",
>  		hba->pm_op_in_progress, hba->is_sys_suspended);
>  	dev_err(hba->dev, "Auto BKOPS=%d, Host self-block=%d\n",
>  		hba->auto_bkops_enabled, hba->host->host_self_blocked);
> @@ -1999,7 +2001,7 @@ static void ufshcd_clk_scaling_start_busy(struct ufs_hba *hba)
>  	if (!hba->clk_scaling.active_reqs++)
>  		queue_resume_work = true;
>  
> -	if (!hba->clk_scaling.is_enabled || hba->pm_op_in_progress) {
> +	if (!hba->clk_scaling.is_enabled || hba->wl_pm_op_in_progress) {
>  		spin_unlock_irqrestore(hba->host->host_lock, flags);
>  		return;
>  	}
> @@ -2734,7 +2736,7 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
>  		 * err handler blocked for too long. So, just fail the scsi cmd
>  		 * sent from PM ops, err handler can recover PM error anyways.
>  		 */
> -		if (hba->pm_op_in_progress) {
> +		if (hba->wl_pm_op_in_progress) {
>  			hba->force_reset = true;
>  			set_host_byte(cmd, DID_BAD_TARGET);
>  			cmd->scsi_done(cmd);
> @@ -2767,7 +2769,7 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
>  		(hba->clk_gating.state != CLKS_ON));
>  
>  	if (unlikely(test_bit(tag, &hba->outstanding_reqs))) {
> -		if (hba->pm_op_in_progress)
> +		if (hba->wl_pm_op_in_progress)
>  			set_host_byte(cmd, DID_BAD_TARGET);
>  		else
>  			err = SCSI_MLQUEUE_HOST_BUSY;
> @@ -5116,7 +5118,7 @@ ufshcd_transfer_rsp_status(struct ufs_hba *hba, struct ufshcd_lrb *lrbp)
>  			 * solution could be to abort the system suspend if
>  			 * UFS device needs urgent BKOPs.
>  			 */
> -			if (!hba->pm_op_in_progress &&
> +			if (!hba->wl_pm_op_in_progress &&
>  			    !ufshcd_eh_in_progress(hba) &&
>  			    ufshcd_is_exception_event(lrbp->ucd_rsp_ptr))
>  				/* Flushed in suspend */
> @@ -5916,7 +5918,7 @@ static void ufshcd_err_handling_prepare(struct ufs_hba *hba)
>  {
>  	ufshcd_rpm_get_sync(hba);
>  	if (pm_runtime_status_suspended(&hba->sdev_ufs_device->sdev_gendev) ||
> -	    hba->is_sys_suspended) {
> +	    hba->is_wl_sys_suspended) {
>  		enum ufs_pm_op pm_op;
>  
>  		/*
> @@ -5933,7 +5935,7 @@ static void ufshcd_err_handling_prepare(struct ufs_hba *hba)
>  		if (!ufshcd_is_clkgating_allowed(hba))
>  			ufshcd_setup_clocks(hba, true);
>  		ufshcd_release(hba);
> -		pm_op = hba->is_sys_suspended ? UFS_SYSTEM_PM : UFS_RUNTIME_PM;
> +		pm_op = hba->is_wl_sys_suspended ? UFS_SYSTEM_PM : UFS_RUNTIME_PM;
>  		ufshcd_vops_resume(hba, pm_op);
>  	} else {
>  		ufshcd_hold(hba, false);
> @@ -5976,7 +5978,7 @@ static void ufshcd_recover_pm_error(struct ufs_hba *hba)
>  	struct request_queue *q;
>  	int ret;
>  
> -	hba->is_sys_suspended = false;
> +	hba->is_wl_sys_suspended = false;
>  	/*
>  	 * Set RPM status of wlun device to RPM_ACTIVE,
>  	 * this also clears its runtime error.
> @@ -8784,7 +8786,7 @@ static int __ufshcd_wl_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op)
>  	enum ufs_dev_pwr_mode req_dev_pwr_mode;
>  	enum uic_link_state req_link_state;
>  
> -	hba->pm_op_in_progress = true;
> +	hba->wl_pm_op_in_progress = true;
>  	if (pm_op != UFS_SHUTDOWN_PM) {
>  		pm_lvl = pm_op == UFS_RUNTIME_PM ?
>  			 hba->rpm_lvl : hba->spm_lvl;
> @@ -8919,7 +8921,7 @@ static int __ufshcd_wl_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op)
>  		hba->clk_gating.is_suspended = false;
>  		ufshcd_release(hba);
>  	}
> -	hba->pm_op_in_progress = false;
> +	hba->wl_pm_op_in_progress = false;
>  	return ret;
>  }
>  
> @@ -8928,7 +8930,7 @@ static int __ufshcd_wl_resume(struct ufs_hba *hba, enum ufs_pm_op pm_op)
>  	int ret;
>  	enum uic_link_state old_link_state = hba->uic_link_state;
>  
> -	hba->pm_op_in_progress = true;
> +	hba->wl_pm_op_in_progress = true;
>  
>  	/*
>  	 * Call vendor specific resume callback. As these callbacks may access
> @@ -9006,7 +9008,7 @@ static int __ufshcd_wl_resume(struct ufs_hba *hba, enum ufs_pm_op pm_op)
>  		ufshcd_update_evt_hist(hba, UFS_EVT_WL_RES_ERR, (u32)ret);
>  	hba->clk_gating.is_suspended = false;
>  	ufshcd_release(hba);
> -	hba->pm_op_in_progress = false;
> +	hba->wl_pm_op_in_progress = false;
>  	return ret;
>  }
>  
> @@ -9072,7 +9074,7 @@ static int ufshcd_wl_suspend(struct device *dev)
>  
>  out:
>  	if (!ret)
> -		hba->is_sys_suspended = true;
> +		hba->is_wl_sys_suspended = true;
>  	trace_ufshcd_wl_suspend(dev_name(dev), ret,
>  		ktime_to_us(ktime_sub(ktime_get(), start)),
>  		hba->curr_dev_pwr_mode, hba->uic_link_state);
> @@ -9100,7 +9102,7 @@ static int ufshcd_wl_resume(struct device *dev)
>  		ktime_to_us(ktime_sub(ktime_get(), start)),
>  		hba->curr_dev_pwr_mode, hba->uic_link_state);
>  	if (!ret)
> -		hba->is_sys_suspended = false;
> +		hba->is_wl_sys_suspended = false;
>  	up(&hba->host_sem);
>  	return ret;
>  }
> @@ -9141,6 +9143,8 @@ static int ufshcd_suspend(struct ufs_hba *hba)
>  
>  	if (!hba->is_powered)
>  		return 0;
> +
> +	hba->pm_op_in_progress = true;
>  	/*
>  	 * Disable the host irq as host controller as there won't be any
>  	 * host controller transaction expected till resume.
> @@ -9160,6 +9164,7 @@ static int ufshcd_suspend(struct ufs_hba *hba)
>  	ufshcd_vreg_set_lpm(hba);
>  	/* Put the host controller in low power mode if possible */
>  	ufshcd_hba_vreg_set_lpm(hba);
> +	hba->pm_op_in_progress = false;
>  	return ret;
>  }
>  
> @@ -9179,6 +9184,7 @@ static int ufshcd_resume(struct ufs_hba *hba)
>  	if (!hba->is_powered)
>  		return 0;
>  
> +	hba->pm_op_in_progress = true;
>  	ufshcd_hba_vreg_set_hpm(hba);
>  	ret = ufshcd_vreg_set_hpm(hba);
>  	if (ret)
> @@ -9198,6 +9204,7 @@ static int ufshcd_resume(struct ufs_hba *hba)
>  out:
>  	if (ret)
>  		ufshcd_update_evt_hist(hba, UFS_EVT_RESUME_ERR, (u32)ret);
> +	hba->pm_op_in_progress = false;
>  	return ret;
>  }
>  
> @@ -9222,6 +9229,10 @@ int ufshcd_system_suspend(struct ufs_hba *hba)
>  	trace_ufshcd_system_suspend(dev_name(hba->dev), ret,
>  		ktime_to_us(ktime_sub(ktime_get(), start)),
>  		hba->curr_dev_pwr_mode, hba->uic_link_state);
> +
> +	if (!ret)
> +		hba->is_sys_suspended = true;
> +
>  	return ret;
>  }
>  EXPORT_SYMBOL(ufshcd_system_suspend);
> @@ -9248,6 +9259,9 @@ int ufshcd_system_resume(struct ufs_hba *hba)
>  		ktime_to_us(ktime_sub(ktime_get(), start)),
>  		hba->curr_dev_pwr_mode, hba->uic_link_state);
>  
> +	if (!ret)
> +		hba->is_sys_suspended = false;
> +
>  	return ret;
>  }
>  EXPORT_SYMBOL(ufshcd_system_resume);
> diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
> index c98d540..eaebb4e 100644
> --- a/drivers/scsi/ufs/ufshcd.h
> +++ b/drivers/scsi/ufs/ufshcd.h
> @@ -752,7 +752,8 @@ struct ufs_hba {
>  	enum ufs_pm_level spm_lvl;
>  	struct device_attribute rpm_lvl_attr;
>  	struct device_attribute spm_lvl_attr;
> -	int pm_op_in_progress;
> +	bool pm_op_in_progress;
> +	bool wl_pm_op_in_progress;
>  
>  	/* Auto-Hibernate Idle Timer register value */
>  	u32 ahit;
> @@ -839,6 +840,7 @@ struct ufs_hba {
>  	struct devfreq *devfreq;
>  	struct ufs_clk_scaling clk_scaling;
>  	bool is_sys_suspended;
> +	bool is_wl_sys_suspended;
>  
>  	enum bkops_status urgent_bkops_lvl;
>  	bool is_urgent_bkops_lvl_checked;
> 


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 5/9] scsi: ufs: Simplify error handling preparation
  2021-06-10  4:43 ` [PATCH v3 5/9] scsi: ufs: Simplify error handling preparation Can Guo
@ 2021-06-10 12:30   ` Adrian Hunter
  2021-06-11  3:01     ` Can Guo
  0 siblings, 1 reply; 43+ messages in thread
From: Adrian Hunter @ 2021-06-10 12:30 UTC (permalink / raw)
  To: Can Guo, asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team
  Cc: Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	open list

On 10/06/21 7:43 am, Can Guo wrote:
> Commit cb7e6f05fce67c965194ac04467e1ba7bc70b069 ("scsi: ufs: core: Enable
> power management for wlun") moves UFS operations out of ufshcd_resume(), so
> in error handling preparation, if ufshcd hba has failed to resume, there is
> no point to re-enable IRQ/clk/pwr.

I am not sure how cb7e6f05fce67c965194ac04467e1ba7bc70b069 made things any
different, but what I really wonder is why we don't just do recovery
directly in __ufshcd_wl_suspend() and  __ufshcd_wl_resume() and strip all
the PM complexity out of ufshcd_err_handling()?

> 
> Signed-off-by: Can Guo <cang@codeaurora.org>
> ---
>  drivers/scsi/ufs/ufshcd.c | 58 +++++++++++++++++++++++++----------------------
>  1 file changed, 31 insertions(+), 27 deletions(-)
> 
> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
> index 7dc0fda..0afad6b 100644
> --- a/drivers/scsi/ufs/ufshcd.c
> +++ b/drivers/scsi/ufs/ufshcd.c
> @@ -2727,8 +2727,8 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
>  		break;
>  	case UFSHCD_STATE_EH_SCHEDULED_FATAL:
>  		/*
> -		 * pm_runtime_get_sync() is used at error handling preparation
> -		 * stage. If a scsi cmd, e.g. the SSU cmd, is sent from hba's
> +		 * ufshcd_rpm_get_sync() is used at error handling preparation
> +		 * stage. If a scsi cmd, e.g., the SSU cmd, is sent from the
>  		 * PM ops, it can never be finished if we let SCSI layer keep
>  		 * retrying it, which gets err handler stuck forever. Neither
>  		 * can we let the scsi cmd pass through, because UFS is in bad
> @@ -5915,29 +5915,26 @@ static void ufshcd_clk_scaling_suspend(struct ufs_hba *hba, bool suspend)
>  	}
>  }
>  
> -static void ufshcd_err_handling_prepare(struct ufs_hba *hba)
> +static int ufshcd_err_handling_prepare(struct ufs_hba *hba)
>  {
> +	/*
> +	 * Exclusively call pm_runtime_get_sync(hba->dev) once, in case
> +	 * following ufshcd_rpm_get_sync() fails.
> +	 */
> +	pm_runtime_get_sync(hba->dev);
> +	/* End of the world. */
> +	if (pm_runtime_suspended(hba->dev)) {
> +		pm_runtime_put(hba->dev);
> +		return -EINVAL;
> +	}
> +
> +	ufshcd_set_eh_in_progress(hba);
>  	ufshcd_rpm_get_sync(hba);
> -	if (pm_runtime_status_suspended(&hba->sdev_ufs_device->sdev_gendev) ||
> +	if (pm_runtime_suspended(&hba->sdev_ufs_device->sdev_gendev) ||
>  	    hba->is_wl_sys_suspended) {
> -		enum ufs_pm_op pm_op;
> +		enum ufs_pm_op pm_op = hba->is_wl_sys_suspended ?
> +				       UFS_SYSTEM_PM : UFS_RUNTIME_PM;
>  
> -		/*
> -		 * Don't assume anything of resume, if
> -		 * resume fails, irq and clocks can be OFF, and powers
> -		 * can be OFF or in LPM.
> -		 */
> -		ufshcd_setup_hba_vreg(hba, true);
> -		ufshcd_setup_vreg(hba, true);
> -		ufshcd_config_vreg_hpm(hba, hba->vreg_info.vccq);
> -		ufshcd_config_vreg_hpm(hba, hba->vreg_info.vccq2);
> -		ufshcd_hold(hba, false);
> -		if (!ufshcd_is_clkgating_allowed(hba)) {
> -			ufshcd_setup_clocks(hba, true);
> -			ufshcd_enable_irq(hba);
> -		}
> -		ufshcd_release(hba);
> -		pm_op = hba->is_wl_sys_suspended ? UFS_SYSTEM_PM : UFS_RUNTIME_PM;
>  		ufshcd_vops_resume(hba, pm_op);
>  	} else {
>  		ufshcd_hold(hba, false);
> @@ -5951,22 +5948,25 @@ static void ufshcd_err_handling_prepare(struct ufs_hba *hba)
>  	down_write(&hba->clk_scaling_lock);
>  	up_write(&hba->clk_scaling_lock);
>  	cancel_work_sync(&hba->eeh_work);
> +	return 0;
>  }
>  
>  static void ufshcd_err_handling_unprepare(struct ufs_hba *hba)
>  {
> +	ufshcd_clear_eh_in_progress(hba);
>  	ufshcd_scsi_unblock_requests(hba);
>  	ufshcd_release(hba);
>  	if (ufshcd_is_clkscaling_supported(hba))
>  		ufshcd_clk_scaling_suspend(hba, false);
>  	ufshcd_clear_ua_wluns(hba);
>  	ufshcd_rpm_put(hba);
> +	pm_runtime_put(hba->dev);
>  }
>  
>  static inline bool ufshcd_err_handling_should_stop(struct ufs_hba *hba)
>  {
>  	return (!hba->is_powered || hba->shutting_down ||
> -		!hba->sdev_ufs_device ||
> +		!hba->sdev_ufs_device || hba->is_sys_suspended ||
>  		hba->ufshcd_state == UFSHCD_STATE_ERROR ||
>  		(!(hba->saved_err || hba->saved_uic_err || hba->force_reset ||
>  		   ufshcd_is_link_broken(hba))));
> @@ -6052,9 +6052,13 @@ static void ufshcd_err_handler(struct work_struct *work)
>  		up(&hba->host_sem);
>  		return;
>  	}
> -	ufshcd_set_eh_in_progress(hba);
>  	spin_unlock_irqrestore(hba->host->host_lock, flags);
> -	ufshcd_err_handling_prepare(hba);
> +	if (ufshcd_err_handling_prepare(hba)) {
> +		dev_err(hba->dev, "%s: error handling preparation failed\n",
> +				__func__);
> +		up(&hba->host_sem);
> +		return;
> +	}
>  	/* Complete requests that have door-bell cleared by h/w */
>  	ufshcd_complete_requests(hba);
>  	spin_lock_irqsave(hba->host->host_lock, flags);
> @@ -6198,7 +6202,6 @@ static void ufshcd_err_handler(struct work_struct *work)
>  			dev_err_ratelimited(hba->dev, "%s: exit: saved_err 0x%x saved_uic_err 0x%x",
>  			    __func__, hba->saved_err, hba->saved_uic_err);
>  	}
> -	ufshcd_clear_eh_in_progress(hba);
>  	spin_unlock_irqrestore(hba->host->host_lock, flags);
>  	ufshcd_err_handling_unprepare(hba);
>  	up(&hba->host_sem);
> @@ -8999,6 +9002,9 @@ static int __ufshcd_wl_resume(struct ufs_hba *hba, enum ufs_pm_op pm_op)
>  
>  	/* Enable Auto-Hibernate if configured */
>  	ufshcd_auto_hibern8_enable(hba);
> +
> +	hba->clk_gating.is_suspended = false;
> +	ufshcd_release(hba);
>  	goto out;
>  
>  set_old_link_state:
> @@ -9008,8 +9014,6 @@ static int __ufshcd_wl_resume(struct ufs_hba *hba, enum ufs_pm_op pm_op)
>  out:
>  	if (ret)
>  		ufshcd_update_evt_hist(hba, UFS_EVT_WL_RES_ERR, (u32)ret);
> -	hba->clk_gating.is_suspended = false;
> -	ufshcd_release(hba);
>  	hba->wl_pm_op_in_progress = false;
>  	return ret <= 0 ? ret : -EINVAL;
>  }
> 


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 7/9] scsi: ufs: Let host_sem cover the entire system suspend/resume
  2021-06-10  4:43 ` [PATCH v3 7/9] scsi: ufs: Let host_sem cover the entire system suspend/resume Can Guo
@ 2021-06-10 13:32   ` Adrian Hunter
  2021-06-11  3:06     ` Can Guo
  2021-06-11 21:00   ` Bart Van Assche
  1 sibling, 1 reply; 43+ messages in thread
From: Adrian Hunter @ 2021-06-10 13:32 UTC (permalink / raw)
  To: Can Guo, asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team
  Cc: Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	Kiwoong Kim, Satya Tangirala, Bart Van Assche, open list

On 10/06/21 7:43 am, Can Guo wrote:
> UFS error handling now is doing more than just re-probing, but also sending
> scsi cmds, e.g., for clearing UACs, and recovering runtime PM error, which
> may change runtime status of scsi devices. To protect system suspend/resume
> from being disturbed by error handling, move the host_sem from wl pm ops
> to ufshcd_suspend_prepare() and ufshcd_resume_complete().

Have you checked whether error handling might actually be needed after
ufshcd_suspend_prepare()?

Wouldn't this complexity go away if we just did recovery
directly in __ufshcd_wl_suspend() and  __ufshcd_wl_resume()?

> 
> Signed-off-by: Can Guo <cang@codeaurora.org>
> ---
>  drivers/scsi/ufs/ufshcd.c | 8 +++-----
>  drivers/scsi/ufs/ufshcd.h | 2 +-
>  2 files changed, 4 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
> index c418a19..861942b 100644
> --- a/drivers/scsi/ufs/ufshcd.c
> +++ b/drivers/scsi/ufs/ufshcd.c
> @@ -9060,16 +9060,13 @@ static int ufshcd_wl_suspend(struct device *dev)
>  	ktime_t start = ktime_get();
>  
>  	hba = shost_priv(sdev->host);
> -	down(&hba->host_sem);
>  
>  	if (pm_runtime_suspended(dev))
>  		goto out;
>  
>  	ret = __ufshcd_wl_suspend(hba, UFS_SYSTEM_PM);
> -	if (ret) {
> +	if (ret)
>  		dev_err(&sdev->sdev_gendev, "%s failed: %d\n", __func__,  ret);
> -		up(&hba->host_sem);
> -	}
>  
>  out:
>  	if (!ret)
> @@ -9102,7 +9099,6 @@ static int ufshcd_wl_resume(struct device *dev)
>  		hba->curr_dev_pwr_mode, hba->uic_link_state);
>  	if (!ret)
>  		hba->is_wl_sys_suspended = false;
> -	up(&hba->host_sem);
>  	return ret;
>  }
>  #endif
> @@ -9665,6 +9661,7 @@ void ufshcd_resume_complete(struct device *dev)
>  		ufshcd_rpmb_rpm_put(hba);
>  		hba->rpmb_complete_put = false;
>  	}
> +	up(&hba->host_sem);
>  }
>  EXPORT_SYMBOL_GPL(ufshcd_resume_complete);
>  
> @@ -9691,6 +9688,7 @@ int ufshcd_suspend_prepare(struct device *dev)
>  		ufshcd_rpmb_rpm_get_sync(hba);
>  		hba->rpmb_complete_put = true;
>  	}
> +	down(&hba->host_sem);
>  	return 0;
>  }
>  EXPORT_SYMBOL_GPL(ufshcd_suspend_prepare);
> diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
> index eaebb4e..47da47c 100644
> --- a/drivers/scsi/ufs/ufshcd.h
> +++ b/drivers/scsi/ufs/ufshcd.h
> @@ -693,7 +693,7 @@ struct ufs_hba_monitor {
>   * @ee_ctrl_mask: Exception event control mask
>   * @is_powered: flag to check if HBA is powered
>   * @shutting_down: flag to check if shutdown has been invoked
> - * @host_sem: semaphore used to serialize concurrent contexts
> + * @host_sem: semaphore used to avoid concurrency of contexts
>   * @eh_wq: Workqueue that eh_work works on
>   * @eh_work: Worker to handle UFS errors that require s/w attention
>   * @eeh_work: Worker to handle exception events
> 


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 1/9] scsi: ufs: Differentiate status between hba pm ops and wl pm ops
  2021-06-10 11:15   ` Adrian Hunter
@ 2021-06-11  0:53     ` Can Guo
  0 siblings, 0 replies; 43+ messages in thread
From: Can Guo @ 2021-06-11  0:53 UTC (permalink / raw)
  To: Adrian Hunter
  Cc: asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team,
	Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	Kiwoong Kim, Satya Tangirala, Bart Van Assche, open list

Hi Adrian,

On 2021-06-10 19:15, Adrian Hunter wrote:
> On 10/06/21 7:43 am, Can Guo wrote:
>> Put pm_op_in_progress and is_sys_suspend flags back to ufshcd hba pm 
>> ops,
>> add two new flags, namely wl_pm_op_in_progress and 
>> is_wl_sys_suspended, to
>> track the UFS device W-LU pm ops. This helps us differentiate the 
>> status of
>> hba and wl pm ops when we need to do troubleshooting.
> 
> Really you have 2 changes here:
> 1. Renaming to pm_op_in_progress / is_sys_suspend to
> wl_pm_op_in_progress / is_wl_sys_suspended
> 2. Introducing flags for the status of hba
> 
> So it should really be 2 patches.

Sure I will make it 2 in next version.

> 
> That would show up things like:
> - did you intend not to change hba->is_sys_suspended in 
> ufs_qcom_resume() ?

I missed it - shall change it in next version.

Thanks,
Can Guo.

> 
>> 
>> Signed-off-by: Can Guo <cang@codeaurora.org>
>> ---
>>  drivers/scsi/ufs/ufshcd.c | 42 
>> ++++++++++++++++++++++++++++--------------
>>  drivers/scsi/ufs/ufshcd.h |  4 +++-
>>  2 files changed, 31 insertions(+), 15 deletions(-)
>> 
>> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
>> index 25fe18a..47b2a9a 100644
>> --- a/drivers/scsi/ufs/ufshcd.c
>> +++ b/drivers/scsi/ufs/ufshcd.c
>> @@ -549,7 +549,9 @@ static void ufshcd_print_host_state(struct ufs_hba 
>> *hba)
>>  		hba->saved_err, hba->saved_uic_err);
>>  	dev_err(hba->dev, "Device power mode=%d, UIC link state=%d\n",
>>  		hba->curr_dev_pwr_mode, hba->uic_link_state);
>> -	dev_err(hba->dev, "PM in progress=%d, sys. suspended=%d\n",
>> +	dev_err(hba->dev, "wl_pm_op_in_progress=%d, 
>> is_wl_sys_suspended=%d\n",
>> +		hba->wl_pm_op_in_progress, hba->is_wl_sys_suspended);
>> +	dev_err(hba->dev, "pm_op_in_progress=%d, is_sys_suspended=%d\n",
>>  		hba->pm_op_in_progress, hba->is_sys_suspended);
>>  	dev_err(hba->dev, "Auto BKOPS=%d, Host self-block=%d\n",
>>  		hba->auto_bkops_enabled, hba->host->host_self_blocked);
>> @@ -1999,7 +2001,7 @@ static void ufshcd_clk_scaling_start_busy(struct 
>> ufs_hba *hba)
>>  	if (!hba->clk_scaling.active_reqs++)
>>  		queue_resume_work = true;
>> 
>> -	if (!hba->clk_scaling.is_enabled || hba->pm_op_in_progress) {
>> +	if (!hba->clk_scaling.is_enabled || hba->wl_pm_op_in_progress) {
>>  		spin_unlock_irqrestore(hba->host->host_lock, flags);
>>  		return;
>>  	}
>> @@ -2734,7 +2736,7 @@ static int ufshcd_queuecommand(struct Scsi_Host 
>> *host, struct scsi_cmnd *cmd)
>>  		 * err handler blocked for too long. So, just fail the scsi cmd
>>  		 * sent from PM ops, err handler can recover PM error anyways.
>>  		 */
>> -		if (hba->pm_op_in_progress) {
>> +		if (hba->wl_pm_op_in_progress) {
>>  			hba->force_reset = true;
>>  			set_host_byte(cmd, DID_BAD_TARGET);
>>  			cmd->scsi_done(cmd);
>> @@ -2767,7 +2769,7 @@ static int ufshcd_queuecommand(struct Scsi_Host 
>> *host, struct scsi_cmnd *cmd)
>>  		(hba->clk_gating.state != CLKS_ON));
>> 
>>  	if (unlikely(test_bit(tag, &hba->outstanding_reqs))) {
>> -		if (hba->pm_op_in_progress)
>> +		if (hba->wl_pm_op_in_progress)
>>  			set_host_byte(cmd, DID_BAD_TARGET);
>>  		else
>>  			err = SCSI_MLQUEUE_HOST_BUSY;
>> @@ -5116,7 +5118,7 @@ ufshcd_transfer_rsp_status(struct ufs_hba *hba, 
>> struct ufshcd_lrb *lrbp)
>>  			 * solution could be to abort the system suspend if
>>  			 * UFS device needs urgent BKOPs.
>>  			 */
>> -			if (!hba->pm_op_in_progress &&
>> +			if (!hba->wl_pm_op_in_progress &&
>>  			    !ufshcd_eh_in_progress(hba) &&
>>  			    ufshcd_is_exception_event(lrbp->ucd_rsp_ptr))
>>  				/* Flushed in suspend */
>> @@ -5916,7 +5918,7 @@ static void ufshcd_err_handling_prepare(struct 
>> ufs_hba *hba)
>>  {
>>  	ufshcd_rpm_get_sync(hba);
>>  	if (pm_runtime_status_suspended(&hba->sdev_ufs_device->sdev_gendev) 
>> ||
>> -	    hba->is_sys_suspended) {
>> +	    hba->is_wl_sys_suspended) {
>>  		enum ufs_pm_op pm_op;
>> 
>>  		/*
>> @@ -5933,7 +5935,7 @@ static void ufshcd_err_handling_prepare(struct 
>> ufs_hba *hba)
>>  		if (!ufshcd_is_clkgating_allowed(hba))
>>  			ufshcd_setup_clocks(hba, true);
>>  		ufshcd_release(hba);
>> -		pm_op = hba->is_sys_suspended ? UFS_SYSTEM_PM : UFS_RUNTIME_PM;
>> +		pm_op = hba->is_wl_sys_suspended ? UFS_SYSTEM_PM : UFS_RUNTIME_PM;
>>  		ufshcd_vops_resume(hba, pm_op);
>>  	} else {
>>  		ufshcd_hold(hba, false);
>> @@ -5976,7 +5978,7 @@ static void ufshcd_recover_pm_error(struct 
>> ufs_hba *hba)
>>  	struct request_queue *q;
>>  	int ret;
>> 
>> -	hba->is_sys_suspended = false;
>> +	hba->is_wl_sys_suspended = false;
>>  	/*
>>  	 * Set RPM status of wlun device to RPM_ACTIVE,
>>  	 * this also clears its runtime error.
>> @@ -8784,7 +8786,7 @@ static int __ufshcd_wl_suspend(struct ufs_hba 
>> *hba, enum ufs_pm_op pm_op)
>>  	enum ufs_dev_pwr_mode req_dev_pwr_mode;
>>  	enum uic_link_state req_link_state;
>> 
>> -	hba->pm_op_in_progress = true;
>> +	hba->wl_pm_op_in_progress = true;
>>  	if (pm_op != UFS_SHUTDOWN_PM) {
>>  		pm_lvl = pm_op == UFS_RUNTIME_PM ?
>>  			 hba->rpm_lvl : hba->spm_lvl;
>> @@ -8919,7 +8921,7 @@ static int __ufshcd_wl_suspend(struct ufs_hba 
>> *hba, enum ufs_pm_op pm_op)
>>  		hba->clk_gating.is_suspended = false;
>>  		ufshcd_release(hba);
>>  	}
>> -	hba->pm_op_in_progress = false;
>> +	hba->wl_pm_op_in_progress = false;
>>  	return ret;
>>  }
>> 
>> @@ -8928,7 +8930,7 @@ static int __ufshcd_wl_resume(struct ufs_hba 
>> *hba, enum ufs_pm_op pm_op)
>>  	int ret;
>>  	enum uic_link_state old_link_state = hba->uic_link_state;
>> 
>> -	hba->pm_op_in_progress = true;
>> +	hba->wl_pm_op_in_progress = true;
>> 
>>  	/*
>>  	 * Call vendor specific resume callback. As these callbacks may 
>> access
>> @@ -9006,7 +9008,7 @@ static int __ufshcd_wl_resume(struct ufs_hba 
>> *hba, enum ufs_pm_op pm_op)
>>  		ufshcd_update_evt_hist(hba, UFS_EVT_WL_RES_ERR, (u32)ret);
>>  	hba->clk_gating.is_suspended = false;
>>  	ufshcd_release(hba);
>> -	hba->pm_op_in_progress = false;
>> +	hba->wl_pm_op_in_progress = false;
>>  	return ret;
>>  }
>> 
>> @@ -9072,7 +9074,7 @@ static int ufshcd_wl_suspend(struct device *dev)
>> 
>>  out:
>>  	if (!ret)
>> -		hba->is_sys_suspended = true;
>> +		hba->is_wl_sys_suspended = true;
>>  	trace_ufshcd_wl_suspend(dev_name(dev), ret,
>>  		ktime_to_us(ktime_sub(ktime_get(), start)),
>>  		hba->curr_dev_pwr_mode, hba->uic_link_state);
>> @@ -9100,7 +9102,7 @@ static int ufshcd_wl_resume(struct device *dev)
>>  		ktime_to_us(ktime_sub(ktime_get(), start)),
>>  		hba->curr_dev_pwr_mode, hba->uic_link_state);
>>  	if (!ret)
>> -		hba->is_sys_suspended = false;
>> +		hba->is_wl_sys_suspended = false;
>>  	up(&hba->host_sem);
>>  	return ret;
>>  }
>> @@ -9141,6 +9143,8 @@ static int ufshcd_suspend(struct ufs_hba *hba)
>> 
>>  	if (!hba->is_powered)
>>  		return 0;
>> +
>> +	hba->pm_op_in_progress = true;
>>  	/*
>>  	 * Disable the host irq as host controller as there won't be any
>>  	 * host controller transaction expected till resume.
>> @@ -9160,6 +9164,7 @@ static int ufshcd_suspend(struct ufs_hba *hba)
>>  	ufshcd_vreg_set_lpm(hba);
>>  	/* Put the host controller in low power mode if possible */
>>  	ufshcd_hba_vreg_set_lpm(hba);
>> +	hba->pm_op_in_progress = false;
>>  	return ret;
>>  }
>> 
>> @@ -9179,6 +9184,7 @@ static int ufshcd_resume(struct ufs_hba *hba)
>>  	if (!hba->is_powered)
>>  		return 0;
>> 
>> +	hba->pm_op_in_progress = true;
>>  	ufshcd_hba_vreg_set_hpm(hba);
>>  	ret = ufshcd_vreg_set_hpm(hba);
>>  	if (ret)
>> @@ -9198,6 +9204,7 @@ static int ufshcd_resume(struct ufs_hba *hba)
>>  out:
>>  	if (ret)
>>  		ufshcd_update_evt_hist(hba, UFS_EVT_RESUME_ERR, (u32)ret);
>> +	hba->pm_op_in_progress = false;
>>  	return ret;
>>  }
>> 
>> @@ -9222,6 +9229,10 @@ int ufshcd_system_suspend(struct ufs_hba *hba)
>>  	trace_ufshcd_system_suspend(dev_name(hba->dev), ret,
>>  		ktime_to_us(ktime_sub(ktime_get(), start)),
>>  		hba->curr_dev_pwr_mode, hba->uic_link_state);
>> +
>> +	if (!ret)
>> +		hba->is_sys_suspended = true;
>> +
>>  	return ret;
>>  }
>>  EXPORT_SYMBOL(ufshcd_system_suspend);
>> @@ -9248,6 +9259,9 @@ int ufshcd_system_resume(struct ufs_hba *hba)
>>  		ktime_to_us(ktime_sub(ktime_get(), start)),
>>  		hba->curr_dev_pwr_mode, hba->uic_link_state);
>> 
>> +	if (!ret)
>> +		hba->is_sys_suspended = false;
>> +
>>  	return ret;
>>  }
>>  EXPORT_SYMBOL(ufshcd_system_resume);
>> diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
>> index c98d540..eaebb4e 100644
>> --- a/drivers/scsi/ufs/ufshcd.h
>> +++ b/drivers/scsi/ufs/ufshcd.h
>> @@ -752,7 +752,8 @@ struct ufs_hba {
>>  	enum ufs_pm_level spm_lvl;
>>  	struct device_attribute rpm_lvl_attr;
>>  	struct device_attribute spm_lvl_attr;
>> -	int pm_op_in_progress;
>> +	bool pm_op_in_progress;
>> +	bool wl_pm_op_in_progress;
>> 
>>  	/* Auto-Hibernate Idle Timer register value */
>>  	u32 ahit;
>> @@ -839,6 +840,7 @@ struct ufs_hba {
>>  	struct devfreq *devfreq;
>>  	struct ufs_clk_scaling clk_scaling;
>>  	bool is_sys_suspended;
>> +	bool is_wl_sys_suspended;
>> 
>>  	enum bkops_status urgent_bkops_lvl;
>>  	bool is_urgent_bkops_lvl_checked;
>> 

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 5/9] scsi: ufs: Simplify error handling preparation
  2021-06-10 12:30   ` Adrian Hunter
@ 2021-06-11  3:01     ` Can Guo
  2021-06-11 20:58       ` Bart Van Assche
  0 siblings, 1 reply; 43+ messages in thread
From: Can Guo @ 2021-06-11  3:01 UTC (permalink / raw)
  To: Adrian Hunter
  Cc: asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team,
	Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	open list

Hi Adrian,

On 2021-06-10 20:30, Adrian Hunter wrote:
> On 10/06/21 7:43 am, Can Guo wrote:
>> Commit cb7e6f05fce67c965194ac04467e1ba7bc70b069 ("scsi: ufs: core: 
>> Enable
>> power management for wlun") moves UFS operations out of 
>> ufshcd_resume(), so
>> in error handling preparation, if ufshcd hba has failed to resume, 
>> there is
>> no point to re-enable IRQ/clk/pwr.
> 
> I am not sure how cb7e6f05fce67c965194ac04467e1ba7bc70b069 made things 
> any
> different,

Previously, without commit cb7e6f05fce67c965194ac04467e1ba7bc70b069,
ufshcd_resume() could turn off pwr and clk due to a UFS error, e.g., a link
transition failure or an SSU error/abort (and these UFS errors would invoke
error handling). When error handling kicked in, it had to re-enable pwr and
clk before proceeding. Now, commit cb7e6f05fce67c965194ac04467e1ba7bc70b069
makes ufshcd_resume() purely control pwr and clk, meaning that if
ufshcd_resume() fails, there is nothing we can do about it - pwr or clk
enabling must have failed, and not because of a UFS error. This is why I am
removing the re-enabling of pwr/clk in error handling prepare.

> but what I really wonder is why we don't just do recovery
> directly in __ufshcd_wl_suspend() and  __ufshcd_wl_resume() and strip 
> all
> the PM complexity out of ufshcd_err_handling()?
> 

This is a good question, and I've been struggling with this idea ever since
I started to fix error handling.

Just so you know, there are both runtime and system suspend/resume. Error
handling has the same nature as user access - it is unpredictable, meaning
it can be invoked at any time (from the IRQ handler), even when there are no
ongoing cmd/data transactions (e.g., auto hibern8 failures and UIC errors,
such as DME errors and some errors in the data link layer) [1], unless you
disable the UFS IRQ.

For runtime suspend/resume, it is fine, since we call
pm_runtime_get/put_sync() in error handling - error handling won't run in
parallel with runtime suspend/resume.

For system suspend/resume, since error handling has the same nature as user
access, we are using host_sem to avoid concurrency between error handling
and system suspend/resume.

Back to your question - can we just do recovery directly in 
__ufshcd_wl_suspend()
and __ufshcd_wl_resume()? Yes, we can.

However, the reasons why I chose not to do it that way are as follows
(although error handler prepare has become much simpler after applying
this change):

1. I want to keep all the complexity within the error handler, and redirect
all error recovery needs to the error handler. This avoids calling
ufshcd_reset_and_restore() and/or flush_work(&hba->eh_work) here and there.
The entire UFS suspend/resume is already complex enough; I don't want to
mess with it.

2. We do explicit recovery only when we see certain errors, e.g., H8 
enter func
returns an error during suspend, but as mentioned above [1], error 
handling can
be invoked already from IRQ handler (due to all kinds of UIC errors 
before H8 enter
func returns). So, we still need host_sem (in case of system 
suspend/resume) to
avoid concurrency.

3. During system suspend/resume, error handling can be invoked (due to
non-fatal errors) even though UFS cmds return no error at all. As above, we
need host_sem to avoid concurrency.

There are more reasons why I chose this way, but it is really this way 
or others.
I am glad to see someone cares about error handling and can make it 
better and
more robust, no matter what that way is. :)

Thanks,
Can Guo.

>> 
>> Signed-off-by: Can Guo <cang@codeaurora.org>
>> ---
>>  drivers/scsi/ufs/ufshcd.c | 58 
>> +++++++++++++++++++++++++----------------------
>>  1 file changed, 31 insertions(+), 27 deletions(-)
>> 
>> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
>> index 7dc0fda..0afad6b 100644
>> --- a/drivers/scsi/ufs/ufshcd.c
>> +++ b/drivers/scsi/ufs/ufshcd.c
>> @@ -2727,8 +2727,8 @@ static int ufshcd_queuecommand(struct Scsi_Host 
>> *host, struct scsi_cmnd *cmd)
>>  		break;
>>  	case UFSHCD_STATE_EH_SCHEDULED_FATAL:
>>  		/*
>> -		 * pm_runtime_get_sync() is used at error handling preparation
>> -		 * stage. If a scsi cmd, e.g. the SSU cmd, is sent from hba's
>> +		 * ufshcd_rpm_get_sync() is used at error handling preparation
>> +		 * stage. If a scsi cmd, e.g., the SSU cmd, is sent from the
>>  		 * PM ops, it can never be finished if we let SCSI layer keep
>>  		 * retrying it, which gets err handler stuck forever. Neither
>>  		 * can we let the scsi cmd pass through, because UFS is in bad
>> @@ -5915,29 +5915,26 @@ static void ufshcd_clk_scaling_suspend(struct 
>> ufs_hba *hba, bool suspend)
>>  	}
>>  }
>> 
>> -static void ufshcd_err_handling_prepare(struct ufs_hba *hba)
>> +static int ufshcd_err_handling_prepare(struct ufs_hba *hba)
>>  {
>> +	/*
>> +	 * Exclusively call pm_runtime_get_sync(hba->dev) once, in case
>> +	 * following ufshcd_rpm_get_sync() fails.
>> +	 */
>> +	pm_runtime_get_sync(hba->dev);
>> +	/* End of the world. */
>> +	if (pm_runtime_suspended(hba->dev)) {
>> +		pm_runtime_put(hba->dev);
>> +		return -EINVAL;
>> +	}
>> +
>> +	ufshcd_set_eh_in_progress(hba);
>>  	ufshcd_rpm_get_sync(hba);
>> -	if (pm_runtime_status_suspended(&hba->sdev_ufs_device->sdev_gendev) 
>> ||
>> +	if (pm_runtime_suspended(&hba->sdev_ufs_device->sdev_gendev) ||
>>  	    hba->is_wl_sys_suspended) {
>> -		enum ufs_pm_op pm_op;
>> +		enum ufs_pm_op pm_op = hba->is_wl_sys_suspended ?
>> +				       UFS_SYSTEM_PM : UFS_RUNTIME_PM;
>> 
>> -		/*
>> -		 * Don't assume anything of resume, if
>> -		 * resume fails, irq and clocks can be OFF, and powers
>> -		 * can be OFF or in LPM.
>> -		 */
>> -		ufshcd_setup_hba_vreg(hba, true);
>> -		ufshcd_setup_vreg(hba, true);
>> -		ufshcd_config_vreg_hpm(hba, hba->vreg_info.vccq);
>> -		ufshcd_config_vreg_hpm(hba, hba->vreg_info.vccq2);
>> -		ufshcd_hold(hba, false);
>> -		if (!ufshcd_is_clkgating_allowed(hba)) {
>> -			ufshcd_setup_clocks(hba, true);
>> -			ufshcd_enable_irq(hba);
>> -		}
>> -		ufshcd_release(hba);
>> -		pm_op = hba->is_wl_sys_suspended ? UFS_SYSTEM_PM : UFS_RUNTIME_PM;
>>  		ufshcd_vops_resume(hba, pm_op);
>>  	} else {
>>  		ufshcd_hold(hba, false);
>> @@ -5951,22 +5948,25 @@ static void ufshcd_err_handling_prepare(struct 
>> ufs_hba *hba)
>>  	down_write(&hba->clk_scaling_lock);
>>  	up_write(&hba->clk_scaling_lock);
>>  	cancel_work_sync(&hba->eeh_work);
>> +	return 0;
>>  }
>> 
>>  static void ufshcd_err_handling_unprepare(struct ufs_hba *hba)
>>  {
>> +	ufshcd_clear_eh_in_progress(hba);
>>  	ufshcd_scsi_unblock_requests(hba);
>>  	ufshcd_release(hba);
>>  	if (ufshcd_is_clkscaling_supported(hba))
>>  		ufshcd_clk_scaling_suspend(hba, false);
>>  	ufshcd_clear_ua_wluns(hba);
>>  	ufshcd_rpm_put(hba);
>> +	pm_runtime_put(hba->dev);
>>  }
>> 
>>  static inline bool ufshcd_err_handling_should_stop(struct ufs_hba 
>> *hba)
>>  {
>>  	return (!hba->is_powered || hba->shutting_down ||
>> -		!hba->sdev_ufs_device ||
>> +		!hba->sdev_ufs_device || hba->is_sys_suspended ||
>>  		hba->ufshcd_state == UFSHCD_STATE_ERROR ||
>>  		(!(hba->saved_err || hba->saved_uic_err || hba->force_reset ||
>>  		   ufshcd_is_link_broken(hba))));
>> @@ -6052,9 +6052,13 @@ static void ufshcd_err_handler(struct 
>> work_struct *work)
>>  		up(&hba->host_sem);
>>  		return;
>>  	}
>> -	ufshcd_set_eh_in_progress(hba);
>>  	spin_unlock_irqrestore(hba->host->host_lock, flags);
>> -	ufshcd_err_handling_prepare(hba);
>> +	if (ufshcd_err_handling_prepare(hba)) {
>> +		dev_err(hba->dev, "%s: error handling preparation failed\n",
>> +				__func__);
>> +		up(&hba->host_sem);
>> +		return;
>> +	}
>>  	/* Complete requests that have door-bell cleared by h/w */
>>  	ufshcd_complete_requests(hba);
>>  	spin_lock_irqsave(hba->host->host_lock, flags);
>> @@ -6198,7 +6202,6 @@ static void ufshcd_err_handler(struct 
>> work_struct *work)
>>  			dev_err_ratelimited(hba->dev, "%s: exit: saved_err 0x%x 
>> saved_uic_err 0x%x",
>>  			    __func__, hba->saved_err, hba->saved_uic_err);
>>  	}
>> -	ufshcd_clear_eh_in_progress(hba);
>>  	spin_unlock_irqrestore(hba->host->host_lock, flags);
>>  	ufshcd_err_handling_unprepare(hba);
>>  	up(&hba->host_sem);
>> @@ -8999,6 +9002,9 @@ static int __ufshcd_wl_resume(struct ufs_hba 
>> *hba, enum ufs_pm_op pm_op)
>> 
>>  	/* Enable Auto-Hibernate if configured */
>>  	ufshcd_auto_hibern8_enable(hba);
>> +
>> +	hba->clk_gating.is_suspended = false;
>> +	ufshcd_release(hba);
>>  	goto out;
>> 
>>  set_old_link_state:
>> @@ -9008,8 +9014,6 @@ static int __ufshcd_wl_resume(struct ufs_hba 
>> *hba, enum ufs_pm_op pm_op)
>>  out:
>>  	if (ret)
>>  		ufshcd_update_evt_hist(hba, UFS_EVT_WL_RES_ERR, (u32)ret);
>> -	hba->clk_gating.is_suspended = false;
>> -	ufshcd_release(hba);
>>  	hba->wl_pm_op_in_progress = false;
>>  	return ret <= 0 ? ret : -EINVAL;
>>  }
>> 

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 7/9] scsi: ufs: Let host_sem cover the entire system suspend/resume
  2021-06-10 13:32   ` Adrian Hunter
@ 2021-06-11  3:06     ` Can Guo
  0 siblings, 0 replies; 43+ messages in thread
From: Can Guo @ 2021-06-11  3:06 UTC (permalink / raw)
  To: Adrian Hunter
  Cc: asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team,
	Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	Kiwoong Kim, Satya Tangirala, Bart Van Assche, open list

Hi Adrian,

On 2021-06-10 21:32, Adrian Hunter wrote:
> On 10/06/21 7:43 am, Can Guo wrote:
>> UFS error handling now is doing more than just re-probing, but also 
>> sending
>> scsi cmds, e.g., for clearing UACs, and recovering runtime PM error, 
>> which
>> may change runtime status of scsi devices. To protect system 
>> suspend/resume
>> from being disturbed by error handling, move the host_sem from wl pm 
>> ops
>> to ufshcd_suspend_prepare() and ufshcd_resume_complete().
> 
> Have you checked whether error handling might actually be needed after
> ufshcd_suspend_prepare()?

I intend to make it this (simple) way - if error handling is invoked
during system suspend/resume, it should just wait until system resume
is finished. Suspend/resume does not count on error handling; if
suspend/resume runs into errors, it just fails and bails.

> 
> Wouldn't this complexity go away if we just did recovery
> directly in __ufshcd_wl_suspend() and  __ufshcd_wl_resume()?
> 

Please kindly check my reply in patch #5.

Thanks,
Can Guo.

>> 
>> Signed-off-by: Can Guo <cang@codeaurora.org>
>> ---
>>  drivers/scsi/ufs/ufshcd.c | 8 +++-----
>>  drivers/scsi/ufs/ufshcd.h | 2 +-
>>  2 files changed, 4 insertions(+), 6 deletions(-)
>> 
>> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
>> index c418a19..861942b 100644
>> --- a/drivers/scsi/ufs/ufshcd.c
>> +++ b/drivers/scsi/ufs/ufshcd.c
>> @@ -9060,16 +9060,13 @@ static int ufshcd_wl_suspend(struct device 
>> *dev)
>>  	ktime_t start = ktime_get();
>> 
>>  	hba = shost_priv(sdev->host);
>> -	down(&hba->host_sem);
>> 
>>  	if (pm_runtime_suspended(dev))
>>  		goto out;
>> 
>>  	ret = __ufshcd_wl_suspend(hba, UFS_SYSTEM_PM);
>> -	if (ret) {
>> +	if (ret)
>>  		dev_err(&sdev->sdev_gendev, "%s failed: %d\n", __func__,  ret);
>> -		up(&hba->host_sem);
>> -	}
>> 
>>  out:
>>  	if (!ret)
>> @@ -9102,7 +9099,6 @@ static int ufshcd_wl_resume(struct device *dev)
>>  		hba->curr_dev_pwr_mode, hba->uic_link_state);
>>  	if (!ret)
>>  		hba->is_wl_sys_suspended = false;
>> -	up(&hba->host_sem);
>>  	return ret;
>>  }
>>  #endif
>> @@ -9665,6 +9661,7 @@ void ufshcd_resume_complete(struct device *dev)
>>  		ufshcd_rpmb_rpm_put(hba);
>>  		hba->rpmb_complete_put = false;
>>  	}
>> +	up(&hba->host_sem);
>>  }
>>  EXPORT_SYMBOL_GPL(ufshcd_resume_complete);
>> 
>> @@ -9691,6 +9688,7 @@ int ufshcd_suspend_prepare(struct device *dev)
>>  		ufshcd_rpmb_rpm_get_sync(hba);
>>  		hba->rpmb_complete_put = true;
>>  	}
>> +	down(&hba->host_sem);
>>  	return 0;
>>  }
>>  EXPORT_SYMBOL_GPL(ufshcd_suspend_prepare);
>> diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
>> index eaebb4e..47da47c 100644
>> --- a/drivers/scsi/ufs/ufshcd.h
>> +++ b/drivers/scsi/ufs/ufshcd.h
>> @@ -693,7 +693,7 @@ struct ufs_hba_monitor {
>>   * @ee_ctrl_mask: Exception event control mask
>>   * @is_powered: flag to check if HBA is powered
>>   * @shutting_down: flag to check if shutdown has been invoked
>> - * @host_sem: semaphore used to serialize concurrent contexts
>> + * @host_sem: semaphore used to avoid concurrency of contexts
>>   * @eh_wq: Workqueue that eh_work works on
>>   * @eh_work: Worker to handle UFS errors that require s/w attention
>>   * @eeh_work: Worker to handle exception events
>> 

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 1/9] scsi: ufs: Differentiate status between hba pm ops and wl pm ops
  2021-06-10  4:43 ` [PATCH v3 1/9] scsi: ufs: Differentiate status between hba pm ops and wl pm ops Can Guo
  2021-06-10 11:15   ` Adrian Hunter
@ 2021-06-11 20:40   ` Bart Van Assche
  2021-06-12  6:20     ` Can Guo
  2021-06-16 17:50   ` Bart Van Assche
  2 siblings, 1 reply; 43+ messages in thread
From: Bart Van Assche @ 2021-06-11 20:40 UTC (permalink / raw)
  To: Can Guo, asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team
  Cc: Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	Adrian Hunter, Kiwoong Kim, Satya Tangirala, open list

On 6/9/21 9:43 PM, Can Guo wrote:
> Put pm_op_in_progress and is_sys_suspend flags back to ufshcd hba pm ops,
> add two new flags, namely wl_pm_op_in_progress and is_wl_sys_suspended, to
> track the UFS device W-LU pm ops. This helps us differentiate the status of
> hba and wl pm ops when we need to do troubleshooting.

Since "WL" is an uncommon abbreviation, please add a comment above the
definition of struct ufs_hba that explains the meaning of the new member
variables.

Thanks,

Bart.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 4/9] scsi: ufs: Complete the cmd before returning in queuecommand
  2021-06-10  4:43 ` [PATCH v3 4/9] scsi: ufs: Complete the cmd before returning in queuecommand Can Guo
@ 2021-06-11 20:52   ` Bart Van Assche
  2021-06-12  7:38     ` Can Guo
  0 siblings, 1 reply; 43+ messages in thread
From: Bart Van Assche @ 2021-06-11 20:52 UTC (permalink / raw)
  To: Can Guo, asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team
  Cc: Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	open list

On 6/9/21 9:43 PM, Can Guo wrote:
> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
> index 0c9d2ee..7dc0fda 100644
> --- a/drivers/scsi/ufs/ufshcd.c
> +++ b/drivers/scsi/ufs/ufshcd.c
> @@ -2758,6 +2758,16 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
>  		goto out;
>  	}
>  
> +	if (unlikely(test_bit(tag, &hba->outstanding_reqs))) {
> +		if (hba->wl_pm_op_in_progress) {
> +			set_host_byte(cmd, DID_BAD_TARGET);
> +			cmd->scsi_done(cmd);
> +		} else {
> +			err = SCSI_MLQUEUE_HOST_BUSY;
> +		}
> +		goto out;
> +	}
> +
>  	hba->req_abort_count = 0;
>  
>  	err = ufshcd_hold(hba, true);
> @@ -2768,15 +2778,6 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
>  	WARN_ON(ufshcd_is_clkgating_allowed(hba) &&
>  		(hba->clk_gating.state != CLKS_ON));
>  
> -	if (unlikely(test_bit(tag, &hba->outstanding_reqs))) {
> -		if (hba->wl_pm_op_in_progress)
> -			set_host_byte(cmd, DID_BAD_TARGET);
> -		else
> -			err = SCSI_MLQUEUE_HOST_BUSY;
> -		ufshcd_release(hba);
> -		goto out;
> -	}
> -
>  	lrbp = &hba->lrb[tag];
>  	WARN_ON(lrbp->cmd);
>  	lrbp->cmd = cmd;

Can the code under "if (unlikely(test_bit(tag,
&hba->outstanding_reqs)))" be deleted instead of moving it? I don't
think that it is useful to verify whether the block layer tag allocator
works correctly. Additionally, I'm not aware of any similar code in any
other SCSI LLD.

Thanks,

Bart.



^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 5/9] scsi: ufs: Simplify error handling preparation
  2021-06-11  3:01     ` Can Guo
@ 2021-06-11 20:58       ` Bart Van Assche
  2021-06-12  6:46         ` Can Guo
  0 siblings, 1 reply; 43+ messages in thread
From: Bart Van Assche @ 2021-06-11 20:58 UTC (permalink / raw)
  To: Can Guo, Adrian Hunter
  Cc: asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team,
	Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	open list

On 6/10/21 8:01 PM, Can Guo wrote:
> Previously, without commit cb7e6f05fce67c965194ac04467e1ba7bc70b069, 
> ufshcd_resume() may turn off pwr and clk due to UFS error, e.g., link
> transition failure and SSU error/abort (and these UFS error would
> invoke error handling).  When error handling kicks start, it should
> re-enable the pwr and clk before proceeding. Now, commit 
> cb7e6f05fce67c965194ac04467e1ba7bc70b069 makes ufshcd_resume()
> purely control pwr and clk, meaning if ufshcd_resume() fails, there
> is nothing we can do about it - pwr or clk enabling must have failed,
> and it is not because of UFS error. This is why I am removing the
> re-enabling pwr/clk in error handling prepare.

Why are link transition failures handled in the error handler instead of
in the context where these errors are detected (ufshcd_resume())? Is it
even possible to recover from a link transition failure or does this
perhaps indicate a broken UFS controller?

>> but what I really wonder is why we don't just do recovery directly
>> in __ufshcd_wl_suspend() and  __ufshcd_wl_resume() and strip all 
>> the PM complexity out of ufshcd_err_handling()?

+1

> For system suspend/resume, since error handling has the same nature
> like user access, so we are using host_sem to avoid concurrency of
> error handling and system suspend/resume.

Why is host_sem used for that purpose instead of lock_system_sleep() and
unlock_system_sleep()?

Thanks,

Bart.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 7/9] scsi: ufs: Let host_sem cover the entire system suspend/resume
  2021-06-10  4:43 ` [PATCH v3 7/9] scsi: ufs: Let host_sem cover the entire system suspend/resume Can Guo
  2021-06-10 13:32   ` Adrian Hunter
@ 2021-06-11 21:00   ` Bart Van Assche
  2021-06-12  6:46     ` Can Guo
  1 sibling, 1 reply; 43+ messages in thread
From: Bart Van Assche @ 2021-06-11 21:00 UTC (permalink / raw)
  To: Can Guo, asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team
  Cc: Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	Adrian Hunter, Kiwoong Kim, Satya Tangirala, open list

On 6/9/21 9:43 PM, Can Guo wrote:
> UFS error handling now is doing more than just re-probing, but also sending
> scsi cmds, e.g., for clearing UACs, and recovering runtime PM error, which
> may change runtime status of scsi devices. To protect system suspend/resume
> from being disturbed by error handling, move the host_sem from wl pm ops
> to ufshcd_suspend_prepare() and ufshcd_resume_complete().

If lock_system_sleep() and unlock_system_sleep() would be used in the
error handler, would that allow to remove host_sem?

Thanks,

Bart.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 8/9] scsi: ufs: Update the fast abort path in ufshcd_abort() for PM requests
  2021-06-10  4:43 ` [PATCH v3 8/9] scsi: ufs: Update the fast abort path in ufshcd_abort() for PM requests Can Guo
@ 2021-06-11 21:02   ` Bart Van Assche
  2021-06-12  7:07     ` Can Guo
  0 siblings, 1 reply; 43+ messages in thread
From: Bart Van Assche @ 2021-06-11 21:02 UTC (permalink / raw)
  To: Can Guo, asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team
  Cc: Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	open list

On 6/9/21 9:43 PM, Can Guo wrote:
> If PM requests fail during runtime suspend/resume, RPM framework saves the
> error to dev->power.runtime_error. Before the runtime_error gets cleared,
> runtime PM on this specific device won't work again, leaving the device
> either runtime active or runtime suspended permanently.
> 
> When task abort happens to a PM request sent during runtime suspend/resume,
> even if it can be successfully aborted, RPM framework anyways saves the
> (TIMEOUT) error. In this situation, we can leverage error handling to
> recover and clear the runtime_error. So, let PM requests take the fast
> abort path in ufshcd_abort().

How can a PM request fail during runtime suspend/resume? Does such a
failure perhaps indicate an UFS controller bug? I appreciate your work
but I'm wondering whether it's worth to complicate the UFS driver for
issues that should be fixed in the controller instead of in software.

Thanks,

Bart.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 9/9] scsi: ufs: Apply more limitations to user access
  2021-06-10  4:43 ` [PATCH v3 9/9] scsi: ufs: Apply more limitations to user access Can Guo
@ 2021-06-11 21:03   ` Bart Van Assche
  2021-06-12  7:13     ` Can Guo
  0 siblings, 1 reply; 43+ messages in thread
From: Bart Van Assche @ 2021-06-11 21:03 UTC (permalink / raw)
  To: Can Guo, asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team
  Cc: Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Adrian Hunter, Bean Huo, Stanley Chu,
	Keoseong Park, Jaegeuk Kim, Dinghao Liu, Gustavo A. R. Silva,
	Kiwoong Kim, Satya Tangirala, open list

On 6/9/21 9:43 PM, Can Guo wrote:
> Do not let user access HW if hba resume fails or hba is not in good state,
> otherwise it may lead to various stability issues.

Just like for the previous patch, I'm wondering whether or not such a
failure perhaps indicates a hardware bug?

Thanks,

Bart.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 1/9] scsi: ufs: Differentiate status between hba pm ops and wl pm ops
  2021-06-11 20:40   ` Bart Van Assche
@ 2021-06-12  6:20     ` Can Guo
  0 siblings, 0 replies; 43+ messages in thread
From: Can Guo @ 2021-06-12  6:20 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team,
	Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	Adrian Hunter, Kiwoong Kim, Satya Tangirala, open list

On 2021-06-12 04:40, Bart Van Assche wrote:
> On 6/9/21 9:43 PM, Can Guo wrote:
>> Put pm_op_in_progress and is_sys_suspend flags back to ufshcd hba pm 
>> ops,
>> add two new flags, namely wl_pm_op_in_progress and 
>> is_wl_sys_suspended, to
>> track the UFS device W-LU pm ops. This helps us differentiate the 
>> status of
>> hba and wl pm ops when we need to do troubleshooting.
> 
> Since "WL" is an uncommon abbreviation, please add a comment above the
> definition of struct ufs_hba that explains the meaning of the new 
> member
> variables.

Sure, will add in next version.

Thanks,
Can Guo.

> 
> Thanks,
> 
> Bart.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 5/9] scsi: ufs: Simplify error handling preparation
  2021-06-11 20:58       ` Bart Van Assche
@ 2021-06-12  6:46         ` Can Guo
  2021-06-12  9:49           ` Can Guo
  0 siblings, 1 reply; 43+ messages in thread
From: Can Guo @ 2021-06-12  6:46 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: Adrian Hunter, asutoshd, nguyenb, hongwus, ziqichen, linux-scsi,
	kernel-team, Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	open list

On 2021-06-12 04:58, Bart Van Assche wrote:
> On 6/10/21 8:01 PM, Can Guo wrote:
>> Previously, without commit cb7e6f05fce67c965194ac04467e1ba7bc70b069,
>> ufshcd_resume() may turn off pwr and clk due to UFS error, e.g., link
>> transition failure and SSU error/abort (and these UFS error would
>> invoke error handling).  When error handling kicks start, it should
>> re-enable the pwr and clk before proceeding. Now, commit
>> cb7e6f05fce67c965194ac04467e1ba7bc70b069 makes ufshcd_resume()
>> purely control pwr and clk, meaning if ufshcd_resume() fails, there
>> is nothing we can do about it - pwr or clk enabling must have failed,
>> and it is not because of UFS error. This is why I am removing the
>> re-enabling pwr/clk in error handling prepare.
> 
> Why are link transition failures handled in the error handler instead 
> of
> in the context where these errors are detected (ufshcd_resume())? Is it
> even possible to recover from a link transition failure or does this
> perhaps indicate a broken UFS controller?

Basically, almost all UFS failures are caused by errors in underlying 
layers,
i.e., UIC errors, including link transition failures. And according to 
UFSHCI
spec, SW should do a full reset to recover it, just like handle any 
other
fatal UIC errors. All UIC errors are detected by HW and reported by IRQ 
handler.

UFSHCI Spec Ver. 31
8.2.7 Hibernate Enter/Exit Error Handling
Hibernate Enter/Exit Error occurs when the UniPro link is broken. When 
this condition occurs,
host software should reset the host controller by setting register HCE 
to ‘0’, re-initialize the host
controller by setting register HCE to ‘1', and then start link startup 
sequence as shown in Figure 16.

> 
>>> but what I really wonder is why we don't just do recovery directly
>>> in __ufshcd_wl_suspend() and  __ufshcd_wl_resume() and strip all
>>> the PM complexity out of ufshcd_err_handling()?
> 
> +1

I've explained why I chose not to do this in my last reply to Adrian.
Please kindly check it.

> 
>> For system suspend/resume, since error handling has the same nature
>> like user access, so we are using host_sem to avoid concurrency of
>> error handling and system suspend/resume.
> 
> Why is host_sem used for that purpose instead of lock_system_sleep() 
> and
> unlock_system_sleep()?
> 

I was aware of it, but the situation is that host_sem is also used to
avoid concurrency among user access, error handling and shutdown, so
I think we should just use host_sem anyway to simplify the locking;
otherwise user access and error handling would have to take both
system_transition_mutex and host_sem.

Thanks,

Can Guo.

> Thanks,
> 
> Bart.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 7/9] scsi: ufs: Let host_sem cover the entire system suspend/resume
  2021-06-11 21:00   ` Bart Van Assche
@ 2021-06-12  6:46     ` Can Guo
  0 siblings, 0 replies; 43+ messages in thread
From: Can Guo @ 2021-06-12  6:46 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team,
	Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	Adrian Hunter, Kiwoong Kim, Satya Tangirala, open list

On 2021-06-12 05:00, Bart Van Assche wrote:
> On 6/9/21 9:43 PM, Can Guo wrote:
>> UFS error handling now is doing more than just re-probing, but also 
>> sending
>> scsi cmds, e.g., for clearing UACs, and recovering runtime PM error, 
>> which
>> may change runtime status of scsi devices. To protect system 
>> suspend/resume
>> from being disturbed by error handling, move the host_sem from wl pm 
>> ops
>> to ufshcd_suspend_prepare() and ufshcd_resume_complete().
> 
> If lock_system_sleep() and unlock_system_sleep() would be used in the
> error handler, would that allow to remove host_sem?

Please kindly check my reply in patch #5.

Thanks,

Can Guo.

> 
> Thanks,
> 
> Bart.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 8/9] scsi: ufs: Update the fast abort path in ufshcd_abort() for PM requests
  2021-06-11 21:02   ` Bart Van Assche
@ 2021-06-12  7:07     ` Can Guo
  2021-06-12 16:50       ` Bart Van Assche
  0 siblings, 1 reply; 43+ messages in thread
From: Can Guo @ 2021-06-12  7:07 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team,
	Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	open list

On 2021-06-12 05:02, Bart Van Assche wrote:
> On 6/9/21 9:43 PM, Can Guo wrote:
>> If PM requests fail during runtime suspend/resume, RPM framework saves 
>> the
>> error to dev->power.runtime_error. Before the runtime_error gets 
>> cleared,
>> runtime PM on this specific device won't work again, leaving the 
>> device
>> either runtime active or runtime suspended permanently.
>> 
>> When task abort happens to a PM request sent during runtime 
>> suspend/resume,
>> even if it can be successfully aborted, RPM framework anyways saves 
>> the
>> (TIMEOUT) error. In this situation, we can leverage error handling to
>> recover and clear the runtime_error. So, let PM requests take the fast
>> abort path in ufshcd_abort().
> 
> How can a PM request fail during runtime suspend/resume? Does such a
> failure perhaps indicate an UFS controller bug?

I've replied to your similar question in the previous series. I've seen
too many SSU and SYNCHRONIZE_CACHE cmds time out over these years; even
60s is not enough for them to complete. And you are right, in most cases
the device is not responding - the UFS controller is busy with
housekeeping.

> I appreciate your work
> but I'm wondering whether it's worth to complicate the UFS driver for
> issues that should be fixed in the controller instead of in software.
> 

Sigh... I also want my life and work to be easier... I agree with you.

In the project bring-up stage, we fix whatever error/bug/failure we face
to unblock the project. During that stage we focus only on fixing the
very first UFS error, and do not pay much attention to error recovery or
to what the error can possibly cause (usually more UFS errors and system
stability issues follow the very first UFS error).

However, these years our customers tend to ask for more - they want UFS 
error
handling to recover everything whenever UFS error occurs, because they 
believe
it is the last line of defense after their products go out to market. So 
I took
a lot of effort fixing, testing and trying to make it robust. Now here 
we are.
FYI, I am on a tight schedule to have these UFS error handling changes 
ready in
Android12-5.10.

Thanks,

Can Guo.

> Thanks,
> 
> Bart.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 9/9] scsi: ufs: Apply more limitations to user access
  2021-06-11 21:03   ` Bart Van Assche
@ 2021-06-12  7:13     ` Can Guo
  0 siblings, 0 replies; 43+ messages in thread
From: Can Guo @ 2021-06-12  7:13 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team,
	Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Adrian Hunter, Bean Huo, Stanley Chu,
	Keoseong Park, Jaegeuk Kim, Dinghao Liu, Gustavo A. R. Silva,
	Kiwoong Kim, Satya Tangirala, open list

On 2021-06-12 05:03, Bart Van Assche wrote:
> On 6/9/21 9:43 PM, Can Guo wrote:
>> Do not let user access HW if hba resume fails or hba is not in good 
>> state,
>> otherwise it may lead to various stability issues.
> 
> Just like for the previous patch, I'm wondering whether or not such a
> failure perhaps indicates a hardware bug?
> 

Indeed yes, but user access that happens when power/clock is not ready
will lead to system stability issues, e.g., OCP or unclocked register
access.

Nowadays, customers are heavily using UFS sysfs nodes during runtime,
so our test teams added quite a lot test scripts to simulate user access
to UFS sysfs nodes during their test.

Thanks,

Can Guo.

> Thanks,
> 
> Bart.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 4/9] scsi: ufs: Complete the cmd before returning in queuecommand
  2021-06-11 20:52   ` Bart Van Assche
@ 2021-06-12  7:38     ` Can Guo
  2021-06-12 15:50       ` Bart Van Assche
  0 siblings, 1 reply; 43+ messages in thread
From: Can Guo @ 2021-06-12  7:38 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team,
	Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	open list

On 2021-06-12 04:52, Bart Van Assche wrote:
> On 6/9/21 9:43 PM, Can Guo wrote:
>> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
>> index 0c9d2ee..7dc0fda 100644
>> --- a/drivers/scsi/ufs/ufshcd.c
>> +++ b/drivers/scsi/ufs/ufshcd.c
>> @@ -2758,6 +2758,16 @@ static int ufshcd_queuecommand(struct Scsi_Host 
>> *host, struct scsi_cmnd *cmd)
>>  		goto out;
>>  	}
>> 
>> +	if (unlikely(test_bit(tag, &hba->outstanding_reqs))) {
>> +		if (hba->wl_pm_op_in_progress) {
>> +			set_host_byte(cmd, DID_BAD_TARGET);
>> +			cmd->scsi_done(cmd);
>> +		} else {
>> +			err = SCSI_MLQUEUE_HOST_BUSY;
>> +		}
>> +		goto out;
>> +	}
>> +
>>  	hba->req_abort_count = 0;
>> 
>>  	err = ufshcd_hold(hba, true);
>> @@ -2768,15 +2778,6 @@ static int ufshcd_queuecommand(struct Scsi_Host 
>> *host, struct scsi_cmnd *cmd)
>>  	WARN_ON(ufshcd_is_clkgating_allowed(hba) &&
>>  		(hba->clk_gating.state != CLKS_ON));
>> 
>> -	if (unlikely(test_bit(tag, &hba->outstanding_reqs))) {
>> -		if (hba->wl_pm_op_in_progress)
>> -			set_host_byte(cmd, DID_BAD_TARGET);
>> -		else
>> -			err = SCSI_MLQUEUE_HOST_BUSY;
>> -		ufshcd_release(hba);
>> -		goto out;
>> -	}
>> -
>>  	lrbp = &hba->lrb[tag];
>>  	WARN_ON(lrbp->cmd);
>>  	lrbp->cmd = cmd;
> 
> Can the code under "if (unlikely(test_bit(tag,
> &hba->outstanding_reqs)))" be deleted instead of moving it? I don't
> think that it is useful to verify whether the block layer tag allocator
> works correctly. Additionally, I'm not aware of any similar code in any
> other SCSI LLD.
> 

ufshcd_abort() aborts PM requests differently from other requests -
it simply evicts the cmd from lrbp [1], schedules error handler and
returns SUCCESS (the reason why I am doing it this way is in patch #8).

After ufshcd_abort() returns, the tag shall be released; the logic
here is to prevent subsequent cmds from re-using the lrbp [1] before the
error handler recovers the device and host.

Thanks,

Can Guo.

> Thanks,
> 
> Bart.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 5/9] scsi: ufs: Simplify error handling preparation
  2021-06-12  6:46         ` Can Guo
@ 2021-06-12  9:49           ` Can Guo
  0 siblings, 0 replies; 43+ messages in thread
From: Can Guo @ 2021-06-12  9:49 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: Adrian Hunter, asutoshd, nguyenb, hongwus, ziqichen, linux-scsi,
	kernel-team, Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	open list

Hi Bart,

On 2021-06-12 14:46, Can Guo wrote:
> On 2021-06-12 04:58, Bart Van Assche wrote:
>> On 6/10/21 8:01 PM, Can Guo wrote:
>>> Previously, without commit cb7e6f05fce67c965194ac04467e1ba7bc70b069,
>>> ufshcd_resume() may turn off pwr and clk due to UFS error, e.g., link
>>> transition failure and SSU error/abort (and these UFS error would
>>> invoke error handling).  When error handling kicks start, it should
>>> re-enable the pwr and clk before proceeding. Now, commit
>>> cb7e6f05fce67c965194ac04467e1ba7bc70b069 makes ufshcd_resume()
>>> purely control pwr and clk, meaning if ufshcd_resume() fails, there
>>> is nothing we can do about it - pwr or clk enabling must have failed,
>>> and it is not because of UFS error. This is why I am removing the
>>> re-enabling pwr/clk in error handling prepare.
>> 
>> Why are link transition failures handled in the error handler instead 
>> of
>> in the context where these errors are detected (ufshcd_resume())? Is 
>> it
>> even possible to recover from a link transition failure or does this
>> perhaps indicate a broken UFS controller?
> 
> Basically, almost all UFS failures are caused by errors in underlaying 
> layers,
> i.e., UIC errors, including link transition failures. And according to 
> UFSHCI
> spec, SW should do a full reset to recover it, just like handle any 
> other
> fatal UIC errors. All UIC errors are detected by HW and reported by IRQ 
> handler.
> 
> UFSHCI Spec Ver. 31
> 8.2.7 Hibernate Enter/Exit Error Handling
> Hibernate Enter/Exit Error occurs when the UniPro link is broken. When
> this condition occurs,
> host software should reset the host controller by setting register HCE
> to ‘0’, re-initialize the host
> controller by setting register HCE to ‘1', and then start link startup
> sequence as shown in Figure 16.
> 
>> 
>>>> but what I really wonder is why we don't just do recovery directly
>>>> in __ufshcd_wl_suspend() and  __ufshcd_wl_resume() and strip all
>>>> the PM complexity out of ufshcd_err_handling()?
>> 
>> +1
> 
> I've explained why I chose not to do this in my last reply to Adrian.
> Please kindly check it.
> 
>> 
>>> For system suspend/resume, since error handling has the same nature
>>> like user access, so we are using host_sem to avoid concurrency of
>>> error handling and system suspend/resume.
>> 
>> Why is host_sem used for that purpose instead of lock_system_sleep() 
>> and
>> unlock_system_sleep()?
>> 
> 
> I was aware of it, but the situation is that host_sem is also used to
> avoid concurrency among user access, error handling and shutdown, so
> I think just use host_sem anyways to simply the lockings, otherwise
> user access and error handling would have to take both 
> system_transition_mutex
> and host_sem

On second thought, I will take your suggestion to use 
lock_system_sleep()
and unlock_system_sleep() in error handler and remove the host_sem used
in suspend/resume, which can make the code more readable by keeping the
changes within error handler itself. However, please note that host_sem
will still be used to avoid concurrency of user access, error handler 
and
shutdown.

Thanks,
Can Guo.

> 
> Thanks,
> 
> Can Guo.
> 
>> Thanks,
>> 
>> Bart.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 4/9] scsi: ufs: Complete the cmd before returning in queuecommand
  2021-06-12  7:38     ` Can Guo
@ 2021-06-12 15:50       ` Bart Van Assche
  2021-06-13 13:30         ` Can Guo
  0 siblings, 1 reply; 43+ messages in thread
From: Bart Van Assche @ 2021-06-12 15:50 UTC (permalink / raw)
  To: Can Guo
  Cc: asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team,
	Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	open list

On 6/12/21 12:38 AM, Can Guo wrote:
> On 2021-06-12 04:52, Bart Van Assche wrote:
>> On 6/9/21 9:43 PM, Can Guo wrote:
>>> @@ -2768,15 +2778,6 @@ static int ufshcd_queuecommand(struct
>>> Scsi_Host *host, struct scsi_cmnd *cmd)
>>>      WARN_ON(ufshcd_is_clkgating_allowed(hba) &&
>>>          (hba->clk_gating.state != CLKS_ON));
>>>
>>> -    if (unlikely(test_bit(tag, &hba->outstanding_reqs))) {
>>> -        if (hba->wl_pm_op_in_progress)
>>> -            set_host_byte(cmd, DID_BAD_TARGET);
>>> -        else
>>> -            err = SCSI_MLQUEUE_HOST_BUSY;
>>> -        ufshcd_release(hba);
>>> -        goto out;
>>> -    }
>>> -
>>>      lrbp = &hba->lrb[tag];
>>>      WARN_ON(lrbp->cmd);
>>>      lrbp->cmd = cmd;
>>
>> Can the code under "if (unlikely(test_bit(tag,
>> &hba->outstanding_reqs)))" be deleted instead of moving it? I don't
>> think that it is useful to verify whether the block layer tag allocator
>> works correctly. Additionally, I'm not aware of any similar code in any
>> other SCSI LLD.
> 
> ufshcd_abort() aborts PM requests differently from other requests -
> it simply evicts the cmd from lrbp [1], schedules error handler and
> returns SUCCESS (the reason why I am doing it this way is in patch #8).
> 
> After ufshcd_abort() returns, the tag shall be released, the logic
> here is to prevent subsequent cmds re-use the lrbp [1] before error
> handler recovers the device and host.

Thanks for the background information. However, this approach sounds
cumbersome to me. For PM requests, please change the UFS driver such
that calling scsi_done() for aborted requests is postponed until error
handling has finished and delete the code shown above from
ufshcd_queuecommand().

Thanks,

Bart.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 8/9] scsi: ufs: Update the fast abort path in ufshcd_abort() for PM requests
  2021-06-12  7:07     ` Can Guo
@ 2021-06-12 16:50       ` Bart Van Assche
  2021-06-13 14:42         ` Can Guo
  0 siblings, 1 reply; 43+ messages in thread
From: Bart Van Assche @ 2021-06-12 16:50 UTC (permalink / raw)
  To: Can Guo
  Cc: asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team,
	Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	open list

On 6/12/21 12:07 AM, Can Guo wrote:
> Sigh... I also want my life and work to be easier...

How about reducing the number of states and state transitions in the UFS
driver?

One source of complexity is that ufshcd_err_handler() is scheduled
independently of the SCSI error handler and hence may run concurrently
with the SCSI error handler. Has the following already been considered?
- Call ufshcd_err_handler() synchronously from ufshcd_abort() and
ufshcd_eh_host_reset_handler() instead of asynchronously.
- Call scsi_schedule_eh() from ufshcd_uic_pwr_ctrl() and
ufshcd_check_errors() instead of ufshcd_schedule_eh_work().

These changes will guarantee that all commands have completed or timed
out before ufshcd_err_handler() is called. I think that would allow to
remove e.g. the following code from the error handler:

	ufshcd_scsi_block_requests(hba);
	/* Drain ufshcd_queuecommand() */
	down_write(&hba->clk_scaling_lock);
	up_write(&hba->clk_scaling_lock);

Thanks,

Bart.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 4/9] scsi: ufs: Complete the cmd before returning in queuecommand
  2021-06-12 15:50       ` Bart Van Assche
@ 2021-06-13 13:30         ` Can Guo
  0 siblings, 0 replies; 43+ messages in thread
From: Can Guo @ 2021-06-13 13:30 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team,
	Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	open list

On 2021-06-12 23:50, Bart Van Assche wrote:
> On 6/12/21 12:38 AM, Can Guo wrote:
>> On 2021-06-12 04:52, Bart Van Assche wrote:
>>> On 6/9/21 9:43 PM, Can Guo wrote:
>>>> @@ -2768,15 +2778,6 @@ static int ufshcd_queuecommand(struct
>>>> Scsi_Host *host, struct scsi_cmnd *cmd)
>>>>      WARN_ON(ufshcd_is_clkgating_allowed(hba) &&
>>>>          (hba->clk_gating.state != CLKS_ON));
>>>> 
>>>> -    if (unlikely(test_bit(tag, &hba->outstanding_reqs))) {
>>>> -        if (hba->wl_pm_op_in_progress)
>>>> -            set_host_byte(cmd, DID_BAD_TARGET);
>>>> -        else
>>>> -            err = SCSI_MLQUEUE_HOST_BUSY;
>>>> -        ufshcd_release(hba);
>>>> -        goto out;
>>>> -    }
>>>> -
>>>>      lrbp = &hba->lrb[tag];
>>>>      WARN_ON(lrbp->cmd);
>>>>      lrbp->cmd = cmd;
>>> 
>>> Can the code under "if (unlikely(test_bit(tag,
>>> &hba->outstanding_reqs)))" be deleted instead of moving it? I don't
>>> think that it is useful to verify whether the block layer tag 
>>> allocator
>>> works correctly. Additionally, I'm not aware of any similar code in 
>>> any
>>> other SCSI LLD.
>> 
>> ufshcd_abort() aborts PM requests differently from other requests -
>> it simply evicts the cmd from lrbp [1], schedules error handler and
>> returns SUCCESS (the reason why I am doing it this way is in patch 
>> #8).
>> 
>> After ufshcd_abort() returns, the tag shall be released, the logic
>> here is to prevent subsequent cmds re-use the lrbp [1] before error
>> handler recovers the device and host.
> 
> Thanks for the background information. However, this approach sounds
> cumbersome to me. For PM requests, please change the UFS driver such
> that calling scsi_done() for aborted requests is postponed until error
> handling has finished and delete the code shown above from
> ufshcd_queuecommand().

I will delete the code in the next version, since I believe the hba_state
checks before the code are enough to achieve the same purpose, so this
code becomes redundant.

Thanks,

Can Guo.

> 
> Thanks,
> 
> Bart.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 8/9] scsi: ufs: Update the fast abort path in ufshcd_abort() for PM requests
  2021-06-12 16:50       ` Bart Van Assche
@ 2021-06-13 14:42         ` Can Guo
  2021-06-14 18:49           ` Bart Van Assche
  0 siblings, 1 reply; 43+ messages in thread
From: Can Guo @ 2021-06-13 14:42 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team,
	Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	open list

Hi Bart,

On 2021-06-13 00:50, Bart Van Assche wrote:
> On 6/12/21 12:07 AM, Can Guo wrote:
>> Sigh... I also want my life and work to be easier...
> 
> How about reducing the number of states and state transitions in the 
> UFS
> driver? One source of complexity is that ufshcd_err_handler() is 
> scheduled
> independently of the SCSI error handler and hence may run concurrently
> with the SCSI error handler. Has the following already been considered?
> - Call ufshcd_err_handler() synchronously from ufshcd_abort() and
> ufshcd_eh_host_reset_handler() instead of asynchronously.

1. ufshcd_eh_host_reset_handler() invokes ufshcd_err_handler() and 
flushes
it, so it is synchronous. ufshcd_eh_host_reset_handler() used to call
reset_and_restore() directly, which can run concurrently with UFS error 
handler,
so I fixed it last year [1].

2. Having ufshcd_abort() invoke ufshcd_err_handler() synchronously can cause a
live lock issue, which is why I chose the asynchronous way (from the 
first
day I started to fix error handling). The live lock happens when abort 
happens
to a PM request, e.g., a SSU cmd sent from suspend/resume. Because UFS 
error
handler is synchronized with suspend/resume (by calling 
pm_runtime_get_sync()
and lock_system_sleep()), the sequence is like:
[1] ufshcd_wl_resume() sends SSU cmd
[2] ufshcd_abort() calls UFS error handler
[3] UFS error handler calls lock_system_sleep() and 
pm_runtime_get_sync()

In above sequence, either lock_system_sleep() or pm_runtime_get_sync() 
shall
be blocked - [3] is blocked by [1], [2] is blocked by [3], while [1] is 
blocked by [2].

For PM requests, I chose to abort them fast to unblock suspend/resume,
suspend/resume shall fail of course, but UFS error handler recovers
PM errors anyways.

> - Call scsi_schedule_eh() from ufshcd_uic_pwr_ctrl() and
> ufshcd_check_errors() instead of ufshcd_schedule_eh_work().

When ufshcd_uic_pwr_ctrl() and/or ufshcd_check_errors() report errors,
usually they are fatal errors, according to UFSHCI spec, SW should 
re-probe
UFS to recover.

However scsi_schedule_eh() does more than that - scsi_unjam_host() sends
request sense cmd and calls scsi_eh_ready_devs(), while 
scsi_eh_ready_devs()
sends test unit ready cmd and calls all the way down to 
scsi_eh_device/target/
bus/host_reset(). But we only need scsi_eh_host_reset() in this case. I 
know
you have concerns that scsi_schedule_eh() may run concurrently with UFS 
error
handler, but as I mentioned above in [1] - I've made 
ufshcd_eh_host_reset_handler()
synchronized with UFS error handler, hope that can ease your concern.

I am not saying your idea won't work, it is a good suggestion. I will 
try
it after these changes go in, because it would require extra effort and 
the
effort won't be minor - I need to consider how to remove/reduce the 
ufshcd
states along with the change and the error injection and stability test 
all
over again, which is a long way to go. As for now, at least current 
changes
work well as per my test and we really need these changes for 
Android12-5.10.

Thanks,

Can Guo.

> 
> These changes will guarantee that all commands have completed or timed
> out before ufshcd_err_handler() is called. I think that would allow to
> remove e.g. the following code from the error handler:
> 
> 	ufshcd_scsi_block_requests(hba);
> 	/* Drain ufshcd_queuecommand() */
> 	down_write(&hba->clk_scaling_lock);
> 	up_write(&hba->clk_scaling_lock);
> 
> Thanks,
> 
> Bart.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 8/9] scsi: ufs: Update the fast abort path in ufshcd_abort() for PM requests
  2021-06-13 14:42         ` Can Guo
@ 2021-06-14 18:49           ` Bart Van Assche
  2021-06-15  2:36             ` Can Guo
  0 siblings, 1 reply; 43+ messages in thread
From: Bart Van Assche @ 2021-06-14 18:49 UTC (permalink / raw)
  To: Can Guo
  Cc: asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team,
	Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	open list

On 6/13/21 7:42 AM, Can Guo wrote:
> 2. ufshcd_abort() invokes ufshcd_err_handler() synchronously can have a
> live lock issue, which is why I chose the asynchronous way (from the first
> day I started to fix error handling). The live lock happens when abort
> happens
> to a PM request, e.g., a SSU cmd sent from suspend/resume. Because UFS
> error
> handler is synchronized with suspend/resume (by calling
> pm_runtime_get_sync()
> and lock_system_sleep()), the sequence is like:
> [1] ufshcd_wl_resume() sends SSU cmd
> [2] ufshcd_abort() calls UFS error handler
> [3] UFS error handler calls lock_system_sleep() and pm_runtime_get_sync()
> 
> In above sequence, either lock_system_sleep() or pm_runtime_get_sync()
> shall
> be blocked - [3] is blocked by [1], [2] is blocked by [3], while [1] is
> blocked by [2].
> 
> For PM requests, I chose to abort them fast to unblock suspend/resume,
> suspend/resume shall fail of course, but UFS error handler recovers
> PM errors anyways.

In the above sequence, does [2] perhaps refer to aborting the SSU
command submitted in step [1] (this is not clear to me)? If so, how
about breaking the circular waiting cycle as follows:
- If it can happen that SSU succeeds after more than scsi_timeout
  seconds, define a custom timeout handler. From inside the timeout
  handler, schedule a link check and return BLK_EH_RESET_TIMER. If the
  link is no longer operational, run the error handler. If the link
  cannot be recovered by the error handler, fail all pending commands.
  This will prevent that ufshcd_abort() is called if a SSU command takes
  longer than expected. See also commit 0dd0dec1677e.
- Modify the UFS error handler such that it accepts a context argument.
  The context argument specifies whether or not the UFS error handler is
  called from inside a system suspend or system resume handler. If the
  UFS error handler is called from inside a system suspend or resume
  callback, skip the lock_system_sleep() and unlock_system_sleep()
  calls.

Thanks,

Bart.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 8/9] scsi: ufs: Update the fast abort path in ufshcd_abort() for PM requests
  2021-06-14 18:49           ` Bart Van Assche
@ 2021-06-15  2:36             ` Can Guo
  2021-06-15  3:17               ` Can Guo
  2021-06-15 18:25               ` Bart Van Assche
  0 siblings, 2 replies; 43+ messages in thread
From: Can Guo @ 2021-06-15  2:36 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team,
	Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	open list

Hi Bart,

On 2021-06-15 02:49, Bart Van Assche wrote:
> On 6/13/21 7:42 AM, Can Guo wrote:
>> 2. ufshcd_abort() invokes ufshcd_err_handler() synchronously can have 
>> a
>> live lock issue, which is why I chose the asynchronous way (from the 
>> first
>> day I started to fix error handling). The live lock happens when abort
>> happens
>> to a PM request, e.g., a SSU cmd sent from suspend/resume. Because UFS
>> error
>> handler is synchronized with suspend/resume (by calling
>> pm_runtime_get_sync()
>> and lock_system_sleep()), the sequence is like:
>> [1] ufshcd_wl_resume() sends SSU cmd
>> [2] ufshcd_abort() calls UFS error handler
>> [3] UFS error handler calls lock_system_sleep() and 
>> pm_runtime_get_sync()
>> 
>> In above sequence, either lock_system_sleep() or pm_runtime_get_sync()
>> shall
>> be blocked - [3] is blocked by [1], [2] is blocked by [3], while [1] 
>> is
>> blocked by [2].
>> 
>> For PM requests, I chose to abort them fast to unblock suspend/resume,
>> suspend/resume shall fail of course, but UFS error handler recovers
>> PM errors anyways.
> 
> In the above sequence, does [2] perhaps refer to aborting the SSU
> command submitted in step [1] (this is not clear to me)?

Yes, your understanding is right.

> If so, how about breaking the circular waiting cycle as follows:
> - If it can happen that SSU succeeds after more than scsi_timeout
>   seconds, define a custom timeout handler. From inside the timeout
>   handler, schedule a link check and return BLK_EH_RESET_TIMER. If the
>   link is no longer operational, run the error handler. If the link
>   cannot be recovered by the error handler, fail all pending commands.
>   This will prevent that ufshcd_abort() is called if a SSU command 
> takes
>   longer than expected. See also commit 0dd0dec1677e.
> - Modify the UFS error handler such that it accepts a context argument.
>   The context argument specifies whether or not the UFS error handler 
> is
>   called from inside a system suspend or system resume handler. If the
>   UFS error handler is called from inside a system suspend or resume
>   callback, skip the lock_system_sleep() and unlock_system_sleep()
>   calls.
> 

I am aware of commit 0dd0dec1677e, I gave my reviewed-by tag. Thank you
for your suggestion and I believe it can resolve the cycle, because 
actually
I've considered the similar way (leverage hba->host->eh_noresume) last 
year,
but I didn't take this way due to below reasons:

1. UFS error handler basically does one thing - reset and restore, which
stops hba [1], resets device [2] and re-probes the device [3]. Stopping 
hba [1]
shall complete any pending requests in the doorbell (with error or no 
error).
After [1], suspend/resume contexts, blocked by SSU cmd, shall be 
unblocked
right away to do whatever it needs to handle the SSU cmd failure 
(completed
in [1], so scsi_execute() returns an error), e.g., put link back to the 
old
state. call ufshcd_vops_suspend(), turn off irq/clocks/powers and etc...
However, reset and restore ([2] and [3]) is still running, and it can 
(most likely)
be disturbed by suspend/resume. So passing a parameter or using 
hba->host->eh_noresume
to skip lock_system_sleep() and unlock_system_sleep() can break the 
cycle,
but error handling may run concurrently with suspend/resume. Of course 
we can
modify suspend/resume to avoid it, but I was pursuing a minimal change 
to get this fixed.

2. Whatever way we take to break the cycle, suspend/resume shall fail 
and
RPM framework shall save the error to dev.power.runtime_error, leaving
the device in runtime suspended or active mode permanently. If it is 
left
runtime suspended, UFS driver won't accept cmd anymore, while if it is 
left
runtime active, powers of UFS device and host will be left ON, leading 
to power
penalty. So my main idea is to let suspend/resume contexts, blocked by 
PM cmds,
fail fast first and then error handler recover everything back to work.

Thanks,

Can Guo.

> Thanks,
> 
> Bart.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 8/9] scsi: ufs: Update the fast abort path in ufshcd_abort() for PM requests
  2021-06-15  2:36             ` Can Guo
@ 2021-06-15  3:17               ` Can Guo
  2021-06-15 18:25               ` Bart Van Assche
  1 sibling, 0 replies; 43+ messages in thread
From: Can Guo @ 2021-06-15  3:17 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team,
	Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	open list

On 2021-06-15 10:36, Can Guo wrote:
> Hi Bart,
> 
> On 2021-06-15 02:49, Bart Van Assche wrote:
>> On 6/13/21 7:42 AM, Can Guo wrote:
>>> 2. ufshcd_abort() invokes ufshcd_err_handler() synchronously can have 
>>> a
>>> live lock issue, which is why I chose the asynchronous way (from the 
>>> first
>>> day I started to fix error handling). The live lock happens when 
>>> abort
>>> happens
>>> to a PM request, e.g., a SSU cmd sent from suspend/resume. Because 
>>> UFS
>>> error
>>> handler is synchronized with suspend/resume (by calling
>>> pm_runtime_get_sync()
>>> and lock_system_sleep()), the sequence is like:
>>> [1] ufshcd_wl_resume() sends SSU cmd
>>> [2] ufshcd_abort() calls UFS error handler
>>> [3] UFS error handler calls lock_system_sleep() and 
>>> pm_runtime_get_sync()
>>> 
>>> In above sequence, either lock_system_sleep() or 
>>> pm_runtime_get_sync()
>>> shall
>>> be blocked - [3] is blocked by [1], [2] is blocked by [3], while [1] 
>>> is
>>> blocked by [2].
>>> 
>>> For PM requests, I chose to abort them fast to unblock 
>>> suspend/resume,
>>> suspend/resume shall fail of course, but UFS error handler recovers
>>> PM errors anyways.
>> 
>> In the above sequence, does [2] perhaps refer to aborting the SSU
>> command submitted in step [1] (this is not clear to me)?
> 
> Yes, your understanding is right.
> 
>> If so, how about breaking the circular waiting cycle as follows:
>> - If it can happen that SSU succeeds after more than scsi_timeout
>>   seconds, define a custom timeout handler. From inside the timeout
>>   handler, schedule a link check and return BLK_EH_RESET_TIMER. If the
>>   link is no longer operational, run the error handler. If the link
>>   cannot be recovered by the error handler, fail all pending commands.
>>   This will prevent that ufshcd_abort() is called if a SSU command 
>> takes
>>   longer than expected. See also commit 0dd0dec1677e.
>> - Modify the UFS error handler such that it accepts a context 
>> argument.
>>   The context argument specifies whether or not the UFS error handler 
>> is
>>   called from inside a system suspend or system resume handler. If the
>>   UFS error handler is called from inside a system suspend or resume
>>   callback, skip the lock_system_sleep() and unlock_system_sleep()
>>   calls.
>> 
> 
> I am aware of commit 0dd0dec1677e, I gave my reviewed-by tag. Thank you
> for your suggestion and I believe it can resolve the cycle, because 
> actually
> I've considered the similar way (leverage hba->host->eh_noresume) last 
> year,
> but I didn't take this way due to below reasons:
> 
> 1. UFS error handler basically does one thing - reset and restore, 
> which
> stops hba [1], resets device [2] and re-probes the device [3]. Stopping 
> hba [1]
> shall complete any pending requests in the doorbell (with error or no 
> error).
> After [1], suspend/resume contexts, blocked by SSU cmd, shall be 
> unblocked
> right away to do whatever it needs to handle the SSU cmd failure 
> (completed
> in [1], so scsi_execute() returns an error), e.g., put link back to the 
> old
> state. call ufshcd_vops_suspend(), turn off irq/clocks/powers and 
> etc...
> However, reset and restore ([2] and [3]) is still running, and it can
> (most likely)
> be disturbed by suspend/resume. So passing a parameter or using
> hba->host->eh_noresume
> to skip lock_system_sleep() and unlock_system_sleep() can break the 
> cycle,
> but error handling may run concurrently with suspend/resume. Of course 
> we can
> modify suspend/resume to avoid it, but I was pursuing a minimal change
> to get this fixed.
> 

Add more - besides, SSU cmd is not the only PM request sent during 
suspend/resume,
last year (before your changes came in) it also sends request sense cmd 
without
checking the return value of it - so if request sense cmd abort happens, 
suspend/resume
still move forward, which can run concurrently with error handling. So I 
was pursuing
a way to make the error handler less dependent on the behaviours of these 
contexts.

Thanks,

Can Guo.

> 2. Whatever way we take to break the cycle, suspend/resume shall fail 
> and
> RPM framework shall save the error to dev.power.runtime_error, leaving
> the device in runtime suspended or active mode permanently. If it is 
> left
> runtime suspended, UFS driver won't accept cmd anymore, while if it is 
> left
> runtime active, powers of UFS device and host will be left ON, leading 
> to power
> penalty. So my main idea is to let suspend/resume contexts, blocked by 
> PM cmds,
> fail fast first and then error handler recover everything back to work.
> 
> Thanks,
> 
> Can Guo.
> 
>> Thanks,
>> 
>> Bart.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 8/9] scsi: ufs: Update the fast abort path in ufshcd_abort() for PM requests
  2021-06-15  2:36             ` Can Guo
  2021-06-15  3:17               ` Can Guo
@ 2021-06-15 18:25               ` Bart Van Assche
  2021-06-16  4:00                 ` Can Guo
  1 sibling, 1 reply; 43+ messages in thread
From: Bart Van Assche @ 2021-06-15 18:25 UTC (permalink / raw)
  To: Can Guo
  Cc: asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team,
	Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	open list

On 6/14/21 7:36 PM, Can Guo wrote:
> I've considered the similar way (leverage hba->host->eh_noresume) last
> year,
> but I didn't take this way due to below reasons:
> 
> 1. UFS error handler basically does one thing - reset and restore, which
> stops hba [1], resets device [2] and re-probes the device [3]. Stopping
> hba [1]
> shall complete any pending requests in the doorbell (with error or no
> error).
> After [1], suspend/resume contexts, blocked by SSU cmd, shall be unblocked
> right away to do whatever it needs to handle the SSU cmd failure (completed
> in [1], so scsi_execute() returns an error), e.g., put link back to the old
> state. call ufshcd_vops_suspend(), turn off irq/clocks/powers and etc...
> However, reset and restore ([2] and [3]) is still running, and it can
> (most likely)
> be disturbed by suspend/resume. So passing a parameter or using
> hba->host->eh_noresume
> to skip lock_system_sleep() and unlock_system_sleep() can break the cycle,
> but error handling may run concurrently with suspend/resume. Of course
> we can
> modify suspend/resume to avoid it, but I was pursuing a minimal change
> to get this fixed.
> 
> 2. Whatever way we take to break the cycle, suspend/resume shall fail and
> RPM framework shall save the error to dev.power.runtime_error, leaving
> the device in runtime suspended or active mode permanently. If it is left
> runtime suspended, UFS driver won't accept cmd anymore, while if it is left
> runtime active, powers of UFS device and host will be left ON, leading
> to power
> penalty. So my main idea is to let suspend/resume contexts, blocked by
> PM cmds,
> fail fast first and then error handler recover everything back to work.

Hi Can,

Has it been considered to make the UFS error handler fail pending
commands with an error code that causes the SCSI core to resubmit the
SCSI command, e.g. DID_IMM_RETRY or DID_TRANSPORT_DISRUPTED? I want to
prevent that power management or suspend/resume callbacks fail if the
error handler succeeds with recovering the UFS transport.

Thanks,

Bart.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 8/9] scsi: ufs: Update the fast abort path in ufshcd_abort() for PM requests
  2021-06-15 18:25               ` Bart Van Assche
@ 2021-06-16  4:00                 ` Can Guo
  2021-06-16  4:40                   ` Bart Van Assche
  0 siblings, 1 reply; 43+ messages in thread
From: Can Guo @ 2021-06-16  4:00 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team,
	Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	open list

On 2021-06-16 02:25, Bart Van Assche wrote:
> On 6/14/21 7:36 PM, Can Guo wrote:
>> I've considered the similar way (leverage hba->host->eh_noresume) last
>> year,
>> but I didn't take this way due to below reasons:
>> 
>> 1. UFS error handler basically does one thing - reset and restore, 
>> which
>> stops hba [1], resets device [2] and re-probes the device [3]. 
>> Stopping
>> hba [1]
>> shall complete any pending requests in the doorbell (with error or no
>> error).
>> After [1], suspend/resume contexts, blocked by SSU cmd, shall be 
>> unblocked
>> right away to do whatever it needs to handle the SSU cmd failure 
>> (completed
>> in [1], so scsi_execute() returns an error), e.g., put link back to 
>> the old
>> state. call ufshcd_vops_suspend(), turn off irq/clocks/powers and 
>> etc...
>> However, reset and restore ([2] and [3]) is still running, and it can
>> (most likely)
>> be disturbed by suspend/resume. So passing a parameter or using
>> hba->host->eh_noresume
>> to skip lock_system_sleep() and unlock_system_sleep() can break the 
>> cycle,
>> but error handling may run concurrently with suspend/resume. Of course
>> we can
>> modify suspend/resume to avoid it, but I was pursuing a minimal change
>> to get this fixed.
>> 
>> 2. Whatever way we take to break the cycle, suspend/resume shall fail 
>> and
>> RPM framework shall save the error to dev.power.runtime_error, leaving
>> the device in runtime suspended or active mode permanently. If it is 
>> left
>> runtime suspended, UFS driver won't accept cmd anymore, while if it is 
>> left
>> runtime active, powers of UFS device and host will be left ON, leading
>> to power
>> penalty. So my main idea is to let suspend/resume contexts, blocked by
>> PM cmds,
>> fail fast first and then error handler recover everything back to 
>> work.
> 
> Hi Can,
> 
> Has it been considered to make the UFS error handler fail pending
> commands with an error code that causes the SCSI core to resubmit the
> SCSI command, e.g. DID_IMM_RETRY or DID_TRANSPORT_DISRUPTED? I want to
> prevent that power management or suspend/resume callbacks fail if the
> error handler succeeds with recovering the UFS transport.
> 

Hi Bart,

Thanks for the suggestion, I thought about it but I didn't go that
far in this path because I believe letting a context fast fail is
better than retrying/blocking it (to me suspend/resume can fail
due to many reasons and task abort is just one of them). I appreciate
the idea, but I would like to stick to my way as of now because

1. Merely preventing task abort cannot prevent suspend/resume fail.
Task abort (to PM requests), in real cases, is just one of many kinds
of failure which can fail the suspend/resume callbacks. During
suspend/resume, if AH8 error and/or UIC errors happen, IRQ handler
may complete SSU cmd with errors and schedule the error handler (I've
seen such scenarios in real customer cases). My idea is to treat task
abort (to PM requests) as a failure (let scsi_execute() return with
whatever error) and let error handler recover everything just like
any other UFS errors which invoke error handler. In case this, again,
goes back to the question of why we don't just do error recovery in
suspend/resume, let me paste my previous reply here -

"
Error handler has the same nature of user access - it is unpredictable, 
meaning it
can be invoked at any time (from IRQ handler), even when there is no 
ongoing
cmd/data transactions (like auto hibern8 failure and UIC errors, such as 
DME
error and some errors in data link layer) [1], unless you disable UFS 
IRQ.

The reasons why I chose not to do it that way are (although error 
handler
prepare has become much simpler after applying this change)

- I want to keep all the complexity within error handler, and re-direct 
all error
recovery needs to error handler. It can avoid calling 
ufshcd_reset_and_restore()
and/or flush_work(&hba->eh_work) here and there. The entire UFS 
suspend/resume is
already complex enough, I don't want to mess up with it.

- We do explicit recovery only when we see certain errors, e.g., H8 
enter func
returns an error during suspend, but as mentioned above [1], error 
handling can
be invoked already from IRQ handler (due to all kinds of UIC errors 
before H8 enter
func returns). So, we still need host_sem (in case of system 
suspend/resume) to
avoid concurrency.

- During system suspend/resume, error handling can be invoked (due to 
non-fatal
errors) but still UFS cmds return no error at all. Similar like above, 
we need
host_sem to avoid concurrency.
"

2. And say we want SCSI layer to resubmit PM requests to prevent
suspend/resume fail, we should keep retrying the PM requests (so
long as error handler can recover everything successfully), meaning
we should give them unlimited retries (which I think is a bad idea),
otherwise (if they have zero retries or limited retries), in extreme
conditions, what may happen is that error handler can recover everything
successfully every time, but all these retries (say 3) still time out,
which block the power management for too long (retries * 60 seconds) 
and,
most important, when the last retry times out, scsi layer will anyways
complete the PM request (even we return DID_IMM_RETRY), then we end up
same - suspend/resume shall run concurrently with error handler and we
couldn't recover saved PM errors.

Thanks,

Can Guo.

> Thanks,
> 
> Bart.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 8/9] scsi: ufs: Update the fast abort path in ufshcd_abort() for PM requests
  2021-06-16  4:00                 ` Can Guo
@ 2021-06-16  4:40                   ` Bart Van Assche
  2021-06-16  8:47                     ` Can Guo
  0 siblings, 1 reply; 43+ messages in thread
From: Bart Van Assche @ 2021-06-16  4:40 UTC (permalink / raw)
  To: Can Guo
  Cc: asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team,
	Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	open list

On 6/15/21 9:00 PM, Can Guo wrote:
> I would like to stick to my way as of now because
> 
> 1. Merely preventing task abort cannot prevent suspend/resume fail.
> Task abort (to PM requests), in real cases, is just one of many kinds
> of failure which can fail the suspend/resume callbacks. During
> suspend/resume, if AH8 error and/or UIC errors happen, IRQ handler
> may complete SSU cmd with errors and schedule the error handler (I've
> seen such scenarios in real customer cases). My idea is to treat task
> abort (to PM requests) as a failure (let scsi_execute() return with
> whatever error) and let error handler recover everything just like
> any other UFS errors which invoke error handler. In case this, again,
> goes back to the topic that is why don't just do error recovery in
> suspend/resume, let me paste my previous reply here -

Does this mean that the IRQ handler can complete an SSU command with an
error and that the error handler can later recover from that error? That
sounds completely wrong to me. The IRQ handler should never complete any
command with an error if that error could be recoverable. Instead, the
IRQ handler should add that command to a list and leave it to the error
handler to fail that command or to retry it.

> 2. And say we want SCSI layer to resubmit PM requests to prevent
> suspend/resume fail, we should keep retrying the PM requests (so
> long as error handler can recover everything successfully), meaning
> we should give them unlimited retries (which I think is a bad idea),
> otherwise (if they have zero retries or limited retries), in extreme
> conditions, what may happen is that error handler can recover everything
> successfully every time, but all these retries (say 3) still time out,
> which block the power management for too long (retries * 60 seconds) and,
> most important, when the last retry times out, scsi layer will anyways
> complete the PM request (even we return DID_IMM_RETRY), then we end up
> same - suspend/resume shall run concurrently with error handler and we
> couldn't recover saved PM errors.

Hmm ... it is not clear to me why this behavior is considered a problem?

What is wrong with blocking RPM while a START STOP UNIT command is being
processed? If there are UFS devices for which it takes long to process
that command I think it is up to the vendors of these devices to fix
these UFS devices.

Additionally, if a UFS device needs more than (retries * 60 seconds) to
process a START STOP UNIT command, shouldn't it be marked as broken?

Thanks,

Bart.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 8/9] scsi: ufs: Update the fast abort path in ufshcd_abort() for PM requests
  2021-06-16  4:40                   ` Bart Van Assche
@ 2021-06-16  8:47                     ` Can Guo
  2021-06-16 17:55                       ` Bart Van Assche
  0 siblings, 1 reply; 43+ messages in thread
From: Can Guo @ 2021-06-16  8:47 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team,
	Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	open list

Hi Bart,

On 2021-06-16 12:40, Bart Van Assche wrote:
> On 6/15/21 9:00 PM, Can Guo wrote:
>> I would like to stick to my way as of now because
>> 
>> 1. Merely preventing task abort cannot prevent suspend/resume fail.
>> Task abort (to PM requests), in real cases, is just one of many kinds
>> of failure which can fail the suspend/resume callbacks. During
>> suspend/resume, if AH8 error and/or UIC errors happen, IRQ handler
>> may complete SSU cmd with errors and schedule the error handler (I've
>> seen such scenarios in real customer cases). My idea is to treat task
>> abort (to PM requests) as a failure (let scsi_execute() return with
>> whatever error) and let error handler recover everything just like
>> any other UFS errors which invoke error handler. In case this, again,
>> goes back to the topic that is why don't just do error recovery in
>> suspend/resume, let me paste my previous reply here -
> 
> Does this mean that the IRQ handler can complete an SSU command with an
> error and that the error handler can later recover from that error?

Not exactly, sorry that I didn't put it clearly. There are cases where 
cmds
are completed with an error (either OCS is not SUCCESS or device returns
check condition in resp) and accompanied by fatal or non-fatal UIC 
errors
(UIC errors invoke UFS error handler). For example, SSU is completed 
with
OCS_MISMATCH_RESPONSE_UPIU_SIZE (whatever the reason is in HW), then 
auto
hibern8 enter (AH8 timer timeout hba->ahit is set to a very low value) 
kicks
start right after but fails with fatal UIC errors. From dmesg log, these 
all
happen at once. I've seen even more complicated cases where all kinds of 
errors
mess up together.

> That sounds completely wrong to me. The IRQ handler should never 
> complete any
> command with an error if that error could be recoverable. Instead, the
> IRQ handler should add that command to a list and leave it to the error
> handler to fail that command or to retry it.
> 
>> 2. And say we want SCSI layer to resubmit PM requests to prevent
>> suspend/resume fail, we should keep retrying the PM requests (so
>> long as error handler can recover everything successfully), meaning
>> we should give them unlimited retries (which I think is a bad idea),
>> otherwise (if they have zero retries or limited retries), in extreme
>> conditions, what may happen is that error handler can recover 
>> everything
>> successfully every time, but all these retries (say 3) still time out,
>> which block the power management for too long (retries * 60 seconds) 
>> and,
>> most important, when the last retry times out, scsi layer will anyways
>> complete the PM request (even we return DID_IMM_RETRY), then we end up
>> same - suspend/resume shall run concurrently with error handler and we
>> couldn't recover saved PM errors.
> 
> Hmm ... it is not clear to me why this behavior is considered a 
> problem?
> 

To me, task abort to PM requests is not worth being treated so 
differently,
after all suspend/resume may fail due to any kinds of UFS errors (as 
I've
explained so many times). My idea is to let PM requests fast fail (60 
seconds
has passed, a broken device maybe, we have reason to fail it since it is 
just
a passthrough req) and schedule UFS error handler, UFS error handler 
shall
proceed after suspend/resume fails out then start to recover everything 
in a
safe environment. Is this way not working?

Thanks,

Can Guo.

> What is wrong with blocking RPM while a START STOP UNIT command is 
> being
> processed? If there are UFS devices for which it takes long to process
> that command I think it is up to the vendors of these devices to fix
> these UFS devices.
> 
> Additionally, if a UFS device needs more than (retries * 60 seconds) to
> process a START STOP UNIT command, shouldn't it be marked as broken?
> 
> Thanks,
> 
> Bart.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 1/9] scsi: ufs: Differentiate status between hba pm ops and wl pm ops
  2021-06-10  4:43 ` [PATCH v3 1/9] scsi: ufs: Differentiate status between hba pm ops and wl pm ops Can Guo
  2021-06-10 11:15   ` Adrian Hunter
  2021-06-11 20:40   ` Bart Van Assche
@ 2021-06-16 17:50   ` Bart Van Assche
  2021-06-23  1:32     ` Can Guo
  2 siblings, 1 reply; 43+ messages in thread
From: Bart Van Assche @ 2021-06-16 17:50 UTC (permalink / raw)
  To: Can Guo, asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team
  Cc: Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	Adrian Hunter, Kiwoong Kim, Satya Tangirala, open list

On 6/9/21 9:43 PM, Can Guo wrote:
> @@ -8784,7 +8786,7 @@ static int __ufshcd_wl_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op)
>  	enum ufs_dev_pwr_mode req_dev_pwr_mode;
>  	enum uic_link_state req_link_state;
>  
> -	hba->pm_op_in_progress = true;
> +	hba->wl_pm_op_in_progress = true;
>  	if (pm_op != UFS_SHUTDOWN_PM) {
>  		pm_lvl = pm_op == UFS_RUNTIME_PM ?
>  			 hba->rpm_lvl : hba->spm_lvl;
> @@ -8919,7 +8921,7 @@ static int __ufshcd_wl_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op)
>  		hba->clk_gating.is_suspended = false;
>  		ufshcd_release(hba);
>  	}
> -	hba->pm_op_in_progress = false;
> +	hba->wl_pm_op_in_progress = false;
>  	return ret;
>  }

Are the __ufshcd_wl_suspend() calls serialized in any way? If not, will
the value of wl_pm_op_in_progress be incorrect if multiple kernel
threads run __ufshcd_wl_suspend() concurrently and one of the
__ufshcd_wl_suspend() instances returns earlier than the other?

Thanks,

Bart.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 8/9] scsi: ufs: Update the fast abort path in ufshcd_abort() for PM requests
  2021-06-16  8:47                     ` Can Guo
@ 2021-06-16 17:55                       ` Bart Van Assche
  2021-06-23  1:34                         ` Can Guo
  0 siblings, 1 reply; 43+ messages in thread
From: Bart Van Assche @ 2021-06-16 17:55 UTC (permalink / raw)
  To: Can Guo
  Cc: asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team,
	Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	open list

On 6/16/21 1:47 AM, Can Guo wrote:
> On 2021-06-16 12:40, Bart Van Assche wrote:
>> On 6/15/21 9:00 PM, Can Guo wrote:
>>> 2. And say we want SCSI layer to resubmit PM requests to prevent 
>>> suspend/resume fail, we should keep retrying the PM requests (so 
>>> long as error handler can recover everything successfully),
>>> meaning we should give them unlimited retries (which I think is a
>>> bad idea), otherwise (if they have zero retries or limited
>>> retries), in extreme conditions, what may happen is that error
>>> handler can recover everything successfully every time, but all
>>> these retries (say 3) still time out, which block the power
>>> management for too long (retries * 60 seconds) and, most
>>> important, when the last retry times out, scsi layer will
>>> anyways complete the PM request (even we return DID_IMM_RETRY),
>>> then we end up same - suspend/resume shall run concurrently with
>>> error handler and we couldn't recover saved PM errors.
>> 
>> Hmm ... it is not clear to me why this behavior is considered a
>> problem?
> 
> To me, task abort to PM requests does not worth being treated so 
> differently, after all suspend/resume may fail due to any kinds of
> UFS errors (as I've explained so many times). My idea is to let PM
> requests fast fail (60 seconds has passed, a broken device maybe, we
> have reason to fail it since it is just a passthrough req) and
> schedule UFS error handler, UFS error handler shall proceed after
> suspend/resume fails out then start to recover everything in a safe
> environment. Is this way not working?
Hi Can,

Thank you for the clarification. As you probably know the power
management subsystem serializes runtime power management (RPM) and
system suspend callbacks. I was concerned about the consequences of a
failed RPM transition on system suspend and resume. Having taken a
closer look at the UFS driver, I see that failed RPM transitions do not
require special handling in the system suspend or resume callbacks. In
other words, I'm fine with the approach of failing PM requests fast.

Bart.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 1/9] scsi: ufs: Differentiate status between hba pm ops and wl pm ops
  2021-06-16 17:50   ` Bart Van Assche
@ 2021-06-23  1:32     ` Can Guo
  0 siblings, 0 replies; 43+ messages in thread
From: Can Guo @ 2021-06-23  1:32 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team,
	Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	Adrian Hunter, Kiwoong Kim, Satya Tangirala, open list

Hi Bart,

On 2021-06-17 01:50, Bart Van Assche wrote:
> On 6/9/21 9:43 PM, Can Guo wrote:
>> @@ -8784,7 +8786,7 @@ static int __ufshcd_wl_suspend(struct ufs_hba 
>> *hba, enum ufs_pm_op pm_op)
>>  	enum ufs_dev_pwr_mode req_dev_pwr_mode;
>>  	enum uic_link_state req_link_state;
>> 
>> -	hba->pm_op_in_progress = true;
>> +	hba->wl_pm_op_in_progress = true;
>>  	if (pm_op != UFS_SHUTDOWN_PM) {
>>  		pm_lvl = pm_op == UFS_RUNTIME_PM ?
>>  			 hba->rpm_lvl : hba->spm_lvl;
>> @@ -8919,7 +8921,7 @@ static int __ufshcd_wl_suspend(struct ufs_hba 
>> *hba, enum ufs_pm_op pm_op)
>>  		hba->clk_gating.is_suspended = false;
>>  		ufshcd_release(hba);
>>  	}
>> -	hba->pm_op_in_progress = false;
>> +	hba->wl_pm_op_in_progress = false;
>>  	return ret;
>>  }
> 
> Are the __ufshcd_wl_suspend() calls serialized in any way? If not, will
> the value of wl_pm_op_in_progress be incorrect if multiple kernel
> threads run __ufshcd_wl_suspend() concurrently and one of the
> __ufshcd_wl_suspend() instances returns earlier than the other?
> 

Sorry for getting back late on this... I was stuck on some urgent issues.

Yes, __ufshcd_wl_suspend() calls are serialized, because it is called by
either runtime suspend or system suspend, and runtime suspend and system
suspend are serialized - Rafael J. Wysocki has put a lot of effort into it,
see also 1e2ef05bb8cf8 ("PM: Limit race conditions between runtime PM and
system sleep (v2)").

> Thanks,
> 
> Bart.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH v3 8/9] scsi: ufs: Update the fast abort path in ufshcd_abort() for PM requests
  2021-06-16 17:55                       ` Bart Van Assche
@ 2021-06-23  1:34                         ` Can Guo
  0 siblings, 0 replies; 43+ messages in thread
From: Can Guo @ 2021-06-23  1:34 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: asutoshd, nguyenb, hongwus, ziqichen, linux-scsi, kernel-team,
	Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Jaegeuk Kim,
	open list

Hi Bart,

On 2021-06-17 01:55, Bart Van Assche wrote:
> On 6/16/21 1:47 AM, Can Guo wrote:
>> On 2021-06-16 12:40, Bart Van Assche wrote:
>>> On 6/15/21 9:00 PM, Can Guo wrote:
>>>> 2. And say we want SCSI layer to resubmit PM requests to prevent
>>>> suspend/resume fail, we should keep retrying the PM requests (so
>>>> long as error handler can recover everything successfully),
>>>> meaning we should give them unlimited retries (which I think is a
>>>> bad idea), otherwise (if they have zero retries or limited
>>>> retries), in extreme conditions, what may happen is that error
>>>> handler can recover everything successfully every time, but all
>>>> these retries (say 3) still time out, which block the power
>>>> management for too long (retries * 60 seconds) and, most
>>>> important, when the last retry times out, scsi layer will
>>>> anyways complete the PM request (even we return DID_IMM_RETRY),
>>>> then we end up same - suspend/resume shall run concurrently with
>>>> error handler and we couldn't recover saved PM errors.
>>> 
>>> Hmm ... it is not clear to me why this behavior is considered a
>>> problem?
>> 
>> To me, task abort to PM requests does not worth being treated so
>> differently, after all suspend/resume may fail due to any kinds of
>> UFS errors (as I've explained so many times). My idea is to let PM
>> requests fast fail (60 seconds has passed, a broken device maybe, we
>> have reason to fail it since it is just a passthrough req) and
>> schedule UFS error handler, UFS error handler shall proceed after
>> suspend/resume fails out then start to recover everything in a safe
>> environment. Is this way not working?
> Hi Can,
> 
> Thank you for the clarification. As you probably know the power
> management subsystem serializes runtime power management (RPM) and
> system suspend callbacks. I was concerned about the consequences of a
> failed RPM transition on system suspend and resume. Having taken a
> closer look at the UFS driver, I see that failed RPM transitions do not
> require special handling in the system suspend or resume callbacks. In
> other words, I'm fine with the approach of failing PM requests fast.
> 

Thank you for the time and effort you spent on this series. I will upload
the next version to address your previous comments (hope I can convince
Trilok to pick these up).

Thanks,

Can Guo.

> Bart.

^ permalink raw reply	[flat|nested] 43+ messages in thread

end of thread, other threads:[~2021-06-23  1:35 UTC | newest]

Thread overview: 43+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <1623300218-9454-1-git-send-email-cang@codeaurora.org>
2021-06-10  4:43 ` [PATCH v3 1/9] scsi: ufs: Differentiate status between hba pm ops and wl pm ops Can Guo
2021-06-10 11:15   ` Adrian Hunter
2021-06-11  0:53     ` Can Guo
2021-06-11 20:40   ` Bart Van Assche
2021-06-12  6:20     ` Can Guo
2021-06-16 17:50   ` Bart Van Assche
2021-06-23  1:32     ` Can Guo
2021-06-10  4:43 ` [PATCH v3 2/9] scsi: ufs: Update the return value of supplier " Can Guo
2021-06-10  4:43 ` [PATCH v3 3/9] scsi: ufs: Enable IRQ after enabling clocks in error handling preparation Can Guo
2021-06-10  4:43 ` [PATCH v3 4/9] scsi: ufs: Complete the cmd before returning in queuecommand Can Guo
2021-06-11 20:52   ` Bart Van Assche
2021-06-12  7:38     ` Can Guo
2021-06-12 15:50       ` Bart Van Assche
2021-06-13 13:30         ` Can Guo
2021-06-10  4:43 ` [PATCH v3 5/9] scsi: ufs: Simplify error handling preparation Can Guo
2021-06-10 12:30   ` Adrian Hunter
2021-06-11  3:01     ` Can Guo
2021-06-11 20:58       ` Bart Van Assche
2021-06-12  6:46         ` Can Guo
2021-06-12  9:49           ` Can Guo
2021-06-10  4:43 ` [PATCH v3 6/9] scsi: ufs: Update ufshcd_recover_pm_error() Can Guo
2021-06-10  4:43 ` [PATCH v3 7/9] scsi: ufs: Let host_sem cover the entire system suspend/resume Can Guo
2021-06-10 13:32   ` Adrian Hunter
2021-06-11  3:06     ` Can Guo
2021-06-11 21:00   ` Bart Van Assche
2021-06-12  6:46     ` Can Guo
2021-06-10  4:43 ` [PATCH v3 8/9] scsi: ufs: Update the fast abort path in ufshcd_abort() for PM requests Can Guo
2021-06-11 21:02   ` Bart Van Assche
2021-06-12  7:07     ` Can Guo
2021-06-12 16:50       ` Bart Van Assche
2021-06-13 14:42         ` Can Guo
2021-06-14 18:49           ` Bart Van Assche
2021-06-15  2:36             ` Can Guo
2021-06-15  3:17               ` Can Guo
2021-06-15 18:25               ` Bart Van Assche
2021-06-16  4:00                 ` Can Guo
2021-06-16  4:40                   ` Bart Van Assche
2021-06-16  8:47                     ` Can Guo
2021-06-16 17:55                       ` Bart Van Assche
2021-06-23  1:34                         ` Can Guo
2021-06-10  4:43 ` [PATCH v3 9/9] scsi: ufs: Apply more limitations to user access Can Guo
2021-06-11 21:03   ` Bart Van Assche
2021-06-12  7:13     ` Can Guo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).