linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH V7 1/3] scsi: ufs: Serialize eh_work with system PM events and async scan
       [not found] <1606910644-21185-1-git-send-email-cang@codeaurora.org>
@ 2020-12-02 12:04 ` Can Guo
  2020-12-02 12:04 ` [PATCH V7 2/3] scsi: ufs: Fix a race condition between ufshcd_abort and eh_work Can Guo
  2020-12-02 12:04 ` [PATCH V7 3/3] scsi: ufs: Print host regs in IRQ handler when AH8 error happens Can Guo
  2 siblings, 0 replies; 9+ messages in thread
From: Can Guo @ 2020-12-02 12:04 UTC (permalink / raw)
  To: asutoshd, nguyenb, hongwus, rnayak, linux-scsi, kernel-team,
	saravanak, salyzyn, cang
  Cc: Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Matthias Brugger, Stanley Chu, Bean Huo,
	Bart Van Assche, Satya Tangirala, open list,
	moderated list:ARM/Mediatek SoC support,
	moderated list:ARM/Mediatek SoC support

Serialize eh_work with system PM events and async scan to make sure eh_work
does not run in parallel with them.

Reviewed-by: Stanley Chu <stanley.chu@mediatek.com>
Reviewed-by: Asutosh Das <asutoshd@codeaurora.org>
Reviewed-by: Hongwu Su<hongwus@codeaurora.org>
Signed-off-by: Can Guo <cang@codeaurora.org>
---
 drivers/scsi/ufs/ufshcd.c | 64 +++++++++++++++++++++++++++++------------------
 drivers/scsi/ufs/ufshcd.h |  1 +
 2 files changed, 41 insertions(+), 24 deletions(-)

diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index 47c544d..f0bb3fc 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -5597,7 +5597,9 @@ static inline void ufshcd_schedule_eh_work(struct ufs_hba *hba)
 static void ufshcd_err_handling_prepare(struct ufs_hba *hba)
 {
 	pm_runtime_get_sync(hba->dev);
-	if (pm_runtime_suspended(hba->dev)) {
+	if (pm_runtime_status_suspended(hba->dev) || hba->is_sys_suspended) {
+		enum ufs_pm_op pm_op;
+
 		/*
 		 * Don't assume anything of pm_runtime_get_sync(), if
 		 * resume fails, irq and clocks can be OFF, and powers
@@ -5612,7 +5614,8 @@ static void ufshcd_err_handling_prepare(struct ufs_hba *hba)
 		if (!ufshcd_is_clkgating_allowed(hba))
 			ufshcd_setup_clocks(hba, true);
 		ufshcd_release(hba);
-		ufshcd_vops_resume(hba, UFS_RUNTIME_PM);
+		pm_op = hba->is_sys_suspended ? UFS_SYSTEM_PM : UFS_RUNTIME_PM;
+		ufshcd_vops_resume(hba, pm_op);
 	} else {
 		ufshcd_hold(hba, false);
 		if (hba->clk_scaling.is_allowed) {
@@ -5633,7 +5636,7 @@ static void ufshcd_err_handling_unprepare(struct ufs_hba *hba)
 
 static inline bool ufshcd_err_handling_should_stop(struct ufs_hba *hba)
 {
-	return (hba->ufshcd_state == UFSHCD_STATE_ERROR ||
+	return (!hba->is_powered || hba->ufshcd_state == UFSHCD_STATE_ERROR ||
 		(!(hba->saved_err || hba->saved_uic_err || hba->force_reset ||
 			ufshcd_is_link_broken(hba))));
 }
@@ -5646,6 +5649,7 @@ static void ufshcd_recover_pm_error(struct ufs_hba *hba)
 	struct request_queue *q;
 	int ret;
 
+	hba->is_sys_suspended = false;
 	/*
 	 * Set RPM status of hba device to RPM_ACTIVE,
 	 * this also clears its runtime error.
@@ -5704,11 +5708,13 @@ static void ufshcd_err_handler(struct work_struct *work)
 
 	hba = container_of(work, struct ufs_hba, eh_work);
 
+	down(&hba->eh_sem);
 	spin_lock_irqsave(hba->host->host_lock, flags);
 	if (ufshcd_err_handling_should_stop(hba)) {
 		if (hba->ufshcd_state != UFSHCD_STATE_ERROR)
 			hba->ufshcd_state = UFSHCD_STATE_OPERATIONAL;
 		spin_unlock_irqrestore(hba->host->host_lock, flags);
+		up(&hba->eh_sem);
 		return;
 	}
 	ufshcd_set_eh_in_progress(hba);
@@ -5716,20 +5722,18 @@ static void ufshcd_err_handler(struct work_struct *work)
 	ufshcd_err_handling_prepare(hba);
 	spin_lock_irqsave(hba->host->host_lock, flags);
 	ufshcd_scsi_block_requests(hba);
-	/*
-	 * A full reset and restore might have happened after preparation
-	 * is finished, double check whether we should stop.
-	 */
-	if (ufshcd_err_handling_should_stop(hba)) {
-		if (hba->ufshcd_state != UFSHCD_STATE_ERROR)
-			hba->ufshcd_state = UFSHCD_STATE_OPERATIONAL;
-		goto out;
-	}
 	hba->ufshcd_state = UFSHCD_STATE_RESET;
 
 	/* Complete requests that have door-bell cleared by h/w */
 	ufshcd_complete_requests(hba);
 
+	/*
+	 * A full reset and restore might have happened after preparation
+	 * is finished, double check whether we should stop.
+	 */
+	if (ufshcd_err_handling_should_stop(hba))
+		goto skip_err_handling;
+
 	if (hba->dev_quirks & UFS_DEVICE_QUIRK_RECOVERY_FROM_DL_NAC_ERRORS) {
 		bool ret;
 
@@ -5737,17 +5741,10 @@ static void ufshcd_err_handler(struct work_struct *work)
 		/* release the lock as ufshcd_quirk_dl_nac_errors() may sleep */
 		ret = ufshcd_quirk_dl_nac_errors(hba);
 		spin_lock_irqsave(hba->host->host_lock, flags);
-		if (!ret && !hba->force_reset && ufshcd_is_link_active(hba))
+		if (!ret && ufshcd_err_handling_should_stop(hba))
 			goto skip_err_handling;
 	}
 
-	if (hba->force_reset || ufshcd_is_link_broken(hba) ||
-	    ufshcd_is_saved_err_fatal(hba) ||
-	    ((hba->saved_err & UIC_ERROR) &&
-	     (hba->saved_uic_err & (UFSHCD_UIC_DL_NAC_RECEIVED_ERROR |
-				    UFSHCD_UIC_DL_TCx_REPLAY_ERROR))))
-		needs_reset = true;
-
 	if ((hba->saved_err & (INT_FATAL_ERRORS | UFSHCD_UIC_HIBERN8_MASK)) ||
 	    (hba->saved_uic_err &&
 	     (hba->saved_uic_err != UFSHCD_UIC_PA_GENERIC_ERROR))) {
@@ -5767,8 +5764,14 @@ static void ufshcd_err_handler(struct work_struct *work)
 	 * transfers forcefully because they will get cleared during
 	 * host reset and restore
 	 */
-	if (needs_reset)
+	if (hba->force_reset || ufshcd_is_link_broken(hba) ||
+	    ufshcd_is_saved_err_fatal(hba) ||
+	    ((hba->saved_err & UIC_ERROR) &&
+	     (hba->saved_uic_err & (UFSHCD_UIC_DL_NAC_RECEIVED_ERROR |
+				    UFSHCD_UIC_DL_TCx_REPLAY_ERROR)))) {
+		needs_reset = true;
 		goto do_reset;
+	}
 
 	/*
 	 * If LINERESET was caught, UFS might have been put to PWM mode,
@@ -5876,12 +5879,11 @@ static void ufshcd_err_handler(struct work_struct *work)
 			dev_err_ratelimited(hba->dev, "%s: exit: saved_err 0x%x saved_uic_err 0x%x",
 			    __func__, hba->saved_err, hba->saved_uic_err);
 	}
-
-out:
 	ufshcd_clear_eh_in_progress(hba);
 	spin_unlock_irqrestore(hba->host->host_lock, flags);
 	ufshcd_scsi_unblock_requests(hba);
 	ufshcd_err_handling_unprepare(hba);
+	up(&hba->eh_sem);
 }
 
 /**
@@ -6856,6 +6858,7 @@ static int ufshcd_reset_and_restore(struct ufs_hba *hba)
 	 */
 	scsi_report_bus_reset(hba->host, 0);
 	if (err) {
+		hba->ufshcd_state = UFSHCD_STATE_ERROR;
 		hba->saved_err |= saved_err;
 		hba->saved_uic_err |= saved_uic_err;
 	}
@@ -7704,8 +7707,10 @@ static void ufshcd_async_scan(void *data, async_cookie_t cookie)
 	struct ufs_hba *hba = (struct ufs_hba *)data;
 	int ret;
 
+	down(&hba->eh_sem);
 	/* Initialize hba, detect and initialize UFS device */
 	ret = ufshcd_probe_hba(hba, true);
+	up(&hba->eh_sem);
 	if (ret)
 		goto out;
 
@@ -8718,6 +8723,7 @@ int ufshcd_system_suspend(struct ufs_hba *hba)
 	int ret = 0;
 	ktime_t start = ktime_get();
 
+	down(&hba->eh_sem);
 	if (!hba || !hba->is_powered)
 		return 0;
 
@@ -8748,6 +8754,8 @@ int ufshcd_system_suspend(struct ufs_hba *hba)
 		hba->curr_dev_pwr_mode, hba->uic_link_state);
 	if (!ret)
 		hba->is_sys_suspended = true;
+	else
+		up(&hba->eh_sem);
 	return ret;
 }
 EXPORT_SYMBOL(ufshcd_system_suspend);
@@ -8764,8 +8772,10 @@ int ufshcd_system_resume(struct ufs_hba *hba)
 	int ret = 0;
 	ktime_t start = ktime_get();
 
-	if (!hba)
+	if (!hba) {
+		up(&hba->eh_sem);
 		return -EINVAL;
+	}
 
 	if (!hba->is_powered || pm_runtime_suspended(hba->dev))
 		/*
@@ -8781,6 +8791,7 @@ int ufshcd_system_resume(struct ufs_hba *hba)
 		hba->curr_dev_pwr_mode, hba->uic_link_state);
 	if (!ret)
 		hba->is_sys_suspended = false;
+	up(&hba->eh_sem);
 	return ret;
 }
 EXPORT_SYMBOL(ufshcd_system_resume);
@@ -8872,6 +8883,7 @@ int ufshcd_shutdown(struct ufs_hba *hba)
 {
 	int ret = 0;
 
+	down(&hba->eh_sem);
 	if (!hba->is_powered)
 		goto out;
 
@@ -8888,6 +8900,8 @@ int ufshcd_shutdown(struct ufs_hba *hba)
 out:
 	if (ret)
 		dev_err(hba->dev, "%s failed, err %d\n", __func__, ret);
+	hba->is_powered = false;
+	up(&hba->eh_sem);
 	/* allow force shutdown even in case of errors */
 	return 0;
 }
@@ -9082,6 +9096,8 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq)
 	INIT_WORK(&hba->eh_work, ufshcd_err_handler);
 	INIT_WORK(&hba->eeh_work, ufshcd_exception_event_handler);
 
+	sema_init(&hba->eh_sem, 1);
+
 	/* Initialize UIC command mutex */
 	mutex_init(&hba->uic_cmd_mutex);
 
diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
index 47eb143..1e680bf 100644
--- a/drivers/scsi/ufs/ufshcd.h
+++ b/drivers/scsi/ufs/ufshcd.h
@@ -728,6 +728,7 @@ struct ufs_hba {
 	u32 intr_mask;
 	u16 ee_ctrl_mask;
 	bool is_powered;
+	struct semaphore eh_sem;
 
 	/* Work Queues */
 	struct workqueue_struct *eh_wq;
-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH V7 2/3] scsi: ufs: Fix a race condition between ufshcd_abort and eh_work
       [not found] <1606910644-21185-1-git-send-email-cang@codeaurora.org>
  2020-12-02 12:04 ` [PATCH V7 1/3] scsi: ufs: Serialize eh_work with system PM events and async scan Can Guo
@ 2020-12-02 12:04 ` Can Guo
  2020-12-03  2:21   ` Stanley Chu
  2020-12-02 12:04 ` [PATCH V7 3/3] scsi: ufs: Print host regs in IRQ handler when AH8 error happens Can Guo
  2 siblings, 1 reply; 9+ messages in thread
From: Can Guo @ 2020-12-02 12:04 UTC (permalink / raw)
  To: asutoshd, nguyenb, hongwus, rnayak, linux-scsi, kernel-team,
	saravanak, salyzyn, cang
  Cc: Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Bart Van Assche,
	Satya Tangirala, open list

In current task abort routine, if task abort happens to the device W-LU,
the code directly jumps to ufshcd_eh_host_reset_handler() to perform a
full reset and restore then returns FAIL or SUCCESS. Commands sent to the
device W-LU are most likely the SSU cmds sent during UFS PM operations. If
such SSU cmd enters task abort routine, when ufshcd_eh_host_reset_handler()
flushes eh_work, it will get stuck there since err_handler is serialized
with PM operations.

In order to unblock above call path, we merely clean up the lrb taken by
this cmd, queue the eh_work and return SUCCESS. Once the cmd is aborted,
the PM operation which sends out the cmd just errors out, then err_handler
shall be able to proceed with the full reset and restore.

In this scenario, the cmd is aborted even before it is actually cleared by
HW, set the lrb->in_use flag to prevent subsequent cmds, including SCSI
cmds and dev cmds, from taking the lrb released from abort. The flag shall
evetually be cleared in __ufshcd_transfer_req_compl() invoked by the full
reset and restore from err_handler.

Reviewed-by: Asutosh Das <asutoshd@codeaurora.org>
Signed-off-by: Can Guo <cang@codeaurora.org>
---
 drivers/scsi/ufs/ufshcd.c | 60 +++++++++++++++++++++++++++++++++++++----------
 drivers/scsi/ufs/ufshcd.h |  2 ++
 2 files changed, 49 insertions(+), 13 deletions(-)

diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index f0bb3fc..26c1fa0 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -2539,6 +2539,14 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
 		(hba->clk_gating.state != CLKS_ON));
 
 	lrbp = &hba->lrb[tag];
+	if (unlikely(lrbp->in_use)) {
+		if (hba->pm_op_in_progress)
+			set_host_byte(cmd, DID_BAD_TARGET);
+		else
+			err = SCSI_MLQUEUE_HOST_BUSY;
+		ufshcd_release(hba);
+		goto out;
+	}
 
 	WARN_ON(lrbp->cmd);
 	lrbp->cmd = cmd;
@@ -2781,6 +2789,11 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba *hba,
 
 	init_completion(&wait);
 	lrbp = &hba->lrb[tag];
+	if (unlikely(lrbp->in_use)) {
+		err = -EBUSY;
+		goto out;
+	}
+
 	WARN_ON(lrbp->cmd);
 	err = ufshcd_compose_dev_cmd(hba, lrbp, cmd_type, tag);
 	if (unlikely(err))
@@ -2797,6 +2810,7 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba *hba,
 
 	err = ufshcd_wait_for_dev_cmd(hba, lrbp, timeout);
 
+out:
 	ufshcd_add_query_upiu_trace(hba, tag,
 			err ? "query_complete_err" : "query_complete");
 
@@ -4929,9 +4943,11 @@ static void __ufshcd_transfer_req_compl(struct ufs_hba *hba,
 	struct scsi_cmnd *cmd;
 	int result;
 	int index;
+	bool update_scaling = false;
 
 	for_each_set_bit(index, &completed_reqs, hba->nutrs) {
 		lrbp = &hba->lrb[index];
+		lrbp->in_use = false;
 		lrbp->compl_time_stamp = ktime_get();
 		cmd = lrbp->cmd;
 		if (cmd) {
@@ -4944,15 +4960,17 @@ static void __ufshcd_transfer_req_compl(struct ufs_hba *hba,
 			/* Do not touch lrbp after scsi done */
 			cmd->scsi_done(cmd);
 			__ufshcd_release(hba);
+			update_scaling = true;
 		} else if (lrbp->command_type == UTP_CMD_TYPE_DEV_MANAGE ||
 			lrbp->command_type == UTP_CMD_TYPE_UFS_STORAGE) {
 			if (hba->dev_cmd.complete) {
 				ufshcd_add_command_trace(hba, index,
 						"dev_complete");
 				complete(hba->dev_cmd.complete);
+				update_scaling = true;
 			}
 		}
-		if (ufshcd_is_clkscaling_supported(hba))
+		if (ufshcd_is_clkscaling_supported(hba) && update_scaling)
 			hba->clk_scaling.active_reqs--;
 	}
 
@@ -6374,8 +6392,12 @@ static int ufshcd_issue_devman_upiu_cmd(struct ufs_hba *hba,
 
 	init_completion(&wait);
 	lrbp = &hba->lrb[tag];
-	WARN_ON(lrbp->cmd);
+	if (unlikely(lrbp->in_use)) {
+		err = -EBUSY;
+		goto out;
+	}
 
+	WARN_ON(lrbp->cmd);
 	lrbp->cmd = NULL;
 	lrbp->sense_bufflen = 0;
 	lrbp->sense_buffer = NULL;
@@ -6447,6 +6469,7 @@ static int ufshcd_issue_devman_upiu_cmd(struct ufs_hba *hba,
 		}
 	}
 
+out:
 	blk_put_request(req);
 out_unlock:
 	up_read(&hba->clk_scaling_lock);
@@ -6696,16 +6719,6 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
 		BUG();
 	}
 
-	/*
-	 * Task abort to the device W-LUN is illegal. When this command
-	 * will fail, due to spec violation, scsi err handling next step
-	 * will be to send LU reset which, again, is a spec violation.
-	 * To avoid these unnecessary/illegal step we skip to the last error
-	 * handling stage: reset and restore.
-	 */
-	if (lrbp->lun == UFS_UPIU_UFS_DEVICE_WLUN)
-		return ufshcd_eh_host_reset_handler(cmd);
-
 	ufshcd_hold(hba, false);
 	reg = ufshcd_readl(hba, REG_UTP_TRANSFER_REQ_DOOR_BELL);
 	/* If command is already aborted/completed, return SUCCESS */
@@ -6726,7 +6739,7 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
 	 * to reduce repeated printouts. For other aborted requests only print
 	 * basic details.
 	 */
-	scsi_print_command(hba->lrb[tag].cmd);
+	scsi_print_command(cmd);
 	if (!hba->req_abort_count) {
 		ufshcd_update_reg_hist(&hba->ufs_stats.task_abort, 0);
 		ufshcd_print_host_regs(hba);
@@ -6745,6 +6758,27 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
 		goto cleanup;
 	}
 
+	/*
+	 * Task abort to the device W-LUN is illegal. When this command
+	 * will fail, due to spec violation, scsi err handling next step
+	 * will be to send LU reset which, again, is a spec violation.
+	 * To avoid these unnecessary/illegal steps, first we clean up
+	 * the lrb taken by this cmd and mark the lrb as in_use, then
+	 * queue the eh_work and bail.
+	 */
+	if (lrbp->lun == UFS_UPIU_UFS_DEVICE_WLUN) {
+		spin_lock_irqsave(host->host_lock, flags);
+		if (lrbp->cmd) {
+			__ufshcd_transfer_req_compl(hba, (1UL << tag));
+			__set_bit(tag, &hba->outstanding_reqs);
+			lrbp->in_use = true;
+			hba->force_reset = true;
+			ufshcd_schedule_eh_work(hba);
+		}
+		spin_unlock_irqrestore(host->host_lock, flags);
+		goto out;
+	}
+
 	/* Skip task abort in case previous aborts failed and report failure */
 	if (lrbp->req_abort_skip)
 		err = -EIO;
diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
index 1e680bf..66e5338 100644
--- a/drivers/scsi/ufs/ufshcd.h
+++ b/drivers/scsi/ufs/ufshcd.h
@@ -163,6 +163,7 @@ struct ufs_pm_lvl_states {
  * @crypto_key_slot: the key slot to use for inline crypto (-1 if none)
  * @data_unit_num: the data unit number for the first block for inline crypto
  * @req_abort_skip: skip request abort task flag
+ * @in_use: indicates that this lrb is still in use
  */
 struct ufshcd_lrb {
 	struct utp_transfer_req_desc *utr_descriptor_ptr;
@@ -192,6 +193,7 @@ struct ufshcd_lrb {
 #endif
 
 	bool req_abort_skip;
+	bool in_use;
 };
 
 /**
-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH V7 3/3] scsi: ufs: Print host regs in IRQ handler when AH8 error happens
       [not found] <1606910644-21185-1-git-send-email-cang@codeaurora.org>
  2020-12-02 12:04 ` [PATCH V7 1/3] scsi: ufs: Serialize eh_work with system PM events and async scan Can Guo
  2020-12-02 12:04 ` [PATCH V7 2/3] scsi: ufs: Fix a race condition between ufshcd_abort and eh_work Can Guo
@ 2020-12-02 12:04 ` Can Guo
  2 siblings, 0 replies; 9+ messages in thread
From: Can Guo @ 2020-12-02 12:04 UTC (permalink / raw)
  To: asutoshd, nguyenb, hongwus, rnayak, linux-scsi, kernel-team,
	saravanak, salyzyn, cang
  Cc: Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Matthias Brugger, Stanley Chu, Bean Huo,
	Bart Van Assche, open list,
	moderated list:ARM/Mediatek SoC support,
	moderated list:ARM/Mediatek SoC support

When AH8 error happens, all the regs and states are dumped in err handler.
Sometime we need to look into host regs right after AH8 error happens,
which is before leaving the IRQ handler.

Reviewed-by: Stanley Chu <stanley.chu@mediatek.com>
Reviewed-by: Bao D. Nguyen <nguyenb@codeaurora.org>
Reviewed-by: Asutosh Das <asutoshd@codeaurora.org>
Reviewed-by: Hongwu Su<hongwus@codeaurora.org>
Signed-off-by: Can Guo <cang@codeaurora.org>
---
 drivers/scsi/ufs/ufshcd.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index 26c1fa0..4561601 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -6060,7 +6060,8 @@ static irqreturn_t ufshcd_check_errors(struct ufs_hba *hba)
 		hba->saved_uic_err |= hba->uic_error;
 
 		/* dump controller state before resetting */
-		if ((hba->saved_err & (INT_FATAL_ERRORS)) ||
+		if ((hba->saved_err &
+		     (INT_FATAL_ERRORS | UFSHCD_UIC_HIBERN8_MASK)) ||
 		    (hba->saved_uic_err &&
 		     (hba->saved_uic_err != UFSHCD_UIC_PA_GENERIC_ERROR))) {
 			dev_err(hba->dev, "%s: saved_err 0x%x saved_uic_err 0x%x\n",
-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH V7 2/3] scsi: ufs: Fix a race condition between ufshcd_abort and eh_work
  2020-12-02 12:04 ` [PATCH V7 2/3] scsi: ufs: Fix a race condition between ufshcd_abort and eh_work Can Guo
@ 2020-12-03  2:21   ` Stanley Chu
  2020-12-03  4:01     ` Can Guo
  0 siblings, 1 reply; 9+ messages in thread
From: Stanley Chu @ 2020-12-03  2:21 UTC (permalink / raw)
  To: Can Guo
  Cc: asutoshd, nguyenb, hongwus, rnayak, linux-scsi, kernel-team,
	saravanak, salyzyn, Alim Akhtar, Avri Altman,
	James E.J. Bottomley, Martin K. Petersen, Bean Huo,
	Bart Van Assche, Satya Tangirala, open list

On Wed, 2020-12-02 at 04:04 -0800, Can Guo wrote:
> In current task abort routine, if task abort happens to the device W-LU,
> the code directly jumps to ufshcd_eh_host_reset_handler() to perform a
> full reset and restore then returns FAIL or SUCCESS. Commands sent to the
> device W-LU are most likely the SSU cmds sent during UFS PM operations. If
> such SSU cmd enters task abort routine, when ufshcd_eh_host_reset_handler()
> flushes eh_work, it will get stuck there since err_handler is serialized
> with PM operations.
> 
> In order to unblock above call path, we merely clean up the lrb taken by
> this cmd, queue the eh_work and return SUCCESS. Once the cmd is aborted,
> the PM operation which sends out the cmd just errors out, then err_handler
> shall be able to proceed with the full reset and restore.
> 
> In this scenario, the cmd is aborted even before it is actually cleared by
> HW, set the lrb->in_use flag to prevent subsequent cmds, including SCSI
> cmds and dev cmds, from taking the lrb released from abort. The flag shall
> evetually be cleared in __ufshcd_transfer_req_compl() invoked by the full
> reset and restore from err_handler.
> 
> Reviewed-by: Asutosh Das <asutoshd@codeaurora.org>
> Signed-off-by: Can Guo <cang@codeaurora.org>
> ---
>  drivers/scsi/ufs/ufshcd.c | 60 +++++++++++++++++++++++++++++++++++++----------
>  drivers/scsi/ufs/ufshcd.h |  2 ++
>  2 files changed, 49 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
> index f0bb3fc..26c1fa0 100644
> --- a/drivers/scsi/ufs/ufshcd.c
> +++ b/drivers/scsi/ufs/ufshcd.c
> @@ -2539,6 +2539,14 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
>  		(hba->clk_gating.state != CLKS_ON));
>  
>  	lrbp = &hba->lrb[tag];
> +	if (unlikely(lrbp->in_use)) {
> +		if (hba->pm_op_in_progress)
> +			set_host_byte(cmd, DID_BAD_TARGET);
> +		else
> +			err = SCSI_MLQUEUE_HOST_BUSY;
> +		ufshcd_release(hba);
> +		goto out;
> +	}
>  
>  	WARN_ON(lrbp->cmd);
>  	lrbp->cmd = cmd;
> @@ -2781,6 +2789,11 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba *hba,
>  
>  	init_completion(&wait);
>  	lrbp = &hba->lrb[tag];
> +	if (unlikely(lrbp->in_use)) {
> +		err = -EBUSY;
> +		goto out;
> +	}
> +
>  	WARN_ON(lrbp->cmd);
>  	err = ufshcd_compose_dev_cmd(hba, lrbp, cmd_type, tag);
>  	if (unlikely(err))
> @@ -2797,6 +2810,7 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba *hba,
>  
>  	err = ufshcd_wait_for_dev_cmd(hba, lrbp, timeout);
>  
> +out:
>  	ufshcd_add_query_upiu_trace(hba, tag,
>  			err ? "query_complete_err" : "query_complete");
>  
> @@ -4929,9 +4943,11 @@ static void __ufshcd_transfer_req_compl(struct ufs_hba *hba,
>  	struct scsi_cmnd *cmd;
>  	int result;
>  	int index;
> +	bool update_scaling = false;
>  
>  	for_each_set_bit(index, &completed_reqs, hba->nutrs) {
>  		lrbp = &hba->lrb[index];
> +		lrbp->in_use = false;
>  		lrbp->compl_time_stamp = ktime_get();
>  		cmd = lrbp->cmd;
>  		if (cmd) {
> @@ -4944,15 +4960,17 @@ static void __ufshcd_transfer_req_compl(struct ufs_hba *hba,
>  			/* Do not touch lrbp after scsi done */
>  			cmd->scsi_done(cmd);
>  			__ufshcd_release(hba);
> +			update_scaling = true;
>  		} else if (lrbp->command_type == UTP_CMD_TYPE_DEV_MANAGE ||
>  			lrbp->command_type == UTP_CMD_TYPE_UFS_STORAGE) {
>  			if (hba->dev_cmd.complete) {
>  				ufshcd_add_command_trace(hba, index,
>  						"dev_complete");
>  				complete(hba->dev_cmd.complete);
> +				update_scaling = true;
>  			}
>  		}
> -		if (ufshcd_is_clkscaling_supported(hba))
> +		if (ufshcd_is_clkscaling_supported(hba) && update_scaling)
>  			hba->clk_scaling.active_reqs--;
>  	}
>  
> @@ -6374,8 +6392,12 @@ static int ufshcd_issue_devman_upiu_cmd(struct ufs_hba *hba,
>  
>  	init_completion(&wait);
>  	lrbp = &hba->lrb[tag];
> -	WARN_ON(lrbp->cmd);
> +	if (unlikely(lrbp->in_use)) {
> +		err = -EBUSY;
> +		goto out;
> +	}
>  
> +	WARN_ON(lrbp->cmd);
>  	lrbp->cmd = NULL;
>  	lrbp->sense_bufflen = 0;
>  	lrbp->sense_buffer = NULL;
> @@ -6447,6 +6469,7 @@ static int ufshcd_issue_devman_upiu_cmd(struct ufs_hba *hba,
>  		}
>  	}
>  
> +out:
>  	blk_put_request(req);
>  out_unlock:
>  	up_read(&hba->clk_scaling_lock);
> @@ -6696,16 +6719,6 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
>  		BUG();
>  	}
>  
> -	/*
> -	 * Task abort to the device W-LUN is illegal. When this command
> -	 * will fail, due to spec violation, scsi err handling next step
> -	 * will be to send LU reset which, again, is a spec violation.
> -	 * To avoid these unnecessary/illegal step we skip to the last error
> -	 * handling stage: reset and restore.
> -	 */
> -	if (lrbp->lun == UFS_UPIU_UFS_DEVICE_WLUN)
> -		return ufshcd_eh_host_reset_handler(cmd);
> -
>  	ufshcd_hold(hba, false);
>  	reg = ufshcd_readl(hba, REG_UTP_TRANSFER_REQ_DOOR_BELL);
>  	/* If command is already aborted/completed, return SUCCESS */
> @@ -6726,7 +6739,7 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
>  	 * to reduce repeated printouts. For other aborted requests only print
>  	 * basic details.
>  	 */
> -	scsi_print_command(hba->lrb[tag].cmd);
> +	scsi_print_command(cmd);
>  	if (!hba->req_abort_count) {
>  		ufshcd_update_reg_hist(&hba->ufs_stats.task_abort, 0);
>  		ufshcd_print_host_regs(hba);
> @@ -6745,6 +6758,27 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
>  		goto cleanup;
>  	}
>  
> +	/*
> +	 * Task abort to the device W-LUN is illegal. When this command
> +	 * will fail, due to spec violation, scsi err handling next step
> +	 * will be to send LU reset which, again, is a spec violation.
> +	 * To avoid these unnecessary/illegal steps, first we clean up
> +	 * the lrb taken by this cmd and mark the lrb as in_use, then
> +	 * queue the eh_work and bail.
> +	 */
> +	if (lrbp->lun == UFS_UPIU_UFS_DEVICE_WLUN) {
> +		spin_lock_irqsave(host->host_lock, flags);
> +		if (lrbp->cmd) {
> +			__ufshcd_transfer_req_compl(hba, (1UL << tag));
> +			__set_bit(tag, &hba->outstanding_reqs);
> +			lrbp->in_use = true;
> +			hba->force_reset = true;
> +			ufshcd_schedule_eh_work(hba);

ufshcd_schedule_eh_work() will set hba->ufshcd_state as
UFSHCD_STATE_EH_SCHEDULED_FATAL. While in this state,
ufshcd_queuecommand() will set_host_byte(DID_BAD_TARGET) which is
similar as what you would like to do in this patch.

Is this enough for avoiding reusing tag issue? Just wonder if
lrpb->in_use flag is really required to be added.

> +		}
> +		spin_unlock_irqrestore(host->host_lock, flags);
> +		goto out;
> +	}
> +
>  	/* Skip task abort in case previous aborts failed and report failure */
>  	if (lrbp->req_abort_skip)
>  		err = -EIO;
> diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
> index 1e680bf..66e5338 100644
> --- a/drivers/scsi/ufs/ufshcd.h
> +++ b/drivers/scsi/ufs/ufshcd.h
> @@ -163,6 +163,7 @@ struct ufs_pm_lvl_states {
>   * @crypto_key_slot: the key slot to use for inline crypto (-1 if none)
>   * @data_unit_num: the data unit number for the first block for inline crypto
>   * @req_abort_skip: skip request abort task flag
> + * @in_use: indicates that this lrb is still in use
>   */
>  struct ufshcd_lrb {
>  	struct utp_transfer_req_desc *utr_descriptor_ptr;
> @@ -192,6 +193,7 @@ struct ufshcd_lrb {
>  #endif
>  
>  	bool req_abort_skip;
> +	bool in_use;
>  };
>  
>  /**


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH V7 2/3] scsi: ufs: Fix a race condition between ufshcd_abort and eh_work
  2020-12-03  2:21   ` Stanley Chu
@ 2020-12-03  4:01     ` Can Guo
  2020-12-03  4:51       ` Stanley Chu
  0 siblings, 1 reply; 9+ messages in thread
From: Can Guo @ 2020-12-03  4:01 UTC (permalink / raw)
  To: Stanley Chu
  Cc: asutoshd, nguyenb, hongwus, rnayak, linux-scsi, kernel-team,
	saravanak, salyzyn, Alim Akhtar, Avri Altman,
	James E.J. Bottomley, Martin K. Petersen, Bean Huo,
	Bart Van Assche, Satya Tangirala, open list

On 2020-12-03 10:21, Stanley Chu wrote:
> On Wed, 2020-12-02 at 04:04 -0800, Can Guo wrote:
>> In current task abort routine, if task abort happens to the device 
>> W-LU,
>> the code directly jumps to ufshcd_eh_host_reset_handler() to perform a
>> full reset and restore then returns FAIL or SUCCESS. Commands sent to 
>> the
>> device W-LU are most likely the SSU cmds sent during UFS PM 
>> operations. If
>> such SSU cmd enters task abort routine, when 
>> ufshcd_eh_host_reset_handler()
>> flushes eh_work, it will get stuck there since err_handler is 
>> serialized
>> with PM operations.
>> 
>> In order to unblock above call path, we merely clean up the lrb taken 
>> by
>> this cmd, queue the eh_work and return SUCCESS. Once the cmd is 
>> aborted,
>> the PM operation which sends out the cmd just errors out, then 
>> err_handler
>> shall be able to proceed with the full reset and restore.
>> 
>> In this scenario, the cmd is aborted even before it is actually 
>> cleared by
>> HW, set the lrb->in_use flag to prevent subsequent cmds, including 
>> SCSI
>> cmds and dev cmds, from taking the lrb released from abort. The flag 
>> shall
>> evetually be cleared in __ufshcd_transfer_req_compl() invoked by the 
>> full
>> reset and restore from err_handler.
>> 
>> Reviewed-by: Asutosh Das <asutoshd@codeaurora.org>
>> Signed-off-by: Can Guo <cang@codeaurora.org>
>> ---
>>  drivers/scsi/ufs/ufshcd.c | 60 
>> +++++++++++++++++++++++++++++++++++++----------
>>  drivers/scsi/ufs/ufshcd.h |  2 ++
>>  2 files changed, 49 insertions(+), 13 deletions(-)
>> 
>> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
>> index f0bb3fc..26c1fa0 100644
>> --- a/drivers/scsi/ufs/ufshcd.c
>> +++ b/drivers/scsi/ufs/ufshcd.c
>> @@ -2539,6 +2539,14 @@ static int ufshcd_queuecommand(struct Scsi_Host 
>> *host, struct scsi_cmnd *cmd)
>>  		(hba->clk_gating.state != CLKS_ON));
>> 
>>  	lrbp = &hba->lrb[tag];
>> +	if (unlikely(lrbp->in_use)) {
>> +		if (hba->pm_op_in_progress)
>> +			set_host_byte(cmd, DID_BAD_TARGET);
>> +		else
>> +			err = SCSI_MLQUEUE_HOST_BUSY;
>> +		ufshcd_release(hba);
>> +		goto out;
>> +	}
>> 
>>  	WARN_ON(lrbp->cmd);
>>  	lrbp->cmd = cmd;
>> @@ -2781,6 +2789,11 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba 
>> *hba,
>> 
>>  	init_completion(&wait);
>>  	lrbp = &hba->lrb[tag];
>> +	if (unlikely(lrbp->in_use)) {
>> +		err = -EBUSY;
>> +		goto out;
>> +	}
>> +
>>  	WARN_ON(lrbp->cmd);
>>  	err = ufshcd_compose_dev_cmd(hba, lrbp, cmd_type, tag);
>>  	if (unlikely(err))
>> @@ -2797,6 +2810,7 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba 
>> *hba,
>> 
>>  	err = ufshcd_wait_for_dev_cmd(hba, lrbp, timeout);
>> 
>> +out:
>>  	ufshcd_add_query_upiu_trace(hba, tag,
>>  			err ? "query_complete_err" : "query_complete");
>> 
>> @@ -4929,9 +4943,11 @@ static void __ufshcd_transfer_req_compl(struct 
>> ufs_hba *hba,
>>  	struct scsi_cmnd *cmd;
>>  	int result;
>>  	int index;
>> +	bool update_scaling = false;
>> 
>>  	for_each_set_bit(index, &completed_reqs, hba->nutrs) {
>>  		lrbp = &hba->lrb[index];
>> +		lrbp->in_use = false;
>>  		lrbp->compl_time_stamp = ktime_get();
>>  		cmd = lrbp->cmd;
>>  		if (cmd) {
>> @@ -4944,15 +4960,17 @@ static void __ufshcd_transfer_req_compl(struct 
>> ufs_hba *hba,
>>  			/* Do not touch lrbp after scsi done */
>>  			cmd->scsi_done(cmd);
>>  			__ufshcd_release(hba);
>> +			update_scaling = true;
>>  		} else if (lrbp->command_type == UTP_CMD_TYPE_DEV_MANAGE ||
>>  			lrbp->command_type == UTP_CMD_TYPE_UFS_STORAGE) {
>>  			if (hba->dev_cmd.complete) {
>>  				ufshcd_add_command_trace(hba, index,
>>  						"dev_complete");
>>  				complete(hba->dev_cmd.complete);
>> +				update_scaling = true;
>>  			}
>>  		}
>> -		if (ufshcd_is_clkscaling_supported(hba))
>> +		if (ufshcd_is_clkscaling_supported(hba) && update_scaling)
>>  			hba->clk_scaling.active_reqs--;
>>  	}
>> 
>> @@ -6374,8 +6392,12 @@ static int ufshcd_issue_devman_upiu_cmd(struct 
>> ufs_hba *hba,
>> 
>>  	init_completion(&wait);
>>  	lrbp = &hba->lrb[tag];
>> -	WARN_ON(lrbp->cmd);
>> +	if (unlikely(lrbp->in_use)) {
>> +		err = -EBUSY;
>> +		goto out;
>> +	}
>> 
>> +	WARN_ON(lrbp->cmd);
>>  	lrbp->cmd = NULL;
>>  	lrbp->sense_bufflen = 0;
>>  	lrbp->sense_buffer = NULL;
>> @@ -6447,6 +6469,7 @@ static int ufshcd_issue_devman_upiu_cmd(struct 
>> ufs_hba *hba,
>>  		}
>>  	}
>> 
>> +out:
>>  	blk_put_request(req);
>>  out_unlock:
>>  	up_read(&hba->clk_scaling_lock);
>> @@ -6696,16 +6719,6 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
>>  		BUG();
>>  	}
>> 
>> -	/*
>> -	 * Task abort to the device W-LUN is illegal. When this command
>> -	 * will fail, due to spec violation, scsi err handling next step
>> -	 * will be to send LU reset which, again, is a spec violation.
>> -	 * To avoid these unnecessary/illegal step we skip to the last error
>> -	 * handling stage: reset and restore.
>> -	 */
>> -	if (lrbp->lun == UFS_UPIU_UFS_DEVICE_WLUN)
>> -		return ufshcd_eh_host_reset_handler(cmd);
>> -
>>  	ufshcd_hold(hba, false);
>>  	reg = ufshcd_readl(hba, REG_UTP_TRANSFER_REQ_DOOR_BELL);
>>  	/* If command is already aborted/completed, return SUCCESS */
>> @@ -6726,7 +6739,7 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
>>  	 * to reduce repeated printouts. For other aborted requests only 
>> print
>>  	 * basic details.
>>  	 */
>> -	scsi_print_command(hba->lrb[tag].cmd);
>> +	scsi_print_command(cmd);
>>  	if (!hba->req_abort_count) {
>>  		ufshcd_update_reg_hist(&hba->ufs_stats.task_abort, 0);
>>  		ufshcd_print_host_regs(hba);
>> @@ -6745,6 +6758,27 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
>>  		goto cleanup;
>>  	}
>> 
>> +	/*
>> +	 * Task abort to the device W-LUN is illegal. When this command
>> +	 * will fail, due to spec violation, scsi err handling next step
>> +	 * will be to send LU reset which, again, is a spec violation.
>> +	 * To avoid these unnecessary/illegal steps, first we clean up
>> +	 * the lrb taken by this cmd and mark the lrb as in_use, then
>> +	 * queue the eh_work and bail.
>> +	 */
>> +	if (lrbp->lun == UFS_UPIU_UFS_DEVICE_WLUN) {
>> +		spin_lock_irqsave(host->host_lock, flags);
>> +		if (lrbp->cmd) {
>> +			__ufshcd_transfer_req_compl(hba, (1UL << tag));
>> +			__set_bit(tag, &hba->outstanding_reqs);
>> +			lrbp->in_use = true;
>> +			hba->force_reset = true;
>> +			ufshcd_schedule_eh_work(hba);
> 
> ufshcd_schedule_eh_work() will set hba->ufshcd_state as
> UFSHCD_STATE_EH_SCHEDULED_FATAL. While in this state,
> ufshcd_queuecommand() will set_host_byte(DID_BAD_TARGET) which is
> similar as what you would like to do in this patch.
> 
> Is this enough for avoiding reusing tag issue? Just wonder if
> lrpb->in_use flag is really required to be added.

Hi Stanley,

Thanks for the discussion.

To be accurate, it is to prevent lrb from being re-used, not the
tag. Block layer and/or scsi layer can re-use the tag right after
we abort the cmd, but the lrb is empty since we cleared it from
abort path and we need to make sure the lrb stays empty before the
full reset and restore happens. So, in queuecommand path, we have
below checks to prevernt the lrb being re-used. This is before
hba->ufshcd_state checks.

+    if (unlikely(lrbp->in_use)) {
+        if (hba->pm_op_in_progress)
+            set_host_byte(cmd, DID_BAD_TARGET);
+        else
+            err = SCSI_MLQUEUE_HOST_BUSY;
+        ufshcd_release(hba);
+        goto out;
+    }

In above checks, below exception is for the case that a SSU cmd
sent from PM ops is trying to re-use the lrb. In this case, we
should simply let it fail so that PM ops errors out to unblock
error handling (since error handling is serialized with PM ops).

+        if (hba->pm_op_in_progress)
+            set_host_byte(cmd, DID_BAD_TARGET);

Thanks,

Can Guo.

> 
>> +		}
>> +		spin_unlock_irqrestore(host->host_lock, flags);
>> +		goto out;
>> +	}
>> +
>>  	/* Skip task abort in case previous aborts failed and report failure 
>> */
>>  	if (lrbp->req_abort_skip)
>>  		err = -EIO;
>> diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
>> index 1e680bf..66e5338 100644
>> --- a/drivers/scsi/ufs/ufshcd.h
>> +++ b/drivers/scsi/ufs/ufshcd.h
>> @@ -163,6 +163,7 @@ struct ufs_pm_lvl_states {
>>   * @crypto_key_slot: the key slot to use for inline crypto (-1 if 
>> none)
>>   * @data_unit_num: the data unit number for the first block for 
>> inline crypto
>>   * @req_abort_skip: skip request abort task flag
>> + * @in_use: indicates that this lrb is still in use
>>   */
>>  struct ufshcd_lrb {
>>  	struct utp_transfer_req_desc *utr_descriptor_ptr;
>> @@ -192,6 +193,7 @@ struct ufshcd_lrb {
>>  #endif
>> 
>>  	bool req_abort_skip;
>> +	bool in_use;
>>  };
>> 
>>  /**

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH V7 2/3] scsi: ufs: Fix a race condition between ufshcd_abort and eh_work
  2020-12-03  4:01     ` Can Guo
@ 2020-12-03  4:51       ` Stanley Chu
  2020-12-03  5:11         ` Can Guo
  0 siblings, 1 reply; 9+ messages in thread
From: Stanley Chu @ 2020-12-03  4:51 UTC (permalink / raw)
  To: Can Guo
  Cc: asutoshd, nguyenb, hongwus, rnayak, linux-scsi, kernel-team,
	saravanak, salyzyn, Alim Akhtar, Avri Altman,
	James E.J. Bottomley, Martin K. Petersen, Bean Huo,
	Bart Van Assche, Satya Tangirala, open list

On Thu, 2020-12-03 at 12:01 +0800, Can Guo wrote:
> On 2020-12-03 10:21, Stanley Chu wrote:
> > On Wed, 2020-12-02 at 04:04 -0800, Can Guo wrote:
> >> In current task abort routine, if task abort happens to the device 
> >> W-LU,
> >> the code directly jumps to ufshcd_eh_host_reset_handler() to perform a
> >> full reset and restore then returns FAIL or SUCCESS. Commands sent to 
> >> the
> >> device W-LU are most likely the SSU cmds sent during UFS PM 
> >> operations. If
> >> such SSU cmd enters task abort routine, when 
> >> ufshcd_eh_host_reset_handler()
> >> flushes eh_work, it will get stuck there since err_handler is 
> >> serialized
> >> with PM operations.
> >> 
> >> In order to unblock above call path, we merely clean up the lrb taken 
> >> by
> >> this cmd, queue the eh_work and return SUCCESS. Once the cmd is 
> >> aborted,
> >> the PM operation which sends out the cmd just errors out, then 
> >> err_handler
> >> shall be able to proceed with the full reset and restore.
> >> 
> >> In this scenario, the cmd is aborted even before it is actually 
> >> cleared by
> >> HW, set the lrb->in_use flag to prevent subsequent cmds, including 
> >> SCSI
> >> cmds and dev cmds, from taking the lrb released from abort. The flag 
> >> shall
> >> evetually be cleared in __ufshcd_transfer_req_compl() invoked by the 
> >> full
> >> reset and restore from err_handler.
> >> 
> >> Reviewed-by: Asutosh Das <asutoshd@codeaurora.org>
> >> Signed-off-by: Can Guo <cang@codeaurora.org>
> >> ---
> >>  drivers/scsi/ufs/ufshcd.c | 60 
> >> +++++++++++++++++++++++++++++++++++++----------
> >>  drivers/scsi/ufs/ufshcd.h |  2 ++
> >>  2 files changed, 49 insertions(+), 13 deletions(-)
> >> 
> >> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
> >> index f0bb3fc..26c1fa0 100644
> >> --- a/drivers/scsi/ufs/ufshcd.c
> >> +++ b/drivers/scsi/ufs/ufshcd.c
> >> @@ -2539,6 +2539,14 @@ static int ufshcd_queuecommand(struct Scsi_Host 
> >> *host, struct scsi_cmnd *cmd)
> >>  		(hba->clk_gating.state != CLKS_ON));
> >> 
> >>  	lrbp = &hba->lrb[tag];
> >> +	if (unlikely(lrbp->in_use)) {
> >> +		if (hba->pm_op_in_progress)
> >> +			set_host_byte(cmd, DID_BAD_TARGET);
> >> +		else
> >> +			err = SCSI_MLQUEUE_HOST_BUSY;
> >> +		ufshcd_release(hba);
> >> +		goto out;
> >> +	}
> >> 
> >>  	WARN_ON(lrbp->cmd);
> >>  	lrbp->cmd = cmd;
> >> @@ -2781,6 +2789,11 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba 
> >> *hba,
> >> 
> >>  	init_completion(&wait);
> >>  	lrbp = &hba->lrb[tag];
> >> +	if (unlikely(lrbp->in_use)) {
> >> +		err = -EBUSY;
> >> +		goto out;
> >> +	}
> >> +
> >>  	WARN_ON(lrbp->cmd);
> >>  	err = ufshcd_compose_dev_cmd(hba, lrbp, cmd_type, tag);
> >>  	if (unlikely(err))
> >> @@ -2797,6 +2810,7 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba 
> >> *hba,
> >> 
> >>  	err = ufshcd_wait_for_dev_cmd(hba, lrbp, timeout);
> >> 
> >> +out:
> >>  	ufshcd_add_query_upiu_trace(hba, tag,
> >>  			err ? "query_complete_err" : "query_complete");
> >> 
> >> @@ -4929,9 +4943,11 @@ static void __ufshcd_transfer_req_compl(struct 
> >> ufs_hba *hba,
> >>  	struct scsi_cmnd *cmd;
> >>  	int result;
> >>  	int index;
> >> +	bool update_scaling = false;
> >> 
> >>  	for_each_set_bit(index, &completed_reqs, hba->nutrs) {
> >>  		lrbp = &hba->lrb[index];
> >> +		lrbp->in_use = false;
> >>  		lrbp->compl_time_stamp = ktime_get();
> >>  		cmd = lrbp->cmd;
> >>  		if (cmd) {
> >> @@ -4944,15 +4960,17 @@ static void __ufshcd_transfer_req_compl(struct 
> >> ufs_hba *hba,
> >>  			/* Do not touch lrbp after scsi done */
> >>  			cmd->scsi_done(cmd);
> >>  			__ufshcd_release(hba);
> >> +			update_scaling = true;
> >>  		} else if (lrbp->command_type == UTP_CMD_TYPE_DEV_MANAGE ||
> >>  			lrbp->command_type == UTP_CMD_TYPE_UFS_STORAGE) {
> >>  			if (hba->dev_cmd.complete) {
> >>  				ufshcd_add_command_trace(hba, index,
> >>  						"dev_complete");
> >>  				complete(hba->dev_cmd.complete);
> >> +				update_scaling = true;
> >>  			}
> >>  		}
> >> -		if (ufshcd_is_clkscaling_supported(hba))
> >> +		if (ufshcd_is_clkscaling_supported(hba) && update_scaling)
> >>  			hba->clk_scaling.active_reqs--;
> >>  	}
> >> 
> >> @@ -6374,8 +6392,12 @@ static int ufshcd_issue_devman_upiu_cmd(struct 
> >> ufs_hba *hba,
> >> 
> >>  	init_completion(&wait);
> >>  	lrbp = &hba->lrb[tag];
> >> -	WARN_ON(lrbp->cmd);
> >> +	if (unlikely(lrbp->in_use)) {
> >> +		err = -EBUSY;
> >> +		goto out;
> >> +	}
> >> 
> >> +	WARN_ON(lrbp->cmd);
> >>  	lrbp->cmd = NULL;
> >>  	lrbp->sense_bufflen = 0;
> >>  	lrbp->sense_buffer = NULL;
> >> @@ -6447,6 +6469,7 @@ static int ufshcd_issue_devman_upiu_cmd(struct 
> >> ufs_hba *hba,
> >>  		}
> >>  	}
> >> 
> >> +out:
> >>  	blk_put_request(req);
> >>  out_unlock:
> >>  	up_read(&hba->clk_scaling_lock);
> >> @@ -6696,16 +6719,6 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
> >>  		BUG();
> >>  	}
> >> 
> >> -	/*
> >> -	 * Task abort to the device W-LUN is illegal. When this command
> >> -	 * will fail, due to spec violation, scsi err handling next step
> >> -	 * will be to send LU reset which, again, is a spec violation.
> >> -	 * To avoid these unnecessary/illegal step we skip to the last error
> >> -	 * handling stage: reset and restore.
> >> -	 */
> >> -	if (lrbp->lun == UFS_UPIU_UFS_DEVICE_WLUN)
> >> -		return ufshcd_eh_host_reset_handler(cmd);
> >> -
> >>  	ufshcd_hold(hba, false);
> >>  	reg = ufshcd_readl(hba, REG_UTP_TRANSFER_REQ_DOOR_BELL);
> >>  	/* If command is already aborted/completed, return SUCCESS */
> >> @@ -6726,7 +6739,7 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
> >>  	 * to reduce repeated printouts. For other aborted requests only 
> >> print
> >>  	 * basic details.
> >>  	 */
> >> -	scsi_print_command(hba->lrb[tag].cmd);
> >> +	scsi_print_command(cmd);
> >>  	if (!hba->req_abort_count) {
> >>  		ufshcd_update_reg_hist(&hba->ufs_stats.task_abort, 0);
> >>  		ufshcd_print_host_regs(hba);
> >> @@ -6745,6 +6758,27 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
> >>  		goto cleanup;
> >>  	}
> >> 
> >> +	/*
> >> +	 * Task abort to the device W-LUN is illegal. When this command
> >> +	 * will fail, due to spec violation, scsi err handling next step
> >> +	 * will be to send LU reset which, again, is a spec violation.
> >> +	 * To avoid these unnecessary/illegal steps, first we clean up
> >> +	 * the lrb taken by this cmd and mark the lrb as in_use, then
> >> +	 * queue the eh_work and bail.
> >> +	 */
> >> +	if (lrbp->lun == UFS_UPIU_UFS_DEVICE_WLUN) {
> >> +		spin_lock_irqsave(host->host_lock, flags);
> >> +		if (lrbp->cmd) {
> >> +			__ufshcd_transfer_req_compl(hba, (1UL << tag));
> >> +			__set_bit(tag, &hba->outstanding_reqs);
> >> +			lrbp->in_use = true;
> >> +			hba->force_reset = true;
> >> +			ufshcd_schedule_eh_work(hba);
> > 
> > ufshcd_schedule_eh_work() will set hba->ufshcd_state as
> > UFSHCD_STATE_EH_SCHEDULED_FATAL. While in this state,
> > ufshcd_queuecommand() will set_host_byte(DID_BAD_TARGET) which is
> > similar as what you would like to do in this patch.
> > 
> > Is this enough for avoiding reusing tag issue? Just wonder if
> > lrpb->in_use flag is really required to be added.
> 
> Hi Stanley,
> 
> Thanks for the discussion.
> 
> To be accurate, it is to prevent lrb from being re-used, not the
> tag.
> Block layer and/or scsi layer can re-use the tag right after
> we abort the cmd, but the lrb is empty since we cleared it from
> abort path and we need to make sure the lrb stays empty before the
> full reset and restore happens.

What is the definition of "empty" here?

If it means lrb->cmd shall be empty (to not invoking scsi_done again),
then the hba->ufshcd_state check in ufshcd_queuecommend() will also
clear the re-used lrb->cmd if ufshcd_state is in
UFSHCD_STATE_EH_SCHEDULED_FATAL case.

However ufshcd_state cannot protect other paths now, for example,
ufshcd_exec_dev_cmd(), so lrbp->in_use may be required for this usage,
or ufshcd_state check can be added to help.

BTW, would you also need to consider ufshcd_issue_devman_upiu_cmd() that
is another possible path to re-use lrb?

Thanks,
Stanley Chu

> So, in queuecommand path, we have
> below checks to prevernt the lrb being re-used. This is before
> hba->ufshcd_state checks.
> 
> +    if (unlikely(lrbp->in_use)) {
> +        if (hba->pm_op_in_progress)
> +            set_host_byte(cmd, DID_BAD_TARGET);
> +        else
> +            err = SCSI_MLQUEUE_HOST_BUSY;
> +        ufshcd_release(hba);
> +        goto out;
> +    }
> 
> In above checks, below exception is for the case that a SSU cmd
> sent from PM ops is trying to re-use the lrb. In this case, we
> should simply let it fail so that PM ops errors out to unblock
> error handling (since error handling is serialized with PM ops).
> 
> +        if (hba->pm_op_in_progress)
> +            set_host_byte(cmd, DID_BAD_TARGET);
> 
> Thanks,
> 
> Can Guo.
> 
> > 
> >> +		}
> >> +		spin_unlock_irqrestore(host->host_lock, flags);
> >> +		goto out;
> >> +	}
> >> +
> >>  	/* Skip task abort in case previous aborts failed and report failure 
> >> */
> >>  	if (lrbp->req_abort_skip)
> >>  		err = -EIO;
> >> diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
> >> index 1e680bf..66e5338 100644
> >> --- a/drivers/scsi/ufs/ufshcd.h
> >> +++ b/drivers/scsi/ufs/ufshcd.h
> >> @@ -163,6 +163,7 @@ struct ufs_pm_lvl_states {
> >>   * @crypto_key_slot: the key slot to use for inline crypto (-1 if 
> >> none)
> >>   * @data_unit_num: the data unit number for the first block for 
> >> inline crypto
> >>   * @req_abort_skip: skip request abort task flag
> >> + * @in_use: indicates that this lrb is still in use
> >>   */
> >>  struct ufshcd_lrb {
> >>  	struct utp_transfer_req_desc *utr_descriptor_ptr;
> >> @@ -192,6 +193,7 @@ struct ufshcd_lrb {
> >>  #endif
> >> 
> >>  	bool req_abort_skip;
> >> +	bool in_use;
> >>  };
> >> 
> >>  /**


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH V7 2/3] scsi: ufs: Fix a race condition between ufshcd_abort and eh_work
  2020-12-03  4:51       ` Stanley Chu
@ 2020-12-03  5:11         ` Can Guo
  2020-12-03  5:19           ` Stanley Chu
  0 siblings, 1 reply; 9+ messages in thread
From: Can Guo @ 2020-12-03  5:11 UTC (permalink / raw)
  To: Stanley Chu
  Cc: asutoshd, nguyenb, hongwus, rnayak, linux-scsi, kernel-team,
	saravanak, salyzyn, Alim Akhtar, Avri Altman,
	James E.J. Bottomley, Martin K. Petersen, Bean Huo,
	Bart Van Assche, Satya Tangirala, open list

On 2020-12-03 12:51, Stanley Chu wrote:
> On Thu, 2020-12-03 at 12:01 +0800, Can Guo wrote:
>> On 2020-12-03 10:21, Stanley Chu wrote:
>> > On Wed, 2020-12-02 at 04:04 -0800, Can Guo wrote:
>> >> In current task abort routine, if task abort happens to the device
>> >> W-LU,
>> >> the code directly jumps to ufshcd_eh_host_reset_handler() to perform a
>> >> full reset and restore then returns FAIL or SUCCESS. Commands sent to
>> >> the
>> >> device W-LU are most likely the SSU cmds sent during UFS PM
>> >> operations. If
>> >> such SSU cmd enters task abort routine, when
>> >> ufshcd_eh_host_reset_handler()
>> >> flushes eh_work, it will get stuck there since err_handler is
>> >> serialized
>> >> with PM operations.
>> >>
>> >> In order to unblock above call path, we merely clean up the lrb taken
>> >> by
>> >> this cmd, queue the eh_work and return SUCCESS. Once the cmd is
>> >> aborted,
>> >> the PM operation which sends out the cmd just errors out, then
>> >> err_handler
>> >> shall be able to proceed with the full reset and restore.
>> >>
>> >> In this scenario, the cmd is aborted even before it is actually
>> >> cleared by
>> >> HW, set the lrb->in_use flag to prevent subsequent cmds, including
>> >> SCSI
>> >> cmds and dev cmds, from taking the lrb released from abort. The flag
>> >> shall
>> >> evetually be cleared in __ufshcd_transfer_req_compl() invoked by the
>> >> full
>> >> reset and restore from err_handler.
>> >>
>> >> Reviewed-by: Asutosh Das <asutoshd@codeaurora.org>
>> >> Signed-off-by: Can Guo <cang@codeaurora.org>
>> >> ---
>> >>  drivers/scsi/ufs/ufshcd.c | 60
>> >> +++++++++++++++++++++++++++++++++++++----------
>> >>  drivers/scsi/ufs/ufshcd.h |  2 ++
>> >>  2 files changed, 49 insertions(+), 13 deletions(-)
>> >>
>> >> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
>> >> index f0bb3fc..26c1fa0 100644
>> >> --- a/drivers/scsi/ufs/ufshcd.c
>> >> +++ b/drivers/scsi/ufs/ufshcd.c
>> >> @@ -2539,6 +2539,14 @@ static int ufshcd_queuecommand(struct Scsi_Host
>> >> *host, struct scsi_cmnd *cmd)
>> >>  		(hba->clk_gating.state != CLKS_ON));
>> >>
>> >>  	lrbp = &hba->lrb[tag];
>> >> +	if (unlikely(lrbp->in_use)) {
>> >> +		if (hba->pm_op_in_progress)
>> >> +			set_host_byte(cmd, DID_BAD_TARGET);
>> >> +		else
>> >> +			err = SCSI_MLQUEUE_HOST_BUSY;
>> >> +		ufshcd_release(hba);
>> >> +		goto out;
>> >> +	}
>> >>
>> >>  	WARN_ON(lrbp->cmd);
>> >>  	lrbp->cmd = cmd;
>> >> @@ -2781,6 +2789,11 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba
>> >> *hba,
>> >>
>> >>  	init_completion(&wait);
>> >>  	lrbp = &hba->lrb[tag];
>> >> +	if (unlikely(lrbp->in_use)) {
>> >> +		err = -EBUSY;
>> >> +		goto out;
>> >> +	}
>> >> +
>> >>  	WARN_ON(lrbp->cmd);
>> >>  	err = ufshcd_compose_dev_cmd(hba, lrbp, cmd_type, tag);
>> >>  	if (unlikely(err))
>> >> @@ -2797,6 +2810,7 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba
>> >> *hba,
>> >>
>> >>  	err = ufshcd_wait_for_dev_cmd(hba, lrbp, timeout);
>> >>
>> >> +out:
>> >>  	ufshcd_add_query_upiu_trace(hba, tag,
>> >>  			err ? "query_complete_err" : "query_complete");
>> >>
>> >> @@ -4929,9 +4943,11 @@ static void __ufshcd_transfer_req_compl(struct
>> >> ufs_hba *hba,
>> >>  	struct scsi_cmnd *cmd;
>> >>  	int result;
>> >>  	int index;
>> >> +	bool update_scaling = false;
>> >>
>> >>  	for_each_set_bit(index, &completed_reqs, hba->nutrs) {
>> >>  		lrbp = &hba->lrb[index];
>> >> +		lrbp->in_use = false;
>> >>  		lrbp->compl_time_stamp = ktime_get();
>> >>  		cmd = lrbp->cmd;
>> >>  		if (cmd) {
>> >> @@ -4944,15 +4960,17 @@ static void __ufshcd_transfer_req_compl(struct
>> >> ufs_hba *hba,
>> >>  			/* Do not touch lrbp after scsi done */
>> >>  			cmd->scsi_done(cmd);
>> >>  			__ufshcd_release(hba);
>> >> +			update_scaling = true;
>> >>  		} else if (lrbp->command_type == UTP_CMD_TYPE_DEV_MANAGE ||
>> >>  			lrbp->command_type == UTP_CMD_TYPE_UFS_STORAGE) {
>> >>  			if (hba->dev_cmd.complete) {
>> >>  				ufshcd_add_command_trace(hba, index,
>> >>  						"dev_complete");
>> >>  				complete(hba->dev_cmd.complete);
>> >> +				update_scaling = true;
>> >>  			}
>> >>  		}
>> >> -		if (ufshcd_is_clkscaling_supported(hba))
>> >> +		if (ufshcd_is_clkscaling_supported(hba) && update_scaling)
>> >>  			hba->clk_scaling.active_reqs--;
>> >>  	}
>> >>
>> >> @@ -6374,8 +6392,12 @@ static int ufshcd_issue_devman_upiu_cmd(struct
>> >> ufs_hba *hba,
>> >>
>> >>  	init_completion(&wait);
>> >>  	lrbp = &hba->lrb[tag];
>> >> -	WARN_ON(lrbp->cmd);
>> >> +	if (unlikely(lrbp->in_use)) {
>> >> +		err = -EBUSY;
>> >> +		goto out;
>> >> +	}
>> >>
>> >> +	WARN_ON(lrbp->cmd);
>> >>  	lrbp->cmd = NULL;
>> >>  	lrbp->sense_bufflen = 0;
>> >>  	lrbp->sense_buffer = NULL;
>> >> @@ -6447,6 +6469,7 @@ static int ufshcd_issue_devman_upiu_cmd(struct
>> >> ufs_hba *hba,
>> >>  		}
>> >>  	}
>> >>
>> >> +out:
>> >>  	blk_put_request(req);
>> >>  out_unlock:
>> >>  	up_read(&hba->clk_scaling_lock);
>> >> @@ -6696,16 +6719,6 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
>> >>  		BUG();
>> >>  	}
>> >>
>> >> -	/*
>> >> -	 * Task abort to the device W-LUN is illegal. When this command
>> >> -	 * will fail, due to spec violation, scsi err handling next step
>> >> -	 * will be to send LU reset which, again, is a spec violation.
>> >> -	 * To avoid these unnecessary/illegal step we skip to the last error
>> >> -	 * handling stage: reset and restore.
>> >> -	 */
>> >> -	if (lrbp->lun == UFS_UPIU_UFS_DEVICE_WLUN)
>> >> -		return ufshcd_eh_host_reset_handler(cmd);
>> >> -
>> >>  	ufshcd_hold(hba, false);
>> >>  	reg = ufshcd_readl(hba, REG_UTP_TRANSFER_REQ_DOOR_BELL);
>> >>  	/* If command is already aborted/completed, return SUCCESS */
>> >> @@ -6726,7 +6739,7 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
>> >>  	 * to reduce repeated printouts. For other aborted requests only
>> >> print
>> >>  	 * basic details.
>> >>  	 */
>> >> -	scsi_print_command(hba->lrb[tag].cmd);
>> >> +	scsi_print_command(cmd);
>> >>  	if (!hba->req_abort_count) {
>> >>  		ufshcd_update_reg_hist(&hba->ufs_stats.task_abort, 0);
>> >>  		ufshcd_print_host_regs(hba);
>> >> @@ -6745,6 +6758,27 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
>> >>  		goto cleanup;
>> >>  	}
>> >>
>> >> +	/*
>> >> +	 * Task abort to the device W-LUN is illegal. When this command
>> >> +	 * will fail, due to spec violation, scsi err handling next step
>> >> +	 * will be to send LU reset which, again, is a spec violation.
>> >> +	 * To avoid these unnecessary/illegal steps, first we clean up
>> >> +	 * the lrb taken by this cmd and mark the lrb as in_use, then
>> >> +	 * queue the eh_work and bail.
>> >> +	 */
>> >> +	if (lrbp->lun == UFS_UPIU_UFS_DEVICE_WLUN) {
>> >> +		spin_lock_irqsave(host->host_lock, flags);
>> >> +		if (lrbp->cmd) {
>> >> +			__ufshcd_transfer_req_compl(hba, (1UL << tag));
>> >> +			__set_bit(tag, &hba->outstanding_reqs);
>> >> +			lrbp->in_use = true;
>> >> +			hba->force_reset = true;
>> >> +			ufshcd_schedule_eh_work(hba);
>> >
>> > ufshcd_schedule_eh_work() will set hba->ufshcd_state as
>> > UFSHCD_STATE_EH_SCHEDULED_FATAL. While in this state,
>> > ufshcd_queuecommand() will set_host_byte(DID_BAD_TARGET) which is
>> > similar as what you would like to do in this patch.
>> >
>> > Is this enough for avoiding reusing tag issue? Just wonder if
>> > lrpb->in_use flag is really required to be added.
>> 
>> Hi Stanley,
>> 
>> Thanks for the discussion.
>> 
>> To be accurate, it is to prevent lrb from being re-used, not the
>> tag.
>> Block layer and/or scsi layer can re-use the tag right after
>> we abort the cmd, but the lrb is empty since we cleared it from
>> abort path and we need to make sure the lrb stays empty before the
>> full reset and restore happens.
> 
> What is the definition of "empty" here?

Yes, it means lrb->cmd is NULL.

> 
> If it means lrb->cmd shall be empty (to not invoking scsi_done again),
> then the hba->ufshcd_state check in ufshcd_queuecommend() will also
> clear the re-used lrb->cmd if ufshcd_state is in
> UFSHCD_STATE_EH_SCHEDULED_FATAL case.

Yes, but that would have a race condition - say a cmd re-uses the tag 
and
occupies the lrb in queuecommand, at the same time the real completion 
IRQ
of previous cmd comes (althrough we aborted it due to it timed out, but 
the
completion IRQ can come at any time before the full reset and restore is
performed), then the new cmd will be wrongly completed by IRQ handler 
from
__ufshcd_transfer_req_compl() which is actually fired by the last cmd, 
so
we need to block it even before the cmd takes the lrb - before

lrbp->cmd = cmd;

> 
> However ufshcd_state cannot protect other paths now, for example,
> ufshcd_exec_dev_cmd(), so lrbp->in_use may be required for this usage,
> or ufshcd_state check can be added to help.
> 
> BTW, would you also need to consider ufshcd_issue_devman_upiu_cmd() 
> that
> is another possible path to re-use lrb?

In this change, there are checks of lrb->in_use in 
ufshcd_queuecommand(),
ufshcd_exec_dev_cmd() and ufshcd_issue_devman_upiu_cmd().

Thanks,

Can Guo.

> 
> Thanks,
> Stanley Chu
> 
>> So, in queuecommand path, we have
>> below checks to prevernt the lrb being re-used. This is before
>> hba->ufshcd_state checks.
>> 
>> +    if (unlikely(lrbp->in_use)) {
>> +        if (hba->pm_op_in_progress)
>> +            set_host_byte(cmd, DID_BAD_TARGET);
>> +        else
>> +            err = SCSI_MLQUEUE_HOST_BUSY;
>> +        ufshcd_release(hba);
>> +        goto out;
>> +    }
>> 
>> In above checks, below exception is for the case that a SSU cmd
>> sent from PM ops is trying to re-use the lrb. In this case, we
>> should simply let it fail so that PM ops errors out to unblock
>> error handling (since error handling is serialized with PM ops).
>> 
>> +        if (hba->pm_op_in_progress)
>> +            set_host_byte(cmd, DID_BAD_TARGET);
>> 
>> Thanks,
>> 
>> Can Guo.
>> 
>> >
>> >> +		}
>> >> +		spin_unlock_irqrestore(host->host_lock, flags);
>> >> +		goto out;
>> >> +	}
>> >> +
>> >>  	/* Skip task abort in case previous aborts failed and report failure
>> >> */
>> >>  	if (lrbp->req_abort_skip)
>> >>  		err = -EIO;
>> >> diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
>> >> index 1e680bf..66e5338 100644
>> >> --- a/drivers/scsi/ufs/ufshcd.h
>> >> +++ b/drivers/scsi/ufs/ufshcd.h
>> >> @@ -163,6 +163,7 @@ struct ufs_pm_lvl_states {
>> >>   * @crypto_key_slot: the key slot to use for inline crypto (-1 if
>> >> none)
>> >>   * @data_unit_num: the data unit number for the first block for
>> >> inline crypto
>> >>   * @req_abort_skip: skip request abort task flag
>> >> + * @in_use: indicates that this lrb is still in use
>> >>   */
>> >>  struct ufshcd_lrb {
>> >>  	struct utp_transfer_req_desc *utr_descriptor_ptr;
>> >> @@ -192,6 +193,7 @@ struct ufshcd_lrb {
>> >>  #endif
>> >>
>> >>  	bool req_abort_skip;
>> >> +	bool in_use;
>> >>  };
>> >>
>> >>  /**

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH V7 2/3] scsi: ufs: Fix a race condition between ufshcd_abort and eh_work
  2020-12-03  5:11         ` Can Guo
@ 2020-12-03  5:19           ` Stanley Chu
  2020-12-03  5:21             ` Can Guo
  0 siblings, 1 reply; 9+ messages in thread
From: Stanley Chu @ 2020-12-03  5:19 UTC (permalink / raw)
  To: Can Guo
  Cc: asutoshd, nguyenb, hongwus, rnayak, linux-scsi, kernel-team,
	saravanak, salyzyn, Alim Akhtar, Avri Altman,
	James E.J. Bottomley, Martin K. Petersen, Bean Huo,
	Bart Van Assche, Satya Tangirala, open list

On Thu, 2020-12-03 at 13:11 +0800, Can Guo wrote:
> On 2020-12-03 12:51, Stanley Chu wrote:
> > On Thu, 2020-12-03 at 12:01 +0800, Can Guo wrote:
> >> On 2020-12-03 10:21, Stanley Chu wrote:
> >> > On Wed, 2020-12-02 at 04:04 -0800, Can Guo wrote:
> >> >> In current task abort routine, if task abort happens to the device
> >> >> W-LU,
> >> >> the code directly jumps to ufshcd_eh_host_reset_handler() to perform a
> >> >> full reset and restore then returns FAIL or SUCCESS. Commands sent to
> >> >> the
> >> >> device W-LU are most likely the SSU cmds sent during UFS PM
> >> >> operations. If
> >> >> such SSU cmd enters task abort routine, when
> >> >> ufshcd_eh_host_reset_handler()
> >> >> flushes eh_work, it will get stuck there since err_handler is
> >> >> serialized
> >> >> with PM operations.
> >> >>
> >> >> In order to unblock above call path, we merely clean up the lrb taken
> >> >> by
> >> >> this cmd, queue the eh_work and return SUCCESS. Once the cmd is
> >> >> aborted,
> >> >> the PM operation which sends out the cmd just errors out, then
> >> >> err_handler
> >> >> shall be able to proceed with the full reset and restore.
> >> >>
> >> >> In this scenario, the cmd is aborted even before it is actually
> >> >> cleared by
> >> >> HW, set the lrb->in_use flag to prevent subsequent cmds, including
> >> >> SCSI
> >> >> cmds and dev cmds, from taking the lrb released from abort. The flag
> >> >> shall
> >> >> evetually be cleared in __ufshcd_transfer_req_compl() invoked by the
> >> >> full
> >> >> reset and restore from err_handler.
> >> >>
> >> >> Reviewed-by: Asutosh Das <asutoshd@codeaurora.org>
> >> >> Signed-off-by: Can Guo <cang@codeaurora.org>
> >> >> ---
> >> >>  drivers/scsi/ufs/ufshcd.c | 60
> >> >> +++++++++++++++++++++++++++++++++++++----------
> >> >>  drivers/scsi/ufs/ufshcd.h |  2 ++
> >> >>  2 files changed, 49 insertions(+), 13 deletions(-)
> >> >>
> >> >> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
> >> >> index f0bb3fc..26c1fa0 100644
> >> >> --- a/drivers/scsi/ufs/ufshcd.c
> >> >> +++ b/drivers/scsi/ufs/ufshcd.c
> >> >> @@ -2539,6 +2539,14 @@ static int ufshcd_queuecommand(struct Scsi_Host
> >> >> *host, struct scsi_cmnd *cmd)
> >> >>  		(hba->clk_gating.state != CLKS_ON));
> >> >>
> >> >>  	lrbp = &hba->lrb[tag];
> >> >> +	if (unlikely(lrbp->in_use)) {
> >> >> +		if (hba->pm_op_in_progress)
> >> >> +			set_host_byte(cmd, DID_BAD_TARGET);
> >> >> +		else
> >> >> +			err = SCSI_MLQUEUE_HOST_BUSY;
> >> >> +		ufshcd_release(hba);
> >> >> +		goto out;
> >> >> +	}
> >> >>
> >> >>  	WARN_ON(lrbp->cmd);
> >> >>  	lrbp->cmd = cmd;
> >> >> @@ -2781,6 +2789,11 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba
> >> >> *hba,
> >> >>
> >> >>  	init_completion(&wait);
> >> >>  	lrbp = &hba->lrb[tag];
> >> >> +	if (unlikely(lrbp->in_use)) {
> >> >> +		err = -EBUSY;
> >> >> +		goto out;
> >> >> +	}
> >> >> +
> >> >>  	WARN_ON(lrbp->cmd);
> >> >>  	err = ufshcd_compose_dev_cmd(hba, lrbp, cmd_type, tag);
> >> >>  	if (unlikely(err))
> >> >> @@ -2797,6 +2810,7 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba
> >> >> *hba,
> >> >>
> >> >>  	err = ufshcd_wait_for_dev_cmd(hba, lrbp, timeout);
> >> >>
> >> >> +out:
> >> >>  	ufshcd_add_query_upiu_trace(hba, tag,
> >> >>  			err ? "query_complete_err" : "query_complete");
> >> >>
> >> >> @@ -4929,9 +4943,11 @@ static void __ufshcd_transfer_req_compl(struct
> >> >> ufs_hba *hba,
> >> >>  	struct scsi_cmnd *cmd;
> >> >>  	int result;
> >> >>  	int index;
> >> >> +	bool update_scaling = false;
> >> >>
> >> >>  	for_each_set_bit(index, &completed_reqs, hba->nutrs) {
> >> >>  		lrbp = &hba->lrb[index];
> >> >> +		lrbp->in_use = false;
> >> >>  		lrbp->compl_time_stamp = ktime_get();
> >> >>  		cmd = lrbp->cmd;
> >> >>  		if (cmd) {
> >> >> @@ -4944,15 +4960,17 @@ static void __ufshcd_transfer_req_compl(struct
> >> >> ufs_hba *hba,
> >> >>  			/* Do not touch lrbp after scsi done */
> >> >>  			cmd->scsi_done(cmd);
> >> >>  			__ufshcd_release(hba);
> >> >> +			update_scaling = true;
> >> >>  		} else if (lrbp->command_type == UTP_CMD_TYPE_DEV_MANAGE ||
> >> >>  			lrbp->command_type == UTP_CMD_TYPE_UFS_STORAGE) {
> >> >>  			if (hba->dev_cmd.complete) {
> >> >>  				ufshcd_add_command_trace(hba, index,
> >> >>  						"dev_complete");
> >> >>  				complete(hba->dev_cmd.complete);
> >> >> +				update_scaling = true;
> >> >>  			}
> >> >>  		}
> >> >> -		if (ufshcd_is_clkscaling_supported(hba))
> >> >> +		if (ufshcd_is_clkscaling_supported(hba) && update_scaling)
> >> >>  			hba->clk_scaling.active_reqs--;
> >> >>  	}
> >> >>
> >> >> @@ -6374,8 +6392,12 @@ static int ufshcd_issue_devman_upiu_cmd(struct
> >> >> ufs_hba *hba,
> >> >>
> >> >>  	init_completion(&wait);
> >> >>  	lrbp = &hba->lrb[tag];
> >> >> -	WARN_ON(lrbp->cmd);
> >> >> +	if (unlikely(lrbp->in_use)) {
> >> >> +		err = -EBUSY;
> >> >> +		goto out;
> >> >> +	}
> >> >>
> >> >> +	WARN_ON(lrbp->cmd);
> >> >>  	lrbp->cmd = NULL;
> >> >>  	lrbp->sense_bufflen = 0;
> >> >>  	lrbp->sense_buffer = NULL;
> >> >> @@ -6447,6 +6469,7 @@ static int ufshcd_issue_devman_upiu_cmd(struct
> >> >> ufs_hba *hba,
> >> >>  		}
> >> >>  	}
> >> >>
> >> >> +out:
> >> >>  	blk_put_request(req);
> >> >>  out_unlock:
> >> >>  	up_read(&hba->clk_scaling_lock);
> >> >> @@ -6696,16 +6719,6 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
> >> >>  		BUG();
> >> >>  	}
> >> >>
> >> >> -	/*
> >> >> -	 * Task abort to the device W-LUN is illegal. When this command
> >> >> -	 * will fail, due to spec violation, scsi err handling next step
> >> >> -	 * will be to send LU reset which, again, is a spec violation.
> >> >> -	 * To avoid these unnecessary/illegal step we skip to the last error
> >> >> -	 * handling stage: reset and restore.
> >> >> -	 */
> >> >> -	if (lrbp->lun == UFS_UPIU_UFS_DEVICE_WLUN)
> >> >> -		return ufshcd_eh_host_reset_handler(cmd);
> >> >> -
> >> >>  	ufshcd_hold(hba, false);
> >> >>  	reg = ufshcd_readl(hba, REG_UTP_TRANSFER_REQ_DOOR_BELL);
> >> >>  	/* If command is already aborted/completed, return SUCCESS */
> >> >> @@ -6726,7 +6739,7 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
> >> >>  	 * to reduce repeated printouts. For other aborted requests only
> >> >> print
> >> >>  	 * basic details.
> >> >>  	 */
> >> >> -	scsi_print_command(hba->lrb[tag].cmd);
> >> >> +	scsi_print_command(cmd);
> >> >>  	if (!hba->req_abort_count) {
> >> >>  		ufshcd_update_reg_hist(&hba->ufs_stats.task_abort, 0);
> >> >>  		ufshcd_print_host_regs(hba);
> >> >> @@ -6745,6 +6758,27 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
> >> >>  		goto cleanup;
> >> >>  	}
> >> >>
> >> >> +	/*
> >> >> +	 * Task abort to the device W-LUN is illegal. When this command
> >> >> +	 * will fail, due to spec violation, scsi err handling next step
> >> >> +	 * will be to send LU reset which, again, is a spec violation.
> >> >> +	 * To avoid these unnecessary/illegal steps, first we clean up
> >> >> +	 * the lrb taken by this cmd and mark the lrb as in_use, then
> >> >> +	 * queue the eh_work and bail.
> >> >> +	 */
> >> >> +	if (lrbp->lun == UFS_UPIU_UFS_DEVICE_WLUN) {
> >> >> +		spin_lock_irqsave(host->host_lock, flags);
> >> >> +		if (lrbp->cmd) {
> >> >> +			__ufshcd_transfer_req_compl(hba, (1UL << tag));
> >> >> +			__set_bit(tag, &hba->outstanding_reqs);
> >> >> +			lrbp->in_use = true;
> >> >> +			hba->force_reset = true;
> >> >> +			ufshcd_schedule_eh_work(hba);
> >> >
> >> > ufshcd_schedule_eh_work() will set hba->ufshcd_state as
> >> > UFSHCD_STATE_EH_SCHEDULED_FATAL. While in this state,
> >> > ufshcd_queuecommand() will set_host_byte(DID_BAD_TARGET) which is
> >> > similar as what you would like to do in this patch.
> >> >
> >> > Is this enough for avoiding reusing tag issue? Just wonder if
> >> > lrpb->in_use flag is really required to be added.
> >> 
> >> Hi Stanley,
> >> 
> >> Thanks for the discussion.
> >> 
> >> To be accurate, it is to prevent lrb from being re-used, not the
> >> tag.
> >> Block layer and/or scsi layer can re-use the tag right after
> >> we abort the cmd, but the lrb is empty since we cleared it from
> >> abort path and we need to make sure the lrb stays empty before the
> >> full reset and restore happens.
> > 
> > What is the definition of "empty" here?
> 
> Yes, it means lrb->cmd is NULL.
> 
> > 
> > If it means lrb->cmd shall be empty (to not invoking scsi_done again),
> > then the hba->ufshcd_state check in ufshcd_queuecommend() will also
> > clear the re-used lrb->cmd if ufshcd_state is in
> > UFSHCD_STATE_EH_SCHEDULED_FATAL case.
> 
> Yes, but that would have a race condition - say a cmd re-uses the tag 
> and
> occupies the lrb in queuecommand, at the same time the real completion 
> IRQ
> of previous cmd comes (althrough we aborted it due to it timed out, but 
> the
> completion IRQ can come at any time before the full reset and restore is
> performed), then the new cmd will be wrongly completed by IRQ handler 
> from
> __ufshcd_transfer_req_compl() which is actually fired by the last cmd, 
> so
> we need to block it even before the cmd takes the lrb - before
> 
> lrbp->cmd = cmd;

I see.
Thanks for the explanation.

> 
> > 
> > However ufshcd_state cannot protect other paths now, for example,
> > ufshcd_exec_dev_cmd(), so lrbp->in_use may be required for this usage,
> > or ufshcd_state check can be added to help.
> > 
> > BTW, would you also need to consider ufshcd_issue_devman_upiu_cmd() 
> > that
> > is another possible path to re-use lrb?
> 
> In this change, there are checks of lrb->in_use in 
> ufshcd_queuecommand(),
> ufshcd_exec_dev_cmd() and ufshcd_issue_devman_upiu_cmd().

Oops I missed it. Sorry for this noise.

Feel free to add
Reviewed-by: Stanley Chu <stanley.chu@mediatek.com>





^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH V7 2/3] scsi: ufs: Fix a race condition between ufshcd_abort and eh_work
  2020-12-03  5:19           ` Stanley Chu
@ 2020-12-03  5:21             ` Can Guo
  0 siblings, 0 replies; 9+ messages in thread
From: Can Guo @ 2020-12-03  5:21 UTC (permalink / raw)
  To: Stanley Chu
  Cc: asutoshd, nguyenb, hongwus, rnayak, linux-scsi, kernel-team,
	saravanak, salyzyn, Alim Akhtar, Avri Altman,
	James E.J. Bottomley, Martin K. Petersen, Bean Huo,
	Bart Van Assche, Satya Tangirala, open list

On 2020-12-03 13:19, Stanley Chu wrote:
> On Thu, 2020-12-03 at 13:11 +0800, Can Guo wrote:
>> On 2020-12-03 12:51, Stanley Chu wrote:
>> > On Thu, 2020-12-03 at 12:01 +0800, Can Guo wrote:
>> >> On 2020-12-03 10:21, Stanley Chu wrote:
>> >> > On Wed, 2020-12-02 at 04:04 -0800, Can Guo wrote:
>> >> >> In current task abort routine, if task abort happens to the device
>> >> >> W-LU,
>> >> >> the code directly jumps to ufshcd_eh_host_reset_handler() to perform a
>> >> >> full reset and restore then returns FAIL or SUCCESS. Commands sent to
>> >> >> the
>> >> >> device W-LU are most likely the SSU cmds sent during UFS PM
>> >> >> operations. If
>> >> >> such SSU cmd enters task abort routine, when
>> >> >> ufshcd_eh_host_reset_handler()
>> >> >> flushes eh_work, it will get stuck there since err_handler is
>> >> >> serialized
>> >> >> with PM operations.
>> >> >>
>> >> >> In order to unblock above call path, we merely clean up the lrb taken
>> >> >> by
>> >> >> this cmd, queue the eh_work and return SUCCESS. Once the cmd is
>> >> >> aborted,
>> >> >> the PM operation which sends out the cmd just errors out, then
>> >> >> err_handler
>> >> >> shall be able to proceed with the full reset and restore.
>> >> >>
>> >> >> In this scenario, the cmd is aborted even before it is actually
>> >> >> cleared by
>> >> >> HW, set the lrb->in_use flag to prevent subsequent cmds, including
>> >> >> SCSI
>> >> >> cmds and dev cmds, from taking the lrb released from abort. The flag
>> >> >> shall
>> >> >> evetually be cleared in __ufshcd_transfer_req_compl() invoked by the
>> >> >> full
>> >> >> reset and restore from err_handler.
>> >> >>
>> >> >> Reviewed-by: Asutosh Das <asutoshd@codeaurora.org>
>> >> >> Signed-off-by: Can Guo <cang@codeaurora.org>
>> >> >> ---
>> >> >>  drivers/scsi/ufs/ufshcd.c | 60
>> >> >> +++++++++++++++++++++++++++++++++++++----------
>> >> >>  drivers/scsi/ufs/ufshcd.h |  2 ++
>> >> >>  2 files changed, 49 insertions(+), 13 deletions(-)
>> >> >>
>> >> >> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
>> >> >> index f0bb3fc..26c1fa0 100644
>> >> >> --- a/drivers/scsi/ufs/ufshcd.c
>> >> >> +++ b/drivers/scsi/ufs/ufshcd.c
>> >> >> @@ -2539,6 +2539,14 @@ static int ufshcd_queuecommand(struct Scsi_Host
>> >> >> *host, struct scsi_cmnd *cmd)
>> >> >>  		(hba->clk_gating.state != CLKS_ON));
>> >> >>
>> >> >>  	lrbp = &hba->lrb[tag];
>> >> >> +	if (unlikely(lrbp->in_use)) {
>> >> >> +		if (hba->pm_op_in_progress)
>> >> >> +			set_host_byte(cmd, DID_BAD_TARGET);
>> >> >> +		else
>> >> >> +			err = SCSI_MLQUEUE_HOST_BUSY;
>> >> >> +		ufshcd_release(hba);
>> >> >> +		goto out;
>> >> >> +	}
>> >> >>
>> >> >>  	WARN_ON(lrbp->cmd);
>> >> >>  	lrbp->cmd = cmd;
>> >> >> @@ -2781,6 +2789,11 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba
>> >> >> *hba,
>> >> >>
>> >> >>  	init_completion(&wait);
>> >> >>  	lrbp = &hba->lrb[tag];
>> >> >> +	if (unlikely(lrbp->in_use)) {
>> >> >> +		err = -EBUSY;
>> >> >> +		goto out;
>> >> >> +	}
>> >> >> +
>> >> >>  	WARN_ON(lrbp->cmd);
>> >> >>  	err = ufshcd_compose_dev_cmd(hba, lrbp, cmd_type, tag);
>> >> >>  	if (unlikely(err))
>> >> >> @@ -2797,6 +2810,7 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba
>> >> >> *hba,
>> >> >>
>> >> >>  	err = ufshcd_wait_for_dev_cmd(hba, lrbp, timeout);
>> >> >>
>> >> >> +out:
>> >> >>  	ufshcd_add_query_upiu_trace(hba, tag,
>> >> >>  			err ? "query_complete_err" : "query_complete");
>> >> >>
>> >> >> @@ -4929,9 +4943,11 @@ static void __ufshcd_transfer_req_compl(struct
>> >> >> ufs_hba *hba,
>> >> >>  	struct scsi_cmnd *cmd;
>> >> >>  	int result;
>> >> >>  	int index;
>> >> >> +	bool update_scaling = false;
>> >> >>
>> >> >>  	for_each_set_bit(index, &completed_reqs, hba->nutrs) {
>> >> >>  		lrbp = &hba->lrb[index];
>> >> >> +		lrbp->in_use = false;
>> >> >>  		lrbp->compl_time_stamp = ktime_get();
>> >> >>  		cmd = lrbp->cmd;
>> >> >>  		if (cmd) {
>> >> >> @@ -4944,15 +4960,17 @@ static void __ufshcd_transfer_req_compl(struct
>> >> >> ufs_hba *hba,
>> >> >>  			/* Do not touch lrbp after scsi done */
>> >> >>  			cmd->scsi_done(cmd);
>> >> >>  			__ufshcd_release(hba);
>> >> >> +			update_scaling = true;
>> >> >>  		} else if (lrbp->command_type == UTP_CMD_TYPE_DEV_MANAGE ||
>> >> >>  			lrbp->command_type == UTP_CMD_TYPE_UFS_STORAGE) {
>> >> >>  			if (hba->dev_cmd.complete) {
>> >> >>  				ufshcd_add_command_trace(hba, index,
>> >> >>  						"dev_complete");
>> >> >>  				complete(hba->dev_cmd.complete);
>> >> >> +				update_scaling = true;
>> >> >>  			}
>> >> >>  		}
>> >> >> -		if (ufshcd_is_clkscaling_supported(hba))
>> >> >> +		if (ufshcd_is_clkscaling_supported(hba) && update_scaling)
>> >> >>  			hba->clk_scaling.active_reqs--;
>> >> >>  	}
>> >> >>
>> >> >> @@ -6374,8 +6392,12 @@ static int ufshcd_issue_devman_upiu_cmd(struct
>> >> >> ufs_hba *hba,
>> >> >>
>> >> >>  	init_completion(&wait);
>> >> >>  	lrbp = &hba->lrb[tag];
>> >> >> -	WARN_ON(lrbp->cmd);
>> >> >> +	if (unlikely(lrbp->in_use)) {
>> >> >> +		err = -EBUSY;
>> >> >> +		goto out;
>> >> >> +	}
>> >> >>
>> >> >> +	WARN_ON(lrbp->cmd);
>> >> >>  	lrbp->cmd = NULL;
>> >> >>  	lrbp->sense_bufflen = 0;
>> >> >>  	lrbp->sense_buffer = NULL;
>> >> >> @@ -6447,6 +6469,7 @@ static int ufshcd_issue_devman_upiu_cmd(struct
>> >> >> ufs_hba *hba,
>> >> >>  		}
>> >> >>  	}
>> >> >>
>> >> >> +out:
>> >> >>  	blk_put_request(req);
>> >> >>  out_unlock:
>> >> >>  	up_read(&hba->clk_scaling_lock);
>> >> >> @@ -6696,16 +6719,6 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
>> >> >>  		BUG();
>> >> >>  	}
>> >> >>
>> >> >> -	/*
>> >> >> -	 * Task abort to the device W-LUN is illegal. When this command
>> >> >> -	 * will fail, due to spec violation, scsi err handling next step
>> >> >> -	 * will be to send LU reset which, again, is a spec violation.
>> >> >> -	 * To avoid these unnecessary/illegal step we skip to the last error
>> >> >> -	 * handling stage: reset and restore.
>> >> >> -	 */
>> >> >> -	if (lrbp->lun == UFS_UPIU_UFS_DEVICE_WLUN)
>> >> >> -		return ufshcd_eh_host_reset_handler(cmd);
>> >> >> -
>> >> >>  	ufshcd_hold(hba, false);
>> >> >>  	reg = ufshcd_readl(hba, REG_UTP_TRANSFER_REQ_DOOR_BELL);
>> >> >>  	/* If command is already aborted/completed, return SUCCESS */
>> >> >> @@ -6726,7 +6739,7 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
>> >> >>  	 * to reduce repeated printouts. For other aborted requests only
>> >> >> print
>> >> >>  	 * basic details.
>> >> >>  	 */
>> >> >> -	scsi_print_command(hba->lrb[tag].cmd);
>> >> >> +	scsi_print_command(cmd);
>> >> >>  	if (!hba->req_abort_count) {
>> >> >>  		ufshcd_update_reg_hist(&hba->ufs_stats.task_abort, 0);
>> >> >>  		ufshcd_print_host_regs(hba);
>> >> >> @@ -6745,6 +6758,27 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
>> >> >>  		goto cleanup;
>> >> >>  	}
>> >> >>
>> >> >> +	/*
>> >> >> +	 * Task abort to the device W-LUN is illegal. When this command
>> >> >> +	 * will fail, due to spec violation, scsi err handling next step
>> >> >> +	 * will be to send LU reset which, again, is a spec violation.
>> >> >> +	 * To avoid these unnecessary/illegal steps, first we clean up
>> >> >> +	 * the lrb taken by this cmd and mark the lrb as in_use, then
>> >> >> +	 * queue the eh_work and bail.
>> >> >> +	 */
>> >> >> +	if (lrbp->lun == UFS_UPIU_UFS_DEVICE_WLUN) {
>> >> >> +		spin_lock_irqsave(host->host_lock, flags);
>> >> >> +		if (lrbp->cmd) {
>> >> >> +			__ufshcd_transfer_req_compl(hba, (1UL << tag));
>> >> >> +			__set_bit(tag, &hba->outstanding_reqs);
>> >> >> +			lrbp->in_use = true;
>> >> >> +			hba->force_reset = true;
>> >> >> +			ufshcd_schedule_eh_work(hba);
>> >> >
>> >> > ufshcd_schedule_eh_work() will set hba->ufshcd_state as
>> >> > UFSHCD_STATE_EH_SCHEDULED_FATAL. While in this state,
>> >> > ufshcd_queuecommand() will set_host_byte(DID_BAD_TARGET) which is
>> >> > similar as what you would like to do in this patch.
>> >> >
>> >> > Is this enough for avoiding reusing tag issue? Just wonder if
>> >> > lrpb->in_use flag is really required to be added.
>> >>
>> >> Hi Stanley,
>> >>
>> >> Thanks for the discussion.
>> >>
>> >> To be accurate, it is to prevent lrb from being re-used, not the
>> >> tag.
>> >> Block layer and/or scsi layer can re-use the tag right after
>> >> we abort the cmd, but the lrb is empty since we cleared it from
>> >> abort path and we need to make sure the lrb stays empty before the
>> >> full reset and restore happens.
>> >
>> > What is the definition of "empty" here?
>> 
>> Yes, it means lrb->cmd is NULL.
>> 
>> >
>> > If it means lrb->cmd shall be empty (to not invoking scsi_done again),
>> > then the hba->ufshcd_state check in ufshcd_queuecommend() will also
>> > clear the re-used lrb->cmd if ufshcd_state is in
>> > UFSHCD_STATE_EH_SCHEDULED_FATAL case.
>> 
>> Yes, but that would have a race condition - say a cmd re-uses the tag
>> and
>> occupies the lrb in queuecommand, at the same time the real completion
>> IRQ
>> of previous cmd comes (althrough we aborted it due to it timed out, 
>> but
>> the
>> completion IRQ can come at any time before the full reset and restore 
>> is
>> performed), then the new cmd will be wrongly completed by IRQ handler
>> from
>> __ufshcd_transfer_req_compl() which is actually fired by the last cmd,
>> so
>> we need to block it even before the cmd takes the lrb - before
>> 
>> lrbp->cmd = cmd;
> 
> I see.
> Thanks for the explanation.
> 
>> 
>> >
>> > However ufshcd_state cannot protect other paths now, for example,
>> > ufshcd_exec_dev_cmd(), so lrbp->in_use may be required for this usage,
>> > or ufshcd_state check can be added to help.
>> >
>> > BTW, would you also need to consider ufshcd_issue_devman_upiu_cmd()
>> > that
>> > is another possible path to re-use lrb?
>> 
>> In this change, there are checks of lrb->in_use in
>> ufshcd_queuecommand(),
>> ufshcd_exec_dev_cmd() and ufshcd_issue_devman_upiu_cmd().
> 
> Oops I missed it. Sorry for this noise.
> 
> Feel free to add
> Reviewed-by: Stanley Chu <stanley.chu@mediatek.com>

Thanks for your review! :)

Regards,

Can Guo.

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2020-12-03  5:22 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <1606910644-21185-1-git-send-email-cang@codeaurora.org>
2020-12-02 12:04 ` [PATCH V7 1/3] scsi: ufs: Serialize eh_work with system PM events and async scan Can Guo
2020-12-02 12:04 ` [PATCH V7 2/3] scsi: ufs: Fix a race condition between ufshcd_abort and eh_work Can Guo
2020-12-03  2:21   ` Stanley Chu
2020-12-03  4:01     ` Can Guo
2020-12-03  4:51       ` Stanley Chu
2020-12-03  5:11         ` Can Guo
2020-12-03  5:19           ` Stanley Chu
2020-12-03  5:21             ` Can Guo
2020-12-02 12:04 ` [PATCH V7 3/3] scsi: ufs: Print host regs in IRQ handler when AH8 error happens Can Guo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).