[PATCH v1 0/2] Fix some racing problems btw err_handler and paths like system PM ops and the task abort callback

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v1 0/2] Fix some racing problems btw err_handler and paths like system PM ops and the task abort callback
@ 2020-09-22  5:32 Can Guo
  2020-09-22  5:32 ` [PATCH v1 1/2] scsi: ufs: Serialize eh_work with system PM events and async scan Can Guo
  2020-09-22  5:32 ` [PATCH v1 2/2] scsi: ufs: Fix a racing problem between ufshcd_abort and eh_work Can Guo
  0 siblings, 2 replies; 6+ messages in thread
From: Can Guo @ 2020-09-22  5:32 UTC (permalink / raw)
  To: asutoshd, nguyenb, hongwus, rnayak, linux-scsi, kernel-team,
	saravanak, salyzyn, cang

This series mainly fixes racing problems btw err_handler and paths like
system PM ops, async scan and task abort callback. 

This series has been tested with error/fault injections to system PM
operations, async scan and task abort to the UFS device W-LU.

Can Guo (2):
  scsi: ufs: Serialize eh_work with system PM events and async scan
  scsi: ufs: Fix a racing problem between ufshcd_abort and eh_work

 drivers/scsi/ufs/ufshcd.c | 122 ++++++++++++++++++++++++++++++++--------------
 drivers/scsi/ufs/ufshcd.h |   3 ++
 2 files changed, 89 insertions(+), 36 deletions(-)

-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.


^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH v1 1/2] scsi: ufs: Serialize eh_work with system PM events and async scan
  2020-09-22  5:32 [PATCH v1 0/2] Fix some racing problems btw err_handler and paths like system PM ops and the task abort callback Can Guo
@ 2020-09-22  5:32 ` Can Guo
  2020-10-20 14:19   ` Bean Huo
  2020-09-22  5:32 ` [PATCH v1 2/2] scsi: ufs: Fix a racing problem between ufshcd_abort and eh_work Can Guo
  1 sibling, 1 reply; 6+ messages in thread
From: Can Guo @ 2020-09-22  5:32 UTC (permalink / raw)
  To: asutoshd, nguyenb, hongwus, rnayak, linux-scsi, kernel-team,
	saravanak, salyzyn, cang
  Cc: Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Bart Van Assche,
	Satya Tangirala, open list

Serialize eh_work with system PM events and async scan to make sure eh_work
does not run in parallel with them.

Change-Id: I33012c68e2ea443950313c59a4a46ad88cf3c82d
Signed-off-by: Can Guo <cang@codeaurora.org>
---
 drivers/scsi/ufs/ufshcd.c | 64 +++++++++++++++++++++++++++++------------------
 drivers/scsi/ufs/ufshcd.h |  1 +
 2 files changed, 41 insertions(+), 24 deletions(-)

diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index 1d8134e..7e764e8 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -5597,7 +5597,9 @@ static inline void ufshcd_schedule_eh_work(struct ufs_hba *hba)
 static void ufshcd_err_handling_prepare(struct ufs_hba *hba)
 {
 	pm_runtime_get_sync(hba->dev);
-	if (pm_runtime_suspended(hba->dev)) {
+	if (pm_runtime_status_suspended(hba->dev) || hba->is_sys_suspended) {
+		enum ufs_pm_op pm_op;
+
 		/*
 		 * Don't assume anything of pm_runtime_get_sync(), if
 		 * resume fails, irq and clocks can be OFF, and powers
@@ -5612,7 +5614,8 @@ static void ufshcd_err_handling_prepare(struct ufs_hba *hba)
 		if (!ufshcd_is_clkgating_allowed(hba))
 			ufshcd_setup_clocks(hba, true);
 		ufshcd_release(hba);
-		ufshcd_vops_resume(hba, UFS_RUNTIME_PM);
+		pm_op = hba->is_sys_suspended ? UFS_RUNTIME_PM : UFS_SYSTEM_PM;
+		ufshcd_vops_resume(hba, pm_op);
 	} else {
 		ufshcd_hold(hba, false);
 		if (hba->clk_scaling.is_allowed) {
@@ -5633,7 +5636,7 @@ static void ufshcd_err_handling_unprepare(struct ufs_hba *hba)
 
 static inline bool ufshcd_err_handling_should_stop(struct ufs_hba *hba)
 {
-	return (hba->ufshcd_state == UFSHCD_STATE_ERROR ||
+	return (!hba->is_powered || hba->ufshcd_state == UFSHCD_STATE_ERROR ||
 		(!(hba->saved_err || hba->saved_uic_err || hba->force_reset ||
 			ufshcd_is_link_broken(hba))));
 }
@@ -5646,6 +5649,7 @@ static void ufshcd_recover_pm_error(struct ufs_hba *hba)
 	struct request_queue *q;
 	int ret;
 
+	hba->is_sys_suspended = false;
 	/*
 	 * Set RPM status of hba device to RPM_ACTIVE,
 	 * this also clears its runtime error.
@@ -5704,11 +5708,13 @@ static void ufshcd_err_handler(struct work_struct *work)
 
 	hba = container_of(work, struct ufs_hba, eh_work);
 
+	down(&hba->eh_sem);
 	spin_lock_irqsave(hba->host->host_lock, flags);
 	if (ufshcd_err_handling_should_stop(hba)) {
 		if (hba->ufshcd_state != UFSHCD_STATE_ERROR)
 			hba->ufshcd_state = UFSHCD_STATE_OPERATIONAL;
 		spin_unlock_irqrestore(hba->host->host_lock, flags);
+		up(&hba->eh_sem);
 		return;
 	}
 	ufshcd_set_eh_in_progress(hba);
@@ -5716,20 +5722,18 @@ static void ufshcd_err_handler(struct work_struct *work)
 	ufshcd_err_handling_prepare(hba);
 	spin_lock_irqsave(hba->host->host_lock, flags);
 	ufshcd_scsi_block_requests(hba);
-	/*
-	 * A full reset and restore might have happened after preparation
-	 * is finished, double check whether we should stop.
-	 */
-	if (ufshcd_err_handling_should_stop(hba)) {
-		if (hba->ufshcd_state != UFSHCD_STATE_ERROR)
-			hba->ufshcd_state = UFSHCD_STATE_OPERATIONAL;
-		goto out;
-	}
 	hba->ufshcd_state = UFSHCD_STATE_RESET;
 
 	/* Complete requests that have door-bell cleared by h/w */
 	ufshcd_complete_requests(hba);
 
+	/*
+	 * A full reset and restore might have happened after preparation
+	 * is finished, double check whether we should stop.
+	 */
+	if (ufshcd_err_handling_should_stop(hba))
+		goto skip_err_handling;
+
 	if (hba->dev_quirks & UFS_DEVICE_QUIRK_RECOVERY_FROM_DL_NAC_ERRORS) {
 		bool ret;
 
@@ -5737,17 +5741,10 @@ static void ufshcd_err_handler(struct work_struct *work)
 		/* release the lock as ufshcd_quirk_dl_nac_errors() may sleep */
 		ret = ufshcd_quirk_dl_nac_errors(hba);
 		spin_lock_irqsave(hba->host->host_lock, flags);
-		if (!ret && !hba->force_reset && ufshcd_is_link_active(hba))
+		if (!ret && ufshcd_err_handling_should_stop(hba))
 			goto skip_err_handling;
 	}
 
-	if (hba->force_reset || ufshcd_is_link_broken(hba) ||
-	    ufshcd_is_saved_err_fatal(hba) ||
-	    ((hba->saved_err & UIC_ERROR) &&
-	     (hba->saved_uic_err & (UFSHCD_UIC_DL_NAC_RECEIVED_ERROR |
-				    UFSHCD_UIC_DL_TCx_REPLAY_ERROR))))
-		needs_reset = true;
-
 	if ((hba->saved_err & (INT_FATAL_ERRORS | UFSHCD_UIC_HIBERN8_MASK)) ||
 	    (hba->saved_uic_err &&
 	     (hba->saved_uic_err != UFSHCD_UIC_PA_GENERIC_ERROR))) {
@@ -5767,8 +5764,14 @@ static void ufshcd_err_handler(struct work_struct *work)
 	 * transfers forcefully because they will get cleared during
 	 * host reset and restore
 	 */
-	if (needs_reset)
+	if (hba->force_reset || ufshcd_is_link_broken(hba) ||
+	    ufshcd_is_saved_err_fatal(hba) ||
+	    ((hba->saved_err & UIC_ERROR) &&
+	     (hba->saved_uic_err & (UFSHCD_UIC_DL_NAC_RECEIVED_ERROR |
+				    UFSHCD_UIC_DL_TCx_REPLAY_ERROR)))) {
+		needs_reset = true;
 		goto do_reset;
+	}
 
 	/*
 	 * If LINERESET was caught, UFS might have been put to PWM mode,
@@ -5876,12 +5879,11 @@ static void ufshcd_err_handler(struct work_struct *work)
 			dev_err_ratelimited(hba->dev, "%s: exit: saved_err 0x%x saved_uic_err 0x%x",
 			    __func__, hba->saved_err, hba->saved_uic_err);
 	}
-
-out:
 	ufshcd_clear_eh_in_progress(hba);
 	spin_unlock_irqrestore(hba->host->host_lock, flags);
 	ufshcd_scsi_unblock_requests(hba);
 	ufshcd_err_handling_unprepare(hba);
+	up(&hba->eh_sem);
 }
 
 /**
@@ -6856,6 +6858,7 @@ static int ufshcd_reset_and_restore(struct ufs_hba *hba)
 	 */
 	scsi_report_bus_reset(hba->host, 0);
 	if (err) {
+		hba->ufshcd_state = UFSHCD_STATE_ERROR;
 		hba->saved_err |= saved_err;
 		hba->saved_uic_err |= saved_uic_err;
 	}
@@ -7704,8 +7707,10 @@ static void ufshcd_async_scan(void *data, async_cookie_t cookie)
 	struct ufs_hba *hba = (struct ufs_hba *)data;
 	int ret;
 
+	down(&hba->eh_sem);
 	/* Initialize hba, detect and initialize UFS device */
 	ret = ufshcd_probe_hba(hba, true);
+	up(&hba->eh_sem);
 	if (ret)
 		goto out;
 
@@ -8718,6 +8723,7 @@ int ufshcd_system_suspend(struct ufs_hba *hba)
 	int ret = 0;
 	ktime_t start = ktime_get();
 
+	down(&hba->eh_sem);
 	if (!hba || !hba->is_powered)
 		return 0;
 
@@ -8748,6 +8754,8 @@ int ufshcd_system_suspend(struct ufs_hba *hba)
 		hba->curr_dev_pwr_mode, hba->uic_link_state);
 	if (!ret)
 		hba->is_sys_suspended = true;
+	else
+		up(&hba->eh_sem);
 	return ret;
 }
 EXPORT_SYMBOL(ufshcd_system_suspend);
@@ -8764,8 +8772,10 @@ int ufshcd_system_resume(struct ufs_hba *hba)
 	int ret = 0;
 	ktime_t start = ktime_get();
 
-	if (!hba)
+	if (!hba) {
+		up(&hba->eh_sem);
 		return -EINVAL;
+	}
 
 	if (!hba->is_powered || pm_runtime_suspended(hba->dev))
 		/*
@@ -8781,6 +8791,7 @@ int ufshcd_system_resume(struct ufs_hba *hba)
 		hba->curr_dev_pwr_mode, hba->uic_link_state);
 	if (!ret)
 		hba->is_sys_suspended = false;
+	up(&hba->eh_sem);
 	return ret;
 }
 EXPORT_SYMBOL(ufshcd_system_resume);
@@ -8872,6 +8883,7 @@ int ufshcd_shutdown(struct ufs_hba *hba)
 {
 	int ret = 0;
 
+	down(&hba->eh_sem);
 	if (!hba->is_powered)
 		goto out;
 
@@ -8888,6 +8900,8 @@ int ufshcd_shutdown(struct ufs_hba *hba)
 out:
 	if (ret)
 		dev_err(hba->dev, "%s failed, err %d\n", __func__, ret);
+	hba->is_powered = false;
+	up(&hba->eh_sem);
 	/* allow force shutdown even in case of errors */
 	return 0;
 }
@@ -9082,6 +9096,8 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq)
 	INIT_WORK(&hba->eh_work, ufshcd_err_handler);
 	INIT_WORK(&hba->eeh_work, ufshcd_exception_event_handler);
 
+	sema_init(&hba->eh_sem, 1);
+
 	/* Initialize UIC command mutex */
 	mutex_init(&hba->uic_cmd_mutex);
 
diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
index 47eb143..1e680bf 100644
--- a/drivers/scsi/ufs/ufshcd.h
+++ b/drivers/scsi/ufs/ufshcd.h
@@ -728,6 +728,7 @@ struct ufs_hba {
 	u32 intr_mask;
 	u16 ee_ctrl_mask;
 	bool is_powered;
+	struct semaphore eh_sem;
 
 	/* Work Queues */
 	struct workqueue_struct *eh_wq;
-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH v1 1/2] scsi: ufs: Serialize eh_work with system PM events and async scan
  2020-09-22  5:32 ` [PATCH v1 1/2] scsi: ufs: Serialize eh_work with system PM events and async scan Can Guo
@ 2020-10-20 14:19   ` Bean Huo
  2020-10-21  1:18     ` Can Guo
  0 siblings, 1 reply; 6+ messages in thread
From: Bean Huo @ 2020-10-20 14:19 UTC (permalink / raw)
  To: Can Guo, asutoshd, nguyenb, hongwus, rnayak, linux-scsi,
	kernel-team, saravanak, salyzyn
  Cc: Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Bart Van Assche,
	Satya Tangirala, open list


On Mon, 2020-09-21 at 22:32 -0700, Can Guo wrote:
> Serialize eh_work with system PM events and async scan to make sure
> eh_work
> does not run in parallel with them.
> 
> Change-Id: I33012c68e2ea443950313c59a4a46ad88cf3c82d
> Signed-off-by: Can Guo <cang@codeaurora.org>
> ---
>  drivers/scsi/ufs/ufshcd.c | 64 +++++++++++++++++++++++++++++------
> ------------
>  drivers/scsi/ufs/ufshcd.h |  1 +
>  2 files changed, 41 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
> index 1d8134e..7e764e8 100644
> --- a/drivers/scsi/ufs/ufshcd.c
> +++ b/drivers/scsi/ufs/ufshcd.c
> @@ -5597,7 +5597,9 @@ static inline void
> ufshcd_schedule_eh_work(struct ufs_hba *hba)
>  static void ufshcd_err_handling_prepare(struct ufs_hba *hba)
>  {
>  	pm_runtime_get_sync(hba->dev);
> -	if (pm_runtime_suspended(hba->dev)) {
> +	if (pm_runtime_status_suspended(hba->dev) || hba-
> >is_sys_suspended) {

Hi Can

I curiously want to know how this can be produced in real system.

IMO, The system has been in system PM suspeded mode, how can trigger
error handler? because the tasks have been freezed, the device
interrupts disabled, before entering system PM suspended mode, the
tasks in the queue should be completed, otherwises, there is bug in the
whole flow.


thanks,
Bean


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v1 1/2] scsi: ufs: Serialize eh_work with system PM events and async scan
  2020-10-20 14:19   ` Bean Huo
@ 2020-10-21  1:18     ` Can Guo
  0 siblings, 0 replies; 6+ messages in thread
From: Can Guo @ 2020-10-21  1:18 UTC (permalink / raw)
  To: Bean Huo
  Cc: asutoshd, nguyenb, hongwus, rnayak, linux-scsi, kernel-team,
	saravanak, salyzyn, Alim Akhtar, Avri Altman,
	James E.J. Bottomley, Martin K. Petersen, Stanley Chu, Bean Huo,
	Bart Van Assche, Satya Tangirala, open list

On 2020-10-20 22:19, Bean Huo wrote:
> On Mon, 2020-09-21 at 22:32 -0700, Can Guo wrote:
>> Serialize eh_work with system PM events and async scan to make sure
>> eh_work
>> does not run in parallel with them.
>> 
>> Change-Id: I33012c68e2ea443950313c59a4a46ad88cf3c82d
>> Signed-off-by: Can Guo <cang@codeaurora.org>
>> ---
>>  drivers/scsi/ufs/ufshcd.c | 64 +++++++++++++++++++++++++++++------
>> ------------
>>  drivers/scsi/ufs/ufshcd.h |  1 +
>>  2 files changed, 41 insertions(+), 24 deletions(-)
>> 
>> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
>> index 1d8134e..7e764e8 100644
>> --- a/drivers/scsi/ufs/ufshcd.c
>> +++ b/drivers/scsi/ufs/ufshcd.c
>> @@ -5597,7 +5597,9 @@ static inline void
>> ufshcd_schedule_eh_work(struct ufs_hba *hba)
>>  static void ufshcd_err_handling_prepare(struct ufs_hba *hba)
>>  {
>>  	pm_runtime_get_sync(hba->dev);
>> -	if (pm_runtime_suspended(hba->dev)) {
>> +	if (pm_runtime_status_suspended(hba->dev) || hba-
>> >is_sys_suspended) {
> 
> Hi Can
> 
> I curiously want to know how this can be produced in real system.
> 
> IMO, The system has been in system PM suspeded mode, how can trigger
> error handler? because the tasks have been freezed, the device
> interrupts disabled, before entering system PM suspended mode, the
> tasks in the queue should be completed, otherwises, there is bug in the
> whole flow.
> 
> 
> thanks,
> Bean

Hi Bean,

You might misunderstand here - the hba->is_sys_suspended check is
not for the case that system has entered suspend, but for the case a 
resume
failure happens. If system resume fails, hba->is_sys_suspended is set 
and
powers/clks/IRQs are OFF, so we need to prepare the environments for 
err_handler to run.
To reproduce this scenario, you can fake a hibern8 exit failure during 
system resume.

If the whole system has entered suspend, of course err_handler won't 
run.
I guess your real concern is that if UFS has entered system suspend (not 
the whole system),
but err_handling was somehow scheduled (most likely due to an non-fatal 
error).
This definitely is a problem and it is also why I make this change. With 
this change,
if UFS has entered system suspend, the eh_sem lock is held, err_handler 
won't even get
a chance to run, the err_handler will run only after UFS is resumed.

Thanks,

Can Guo.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH v1 2/2] scsi: ufs: Fix a racing problem between ufshcd_abort and eh_work
  2020-09-22  5:32 [PATCH v1 0/2] Fix some racing problems btw err_handler and paths like system PM ops and the task abort callback Can Guo
  2020-09-22  5:32 ` [PATCH v1 1/2] scsi: ufs: Serialize eh_work with system PM events and async scan Can Guo
@ 2020-09-22  5:32 ` Can Guo
  1 sibling, 0 replies; 6+ messages in thread
From: Can Guo @ 2020-09-22  5:32 UTC (permalink / raw)
  To: asutoshd, nguyenb, hongwus, rnayak, linux-scsi, kernel-team,
	saravanak, salyzyn, cang
  Cc: Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Bart Van Assche,
	Satya Tangirala, open list

In current task abort routine, if task abort happens to the device W-LU,
the code directly jumps to ufshcd_eh_host_reset_handler() to perform a
full reset and restore then returns FAIL or SUCCESS. Commands sent to the
device W-LU are most likely the SSU cmds sent during UFS PM operations. If
such SSU cmd enters task abort routine, when ufshcd_eh_host_reset_handler()
flushes eh_work, there will be racing because err_handler is serialized
with any PM operations.

Since the main idea of aborting one cmd to the device W-LU is to perform
a full reset and restore, in order to resolve the racing problem, we merely
clean up the lrb taken by this cmd, queue the eh_work and abort the cmd.
Since the cmd has been aborted, the PM operation which sends the cmd simply
errors out, thus err_handler will not be blocked by ongoing PM operations
and err_handler can also recover PM error if any, which comes as another
benefit of this change.

Because such cmd is aborted even before it is actually cleared from HW, set
the lrb->in_use flag to prevent subsequent cmds, including SCSI cmds and
dev cmds, from taking the lrb released by this cmd. Flag lrb->in_use shall
evetually be cleared in __ufshcd_transfer_req_compl() invoked by the full
reset and restore from err_handler.

Signed-off-by: Can Guo <cang@codeaurora.org>
---
 drivers/scsi/ufs/ufshcd.c | 58 +++++++++++++++++++++++++++++++++++++----------
 drivers/scsi/ufs/ufshcd.h |  2 ++
 2 files changed, 48 insertions(+), 12 deletions(-)

diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index 7e764e8..e4cb994 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -2539,6 +2539,14 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
 		(hba->clk_gating.state != CLKS_ON));
 
 	lrbp = &hba->lrb[tag];
+	if (unlikely(lrbp->in_use)) {
+		if (hba->pm_op_in_progress)
+			set_host_byte(cmd, DID_BAD_TARGET);
+		else
+			err = SCSI_MLQUEUE_HOST_BUSY;
+		ufshcd_release(hba);
+		goto out;
+	}
 
 	WARN_ON(lrbp->cmd);
 	lrbp->cmd = cmd;
@@ -2781,6 +2789,11 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba *hba,
 
 	init_completion(&wait);
 	lrbp = &hba->lrb[tag];
+	if (unlikely(lrbp->in_use)) {
+		err = -EBUSY;
+		goto out;
+	}
+
 	WARN_ON(lrbp->cmd);
 	err = ufshcd_compose_dev_cmd(hba, lrbp, cmd_type, tag);
 	if (unlikely(err))
@@ -2797,6 +2810,7 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba *hba,
 
 	err = ufshcd_wait_for_dev_cmd(hba, lrbp, timeout);
 
+out:
 	ufshcd_add_query_upiu_trace(hba, tag,
 			err ? "query_complete_err" : "query_complete");
 
@@ -4932,6 +4946,7 @@ static void __ufshcd_transfer_req_compl(struct ufs_hba *hba,
 
 	for_each_set_bit(index, &completed_reqs, hba->nutrs) {
 		lrbp = &hba->lrb[index];
+		lrbp->in_use = false;
 		lrbp->compl_time_stamp = ktime_get();
 		cmd = lrbp->cmd;
 		if (cmd) {
@@ -6374,8 +6389,12 @@ static int ufshcd_issue_devman_upiu_cmd(struct ufs_hba *hba,
 
 	init_completion(&wait);
 	lrbp = &hba->lrb[tag];
-	WARN_ON(lrbp->cmd);
+	if (unlikely(lrbp->in_use)) {
+		err = -EBUSY;
+		goto out;
+	}
 
+	WARN_ON(lrbp->cmd);
 	lrbp->cmd = NULL;
 	lrbp->sense_bufflen = 0;
 	lrbp->sense_buffer = NULL;
@@ -6447,6 +6466,7 @@ static int ufshcd_issue_devman_upiu_cmd(struct ufs_hba *hba,
 		}
 	}
 
+out:
 	blk_put_request(req);
 out_unlock:
 	up_read(&hba->clk_scaling_lock);
@@ -6696,16 +6716,6 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
 		BUG();
 	}
 
-	/*
-	 * Task abort to the device W-LUN is illegal. When this command
-	 * will fail, due to spec violation, scsi err handling next step
-	 * will be to send LU reset which, again, is a spec violation.
-	 * To avoid these unnecessary/illegal step we skip to the last error
-	 * handling stage: reset and restore.
-	 */
-	if (lrbp->lun == UFS_UPIU_UFS_DEVICE_WLUN)
-		return ufshcd_eh_host_reset_handler(cmd);
-
 	ufshcd_hold(hba, false);
 	reg = ufshcd_readl(hba, REG_UTP_TRANSFER_REQ_DOOR_BELL);
 	/* If command is already aborted/completed, return SUCCESS */
@@ -6726,7 +6736,7 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
 	 * to reduce repeated printouts. For other aborted requests only print
 	 * basic details.
 	 */
-	scsi_print_command(hba->lrb[tag].cmd);
+	scsi_print_command(cmd);
 	if (!hba->req_abort_count) {
 		ufshcd_update_reg_hist(&hba->ufs_stats.task_abort, 0);
 		ufshcd_print_host_regs(hba);
@@ -6745,6 +6755,30 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
 		goto cleanup;
 	}
 
+	/*
+	 * Task abort to the device W-LUN is illegal. When this command
+	 * will fail, due to spec violation, scsi err handling next step
+	 * will be to send LU reset which, again, is a spec violation.
+	 * To avoid these unnecessary/illegal steps, first we clean up
+	 * the lrb taken by this cmd and mark the lrb as in_use, then
+	 * queue the eh_work and bail.
+	 */
+	if (lrbp->lun == UFS_UPIU_UFS_DEVICE_WLUN) {
+		struct scsi_cmnd *cmd_in_lrb;
+
+		spin_lock_irqsave(host->host_lock, flags);
+		cmd_in_lrb = lrbp->cmd;
+		if (cmd_in_lrb) {
+			__ufshcd_transfer_req_compl(hba, (1UL << tag));
+			__set_bit(tag, &hba->outstanding_reqs);
+			lrbp->in_use = true;
+			hba->force_reset = true;
+			ufshcd_schedule_eh_work(hba);
+		}
+		spin_unlock_irqrestore(host->host_lock, flags);
+		goto out;
+	}
+
 	/* Skip task abort in case previous aborts failed and report failure */
 	if (lrbp->req_abort_skip)
 		err = -EIO;
diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
index 1e680bf..66e5338 100644
--- a/drivers/scsi/ufs/ufshcd.h
+++ b/drivers/scsi/ufs/ufshcd.h
@@ -163,6 +163,7 @@ struct ufs_pm_lvl_states {
  * @crypto_key_slot: the key slot to use for inline crypto (-1 if none)
  * @data_unit_num: the data unit number for the first block for inline crypto
  * @req_abort_skip: skip request abort task flag
+ * @in_use: indicates that this lrb is still in use
  */
 struct ufshcd_lrb {
 	struct utp_transfer_req_desc *utr_descriptor_ptr;
@@ -192,6 +193,7 @@ struct ufshcd_lrb {
 #endif
 
 	bool req_abort_skip;
+	bool in_use;
 };
 
 /**
-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH v2 0/2] Fix some racing problems btw err_handler and paths like system PM ops and the task abort callback
@ 2020-09-22  6:47 Can Guo
  2020-09-22  6:47 ` [PATCH v1 1/2] scsi: ufs: Serialize eh_work with system PM events and async scan Can Guo
  0 siblings, 1 reply; 6+ messages in thread
From: Can Guo @ 2020-09-22  6:47 UTC (permalink / raw)
  To: asutoshd, nguyenb, hongwus, rnayak, linux-scsi, kernel-team,
	saravanak, salyzyn, cang

This series mainly fixes racing problems btw err_handler and paths like
system PM ops, async scan and task abort callback. 

This series has been tested with error/fault injections to system PM
operations, async scan and task abort to the UFS device W-LU.

Change since v1:
- Removed Change-Id from commit msg

Can Guo (2):
  scsi: ufs: Serialize eh_work with system PM events and async scan
  scsi: ufs: Fix a racing problem between ufshcd_abort and eh_work

 drivers/scsi/ufs/ufshcd.c | 122 ++++++++++++++++++++++++++++++++--------------
 drivers/scsi/ufs/ufshcd.h |   3 ++
 2 files changed, 89 insertions(+), 36 deletions(-)

-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.


^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH v1 1/2] scsi: ufs: Serialize eh_work with system PM events and async scan
  2020-09-22  6:47 [PATCH v2 0/2] Fix some racing problems btw err_handler and paths like system PM ops and the task abort callback Can Guo
@ 2020-09-22  6:47 ` Can Guo
  0 siblings, 0 replies; 6+ messages in thread
From: Can Guo @ 2020-09-22  6:47 UTC (permalink / raw)
  To: asutoshd, nguyenb, hongwus, rnayak, linux-scsi, kernel-team,
	saravanak, salyzyn, cang
  Cc: Alim Akhtar, Avri Altman, James E.J. Bottomley,
	Martin K. Petersen, Stanley Chu, Bean Huo, Bart Van Assche,
	Satya Tangirala, open list

Serialize eh_work with system PM events and async scan to make sure eh_work
does not run in parallel with them.

Change-Id: I33012c68e2ea443950313c59a4a46ad88cf3c82d
Signed-off-by: Can Guo <cang@codeaurora.org>
---
 drivers/scsi/ufs/ufshcd.c | 64 +++++++++++++++++++++++++++++------------------
 drivers/scsi/ufs/ufshcd.h |  1 +
 2 files changed, 41 insertions(+), 24 deletions(-)

diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index 1d8134e..7e764e8 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -5597,7 +5597,9 @@ static inline void ufshcd_schedule_eh_work(struct ufs_hba *hba)
 static void ufshcd_err_handling_prepare(struct ufs_hba *hba)
 {
 	pm_runtime_get_sync(hba->dev);
-	if (pm_runtime_suspended(hba->dev)) {
+	if (pm_runtime_status_suspended(hba->dev) || hba->is_sys_suspended) {
+		enum ufs_pm_op pm_op;
+
 		/*
 		 * Don't assume anything of pm_runtime_get_sync(), if
 		 * resume fails, irq and clocks can be OFF, and powers
@@ -5612,7 +5614,8 @@ static void ufshcd_err_handling_prepare(struct ufs_hba *hba)
 		if (!ufshcd_is_clkgating_allowed(hba))
 			ufshcd_setup_clocks(hba, true);
 		ufshcd_release(hba);
-		ufshcd_vops_resume(hba, UFS_RUNTIME_PM);
+		pm_op = hba->is_sys_suspended ? UFS_RUNTIME_PM : UFS_SYSTEM_PM;
+		ufshcd_vops_resume(hba, pm_op);
 	} else {
 		ufshcd_hold(hba, false);
 		if (hba->clk_scaling.is_allowed) {
@@ -5633,7 +5636,7 @@ static void ufshcd_err_handling_unprepare(struct ufs_hba *hba)
 
 static inline bool ufshcd_err_handling_should_stop(struct ufs_hba *hba)
 {
-	return (hba->ufshcd_state == UFSHCD_STATE_ERROR ||
+	return (!hba->is_powered || hba->ufshcd_state == UFSHCD_STATE_ERROR ||
 		(!(hba->saved_err || hba->saved_uic_err || hba->force_reset ||
 			ufshcd_is_link_broken(hba))));
 }
@@ -5646,6 +5649,7 @@ static void ufshcd_recover_pm_error(struct ufs_hba *hba)
 	struct request_queue *q;
 	int ret;
 
+	hba->is_sys_suspended = false;
 	/*
 	 * Set RPM status of hba device to RPM_ACTIVE,
 	 * this also clears its runtime error.
@@ -5704,11 +5708,13 @@ static void ufshcd_err_handler(struct work_struct *work)
 
 	hba = container_of(work, struct ufs_hba, eh_work);
 
+	down(&hba->eh_sem);
 	spin_lock_irqsave(hba->host->host_lock, flags);
 	if (ufshcd_err_handling_should_stop(hba)) {
 		if (hba->ufshcd_state != UFSHCD_STATE_ERROR)
 			hba->ufshcd_state = UFSHCD_STATE_OPERATIONAL;
 		spin_unlock_irqrestore(hba->host->host_lock, flags);
+		up(&hba->eh_sem);
 		return;
 	}
 	ufshcd_set_eh_in_progress(hba);
@@ -5716,20 +5722,18 @@ static void ufshcd_err_handler(struct work_struct *work)
 	ufshcd_err_handling_prepare(hba);
 	spin_lock_irqsave(hba->host->host_lock, flags);
 	ufshcd_scsi_block_requests(hba);
-	/*
-	 * A full reset and restore might have happened after preparation
-	 * is finished, double check whether we should stop.
-	 */
-	if (ufshcd_err_handling_should_stop(hba)) {
-		if (hba->ufshcd_state != UFSHCD_STATE_ERROR)
-			hba->ufshcd_state = UFSHCD_STATE_OPERATIONAL;
-		goto out;
-	}
 	hba->ufshcd_state = UFSHCD_STATE_RESET;
 
 	/* Complete requests that have door-bell cleared by h/w */
 	ufshcd_complete_requests(hba);
 
+	/*
+	 * A full reset and restore might have happened after preparation
+	 * is finished, double check whether we should stop.
+	 */
+	if (ufshcd_err_handling_should_stop(hba))
+		goto skip_err_handling;
+
 	if (hba->dev_quirks & UFS_DEVICE_QUIRK_RECOVERY_FROM_DL_NAC_ERRORS) {
 		bool ret;
 
@@ -5737,17 +5741,10 @@ static void ufshcd_err_handler(struct work_struct *work)
 		/* release the lock as ufshcd_quirk_dl_nac_errors() may sleep */
 		ret = ufshcd_quirk_dl_nac_errors(hba);
 		spin_lock_irqsave(hba->host->host_lock, flags);
-		if (!ret && !hba->force_reset && ufshcd_is_link_active(hba))
+		if (!ret && ufshcd_err_handling_should_stop(hba))
 			goto skip_err_handling;
 	}
 
-	if (hba->force_reset || ufshcd_is_link_broken(hba) ||
-	    ufshcd_is_saved_err_fatal(hba) ||
-	    ((hba->saved_err & UIC_ERROR) &&
-	     (hba->saved_uic_err & (UFSHCD_UIC_DL_NAC_RECEIVED_ERROR |
-				    UFSHCD_UIC_DL_TCx_REPLAY_ERROR))))
-		needs_reset = true;
-
 	if ((hba->saved_err & (INT_FATAL_ERRORS | UFSHCD_UIC_HIBERN8_MASK)) ||
 	    (hba->saved_uic_err &&
 	     (hba->saved_uic_err != UFSHCD_UIC_PA_GENERIC_ERROR))) {
@@ -5767,8 +5764,14 @@ static void ufshcd_err_handler(struct work_struct *work)
 	 * transfers forcefully because they will get cleared during
 	 * host reset and restore
 	 */
-	if (needs_reset)
+	if (hba->force_reset || ufshcd_is_link_broken(hba) ||
+	    ufshcd_is_saved_err_fatal(hba) ||
+	    ((hba->saved_err & UIC_ERROR) &&
+	     (hba->saved_uic_err & (UFSHCD_UIC_DL_NAC_RECEIVED_ERROR |
+				    UFSHCD_UIC_DL_TCx_REPLAY_ERROR)))) {
+		needs_reset = true;
 		goto do_reset;
+	}
 
 	/*
 	 * If LINERESET was caught, UFS might have been put to PWM mode,
@@ -5876,12 +5879,11 @@ static void ufshcd_err_handler(struct work_struct *work)
 			dev_err_ratelimited(hba->dev, "%s: exit: saved_err 0x%x saved_uic_err 0x%x",
 			    __func__, hba->saved_err, hba->saved_uic_err);
 	}
-
-out:
 	ufshcd_clear_eh_in_progress(hba);
 	spin_unlock_irqrestore(hba->host->host_lock, flags);
 	ufshcd_scsi_unblock_requests(hba);
 	ufshcd_err_handling_unprepare(hba);
+	up(&hba->eh_sem);
 }
 
 /**
@@ -6856,6 +6858,7 @@ static int ufshcd_reset_and_restore(struct ufs_hba *hba)
 	 */
 	scsi_report_bus_reset(hba->host, 0);
 	if (err) {
+		hba->ufshcd_state = UFSHCD_STATE_ERROR;
 		hba->saved_err |= saved_err;
 		hba->saved_uic_err |= saved_uic_err;
 	}
@@ -7704,8 +7707,10 @@ static void ufshcd_async_scan(void *data, async_cookie_t cookie)
 	struct ufs_hba *hba = (struct ufs_hba *)data;
 	int ret;
 
+	down(&hba->eh_sem);
 	/* Initialize hba, detect and initialize UFS device */
 	ret = ufshcd_probe_hba(hba, true);
+	up(&hba->eh_sem);
 	if (ret)
 		goto out;
 
@@ -8718,6 +8723,7 @@ int ufshcd_system_suspend(struct ufs_hba *hba)
 	int ret = 0;
 	ktime_t start = ktime_get();
 
+	down(&hba->eh_sem);
 	if (!hba || !hba->is_powered)
 		return 0;
 
@@ -8748,6 +8754,8 @@ int ufshcd_system_suspend(struct ufs_hba *hba)
 		hba->curr_dev_pwr_mode, hba->uic_link_state);
 	if (!ret)
 		hba->is_sys_suspended = true;
+	else
+		up(&hba->eh_sem);
 	return ret;
 }
 EXPORT_SYMBOL(ufshcd_system_suspend);
@@ -8764,8 +8772,10 @@ int ufshcd_system_resume(struct ufs_hba *hba)
 	int ret = 0;
 	ktime_t start = ktime_get();
 
-	if (!hba)
+	if (!hba) {
+		up(&hba->eh_sem);
 		return -EINVAL;
+	}
 
 	if (!hba->is_powered || pm_runtime_suspended(hba->dev))
 		/*
@@ -8781,6 +8791,7 @@ int ufshcd_system_resume(struct ufs_hba *hba)
 		hba->curr_dev_pwr_mode, hba->uic_link_state);
 	if (!ret)
 		hba->is_sys_suspended = false;
+	up(&hba->eh_sem);
 	return ret;
 }
 EXPORT_SYMBOL(ufshcd_system_resume);
@@ -8872,6 +8883,7 @@ int ufshcd_shutdown(struct ufs_hba *hba)
 {
 	int ret = 0;
 
+	down(&hba->eh_sem);
 	if (!hba->is_powered)
 		goto out;
 
@@ -8888,6 +8900,8 @@ int ufshcd_shutdown(struct ufs_hba *hba)
 out:
 	if (ret)
 		dev_err(hba->dev, "%s failed, err %d\n", __func__, ret);
+	hba->is_powered = false;
+	up(&hba->eh_sem);
 	/* allow force shutdown even in case of errors */
 	return 0;
 }
@@ -9082,6 +9096,8 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq)
 	INIT_WORK(&hba->eh_work, ufshcd_err_handler);
 	INIT_WORK(&hba->eeh_work, ufshcd_exception_event_handler);
 
+	sema_init(&hba->eh_sem, 1);
+
 	/* Initialize UIC command mutex */
 	mutex_init(&hba->uic_cmd_mutex);
 
diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
index 47eb143..1e680bf 100644
--- a/drivers/scsi/ufs/ufshcd.h
+++ b/drivers/scsi/ufs/ufshcd.h
@@ -728,6 +728,7 @@ struct ufs_hba {
 	u32 intr_mask;
 	u16 ee_ctrl_mask;
 	bool is_powered;
+	struct semaphore eh_sem;
 
 	/* Work Queues */
 	struct workqueue_struct *eh_wq;
-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.


^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2020-10-21  1:19 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-09-22  5:32 [PATCH v1 0/2] Fix some racing problems btw err_handler and paths like system PM ops and the task abort callback Can Guo
2020-09-22  5:32 ` [PATCH v1 1/2] scsi: ufs: Serialize eh_work with system PM events and async scan Can Guo
2020-10-20 14:19   ` Bean Huo
2020-10-21  1:18     ` Can Guo
2020-09-22  5:32 ` [PATCH v1 2/2] scsi: ufs: Fix a racing problem between ufshcd_abort and eh_work Can Guo
2020-09-22  6:47 [PATCH v2 0/2] Fix some racing problems btw err_handler and paths like system PM ops and the task abort callback Can Guo
2020-09-22  6:47 ` [PATCH v1 1/2] scsi: ufs: Serialize eh_work with system PM events and async scan Can Guo

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.