[PATCH v2 1/2] scsi: fix race between simultaneous decrements of ->host_failed

* [PATCH v2 1/2] scsi: fix race between simultaneous decrements of ->host_failed
@ 2016-05-31  8:38 Wei Fang
  2016-05-31  8:38 ` [PATCH v2 2/2] Documentation/scsi: update scsi_eh.txt about ->host_failed Wei Fang
                   ` (4 more replies)
  0 siblings, 5 replies; 14+ messages in thread
From: Wei Fang @ 2016-05-31  8:38 UTC (permalink / raw)
  To: tj, jejb, martin.petersen, corbet
  Cc: hch, dan.j.williams, linux-ide, linux-scsi, linux-doc, Wei Fang

sas_ata_strategy_handler() adds the works of the ata error handler
to system_unbound_wq. This workqueue asynchronously runs work items,
so the ata error handler will be performed concurrently on different
CPUs. In this case, ->host_failed will be decreased simultaneously in
scsi_eh_finish_cmd() on different CPUs, and become abnormal.

It will lead to permanently inequal between ->host_failed and
 ->host_busy, and scsi error handler thread won't become running.
IO errors after that won't be handled forever.

Use atomic type for ->host_failed to fix this race.

This fixes the problem introduced in
commit 50824d6c5657 ("[SCSI] libsas: async ata-eh").

Reviewed-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Wei Fang <fangwei1@huawei.com>
---
 drivers/ata/libata-eh.c             |    2 +-
 drivers/scsi/libsas/sas_scsi_host.c |    5 +++--
 drivers/scsi/scsi.c                 |    2 +-
 drivers/scsi/scsi_error.c           |   15 +++++++++------
 drivers/scsi/scsi_lib.c             |    3 ++-
 include/scsi/scsi_host.h            |    2 +-
 6 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 961acc7..a0e7612 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -606,7 +606,7 @@ void ata_scsi_error(struct Scsi_Host *host)
 	ata_scsi_port_error_handler(host, ap);
 
 	/* finish or retry handled scmd's and clean up */
-	WARN_ON(host->host_failed || !list_empty(&eh_work_q));
+	WARN_ON(atomic_read(&host->host_failed) || !list_empty(&eh_work_q));
 
 	DPRINTK("EXIT\n");
 }
diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c
index 519dac4..8d74003 100644
--- a/drivers/scsi/libsas/sas_scsi_host.c
+++ b/drivers/scsi/libsas/sas_scsi_host.c
@@ -757,7 +757,8 @@ retry:
 	spin_unlock_irq(shost->host_lock);
 
 	SAS_DPRINTK("Enter %s busy: %d failed: %d\n",
-		    __func__, atomic_read(&shost->host_busy), shost->host_failed);
+		    __func__, atomic_read(&shost->host_busy),
+		    atomic_read(&shost->host_failed));
 	/*
 	 * Deal with commands that still have SAS tasks (i.e. they didn't
 	 * complete via the normal sas_task completion mechanism),
@@ -800,7 +801,7 @@ out:
 
 	SAS_DPRINTK("--- Exit %s: busy: %d failed: %d tries: %d\n",
 		    __func__, atomic_read(&shost->host_busy),
-		    shost->host_failed, tries);
+		    atomic_read(&shost->host_failed), tries);
 }
 
 enum blk_eh_timer_return sas_scsi_timed_out(struct scsi_cmnd *cmd)
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index 1deb6ad..7840915 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -527,7 +527,7 @@ void scsi_log_completion(struct scsi_cmnd *cmd, int disposition)
 				scmd_printk(KERN_INFO, cmd,
 					    "scsi host busy %d failed %d\n",
 					    atomic_read(&cmd->device->host->host_busy),
-					    cmd->device->host->host_failed);
+					    atomic_read(&cmd->device->host->host_failed));
 		}
 	}
 }
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index 984ddcb..f37666f 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -62,7 +62,8 @@ static int scsi_try_to_abort_cmd(struct scsi_host_template *,
 /* called with shost->host_lock held */
 void scsi_eh_wakeup(struct Scsi_Host *shost)
 {
-	if (atomic_read(&shost->host_busy) == shost->host_failed) {
+	if (atomic_read(&shost->host_busy) ==
+	    atomic_read(&shost->host_failed)) {
 		trace_scsi_eh_wakeup(shost);
 		wake_up_process(shost->ehandler);
 		SCSI_LOG_ERROR_RECOVERY(5, shost_printk(KERN_INFO, shost,
@@ -250,7 +251,7 @@ int scsi_eh_scmd_add(struct scsi_cmnd *scmd, int eh_flag)
 		eh_flag &= ~SCSI_EH_CANCEL_CMD;
 	scmd->eh_eflags |= eh_flag;
 	list_add_tail(&scmd->eh_entry, &shost->eh_cmd_q);
-	shost->host_failed++;
+	atomic_inc(&shost->host_failed);
 	scsi_eh_wakeup(shost);
  out_unlock:
 	spin_unlock_irqrestore(shost->host_lock, flags);
@@ -1127,7 +1128,7 @@ static int scsi_eh_action(struct scsi_cmnd *scmd, int rtn)
  */
 void scsi_eh_finish_cmd(struct scsi_cmnd *scmd, struct list_head *done_q)
 {
-	scmd->device->host->host_failed--;
+	atomic_dec(&scmd->device->host->host_failed);
 	scmd->eh_eflags = 0;
 	list_move_tail(&scmd->eh_entry, done_q);
 }
@@ -2190,8 +2191,10 @@ int scsi_error_handler(void *data)
 		if (kthread_should_stop())
 			break;
 
-		if ((shost->host_failed == 0 && shost->host_eh_scheduled == 0) ||
-		    shost->host_failed != atomic_read(&shost->host_busy)) {
+		if ((atomic_read(&shost->host_failed) == 0 &&
+		     shost->host_eh_scheduled == 0) ||
+		    (atomic_read(&shost->host_failed) !=
+		     atomic_read(&shost->host_busy))) {
 			SCSI_LOG_ERROR_RECOVERY(1,
 				shost_printk(KERN_INFO, shost,
 					     "scsi_eh_%d: sleeping\n",
@@ -2205,7 +2208,7 @@ int scsi_error_handler(void *data)
 			shost_printk(KERN_INFO, shost,
 				     "scsi_eh_%d: waking up %d/%d/%d\n",
 				     shost->host_no, shost->host_eh_scheduled,
-				     shost->host_failed,
+				     atomic_read(&shost->host_failed),
 				     atomic_read(&shost->host_busy)));
 
 		/*
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 8106515..fb3cc5d 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -318,7 +318,8 @@ void scsi_device_unbusy(struct scsi_device *sdev)
 		atomic_dec(&starget->target_busy);
 
 	if (unlikely(scsi_host_in_recovery(shost) &&
-		     (shost->host_failed || shost->host_eh_scheduled))) {
+		     (atomic_read(&shost->host_failed) ||
+		      shost->host_eh_scheduled))) {
 		spin_lock_irqsave(shost->host_lock, flags);
 		scsi_eh_wakeup(shost);
 		spin_unlock_irqrestore(shost->host_lock, flags);
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index fcfa3d7..654435f 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -576,7 +576,7 @@ struct Scsi_Host {
 	atomic_t host_busy;		   /* commands actually active on low-level */
 	atomic_t host_blocked;
 
-	unsigned int host_failed;	   /* commands that failed.
+	atomic_t host_failed;		   /* commands that failed.
 					      protected by host_lock */
 	unsigned int host_eh_scheduled;    /* EH scheduled without command */
     
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread