* blk-mq timeout handling fixes
@ 2014-09-13 23:40 Christoph Hellwig
  2014-09-13 23:40 ` [PATCH 1/6] blk-mq: remove REQ_END Christoph Hellwig
                   ` (6 more replies)
  0 siblings, 7 replies; 11+ messages in thread
From: Christoph Hellwig @ 2014-09-13 23:40 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Robert Elliott, linux-scsi, linux-kernel

This series fixes various issues with timeout handling that Robert
ran into when testing scsi-mq heavily.  He tested an earlier version
and couldn't reproduce the issues anymore, although the series has
changed quite significantly since then and should probably be retested.

In summary, we now not only start the blk-mq timer inside the driver's
->queue_rq method after the request has been fully set up, but we
also tell the driver whether we're timing out a reserved (internal)
request or a real one.  Many drivers will need to handle those
internal ones differently, e.g. for scsi-mq we don't even have a
scsi command structure allocated for the reserved commands.
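
As an illustration of the driver-facing end result, here is a minimal
sketch of the new ->timeout contract described above; the foo_* names
are hypothetical, only the blk-mq helpers and return values are real:

static enum blk_eh_timer_return foo_timeout(struct request *rq, bool reserved)
{
	/*
	 * A reserved (internal) request may have no driver command set
	 * up behind it, so don't dereference the pdu in that case.
	 */
	if (reserved)
		return BLK_EH_RESET_TIMER;
	return foo_abort_cmd(blk_mq_rq_to_pdu(rq));
}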


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 1/6] blk-mq: remove REQ_END
  2014-09-13 23:40 blk-mq timeout handling fixes Christoph Hellwig
@ 2014-09-13 23:40 ` Christoph Hellwig
  2014-09-13 23:40 ` [PATCH 2/6] blk-mq: call blk_mq_start_request from ->queue_rq Christoph Hellwig
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 11+ messages in thread
From: Christoph Hellwig @ 2014-09-13 23:40 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Robert Elliott, linux-scsi, linux-kernel

Pass an explicit parameter for the last request in a batch to ->queue_rq
instead of using a request flag.  Besides being a cleaner and non-stateful
interface, this is also required for the next patch, which fixes the blk-mq
I/O submission code to not start the timer too early.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
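For illustration (not part of the patch itself), a minimal sketch of how a
driver can use the new explicit 'last' argument to batch its hardware
doorbell; the myblk_* names are hypothetical:

static int myblk_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq,
		bool last)
{
	struct myblk_queue *mq = hctx->driver_data;

	myblk_add_to_ring(mq, blk_mq_rq_to_pdu(rq));

	/*
	 * 'last' is set for the final request of the current dispatch
	 * batch, so the doorbell only needs to be rung once per batch
	 * (or earlier if the ring fills up).
	 */
	if (last || myblk_ring_full(mq))
		myblk_kick_hw(mq);

	return BLK_MQ_RQ_QUEUE_OK;
}
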
 block/blk-mq.c                    | 22 +++++-----------------
 drivers/block/mtip32xx/mtip32xx.c |  3 ++-
 drivers/block/null_blk.c          |  3 ++-
 drivers/block/virtio_blk.c        |  4 ++--
 drivers/scsi/scsi_lib.c           |  3 ++-
 include/linux/blk-mq.h            |  2 +-
 include/linux/blk_types.h         |  2 --
 7 files changed, 14 insertions(+), 25 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 383ea0c..42c94c8 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -380,7 +380,7 @@ void blk_mq_complete_request(struct request *rq)
 }
 EXPORT_SYMBOL(blk_mq_complete_request);
 
-static void blk_mq_start_request(struct request *rq, bool last)
+static void blk_mq_start_request(struct request *rq)
 {
 	struct request_queue *q = rq->q;
 
@@ -411,16 +411,6 @@ static void blk_mq_start_request(struct request *rq, bool last)
 		 */
 		rq->nr_phys_segments++;
 	}
-
-	/*
-	 * Flag the last request in the series so that drivers know when IO
-	 * should be kicked off, if they don't do it on a per-request basis.
-	 *
-	 * Note: the flag isn't the only condition drivers should do kick off.
-	 * If drive is busy, the last request might not have the bit set.
-	 */
-	if (last)
-		rq->cmd_flags |= REQ_END;
 }
 
 static void __blk_mq_requeue_request(struct request *rq)
@@ -430,8 +420,6 @@ static void __blk_mq_requeue_request(struct request *rq)
 	trace_block_rq_requeue(q, rq);
 	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 
-	rq->cmd_flags &= ~REQ_END;
-
 	if (q->dma_drain_size && blk_rq_bytes(rq))
 		rq->nr_phys_segments--;
 }
@@ -741,9 +729,9 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 		rq = list_first_entry(&rq_list, struct request, queuelist);
 		list_del_init(&rq->queuelist);
 
-		blk_mq_start_request(rq, list_empty(&rq_list));
+		blk_mq_start_request(rq);
 
-		ret = q->mq_ops->queue_rq(hctx, rq);
+		ret = q->mq_ops->queue_rq(hctx, rq, list_empty(&rq_list));
 		switch (ret) {
 		case BLK_MQ_RQ_QUEUE_OK:
 			queued++;
@@ -1189,14 +1177,14 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		int ret;
 
 		blk_mq_bio_to_request(rq, bio);
-		blk_mq_start_request(rq, true);
+		blk_mq_start_request(rq);
 
 		/*
 		 * For OK queue, we are done. For error, kill it. Any other
 		 * error (busy), just add it to our list as we previously
 		 * would have done
 		 */
-		ret = q->mq_ops->queue_rq(data.hctx, rq);
+		ret = q->mq_ops->queue_rq(data.hctx, rq, true);
 		if (ret == BLK_MQ_RQ_QUEUE_OK)
 			goto done;
 		else {
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 5c8e7fe..0e2084f 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -3775,7 +3775,8 @@ static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx,
 	return false;
 }
 
-static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
+static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq,
+		bool last)
 {
 	int ret;
 
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 00d469c..c5b7315 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -313,7 +313,8 @@ static void null_request_fn(struct request_queue *q)
 	}
 }
 
-static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
+static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq,
+		bool last)
 {
 	struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);
 
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 0a58140..13756e0 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -164,14 +164,14 @@ static void virtblk_done(struct virtqueue *vq)
 	spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
 }
 
-static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
+static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req,
+		bool last)
 {
 	struct virtio_blk *vblk = hctx->queue->queuedata;
 	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
 	unsigned long flags;
 	unsigned int num;
 	int qid = hctx->queue_num;
-	const bool last = (req->cmd_flags & REQ_END) != 0;
 	int err;
 	bool notify = false;
 
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index d837dc1..1ec00ba 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1855,7 +1855,8 @@ static void scsi_mq_done(struct scsi_cmnd *cmd)
 	blk_mq_complete_request(cmd->request);
 }
 
-static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
+static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req,
+		bool last)
 {
 	struct request_queue *q = req->q;
 	struct scsi_device *sdev = q->queuedata;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index a1e31f2..9c4e306 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -77,7 +77,7 @@ struct blk_mq_tag_set {
 	struct list_head	tag_list;
 };
 
-typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *);
+typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *, bool);
 typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int);
 typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
 typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 66c2167..bb7d664 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -188,7 +188,6 @@ enum rq_flag_bits {
 	__REQ_MIXED_MERGE,	/* merge of different types, fail separately */
 	__REQ_KERNEL, 		/* direct IO to kernel pages */
 	__REQ_PM,		/* runtime pm request */
-	__REQ_END,		/* last of chain of requests */
 	__REQ_HASHED,		/* on IO scheduler merge hash */
 	__REQ_MQ_INFLIGHT,	/* track inflight for MQ */
 	__REQ_NR_BITS,		/* stops here */
@@ -242,7 +241,6 @@ enum rq_flag_bits {
 #define REQ_SECURE		(1ULL << __REQ_SECURE)
 #define REQ_KERNEL		(1ULL << __REQ_KERNEL)
 #define REQ_PM			(1ULL << __REQ_PM)
-#define REQ_END			(1ULL << __REQ_END)
 #define REQ_HASHED		(1ULL << __REQ_HASHED)
 #define REQ_MQ_INFLIGHT		(1ULL << __REQ_MQ_INFLIGHT)
 
-- 
1.9.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 2/6] blk-mq: call blk_mq_start_request from ->queue_rq
  2014-09-13 23:40 blk-mq timeout handling fixes Christoph Hellwig
  2014-09-13 23:40 ` [PATCH 1/6] blk-mq: remove REQ_END Christoph Hellwig
@ 2014-09-13 23:40 ` Christoph Hellwig
  2014-09-15  6:34   ` Ming Lei
  2014-09-15  7:27   ` Ming Lei
  2014-09-13 23:40 ` [PATCH 3/6] blk-mq: rename blk_mq_end_io to blk_mq_end_request Christoph Hellwig
                   ` (4 subsequent siblings)
  6 siblings, 2 replies; 11+ messages in thread
From: Christoph Hellwig @ 2014-09-13 23:40 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Robert Elliott, linux-scsi, linux-kernel

When we call blk_mq_start_request from the core blk-mq code before calling into
->queue_rq there is a racy window where the timeout handler can hit before we've
fully set up the driver specific part of the command.

Move the call to blk_mq_start_request into the driver so the driver can start
the request only once it is fully set up.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
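For illustration (the foo_* names are hypothetical): a driver now arms the
timer itself once its per-request data is valid, and it may bail out before
ever starting the request, which is why __blk_mq_requeue_request below only
drops the dma drain accounting if STARTED was actually set:

static int foo_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq,
		bool last)
{
	struct foo_cmd *cmd = blk_mq_rq_to_pdu(rq);

	if (foo_out_of_resources(hctx))
		return BLK_MQ_RQ_QUEUE_BUSY;	/* never started, no timer armed */

	foo_setup_cmd(cmd, rq);			/* driver-private state first ... */
	blk_mq_start_request(rq);		/* ... only then can a timeout fire */

	if (foo_hw_submit(hctx, cmd))
		return BLK_MQ_RQ_QUEUE_ERROR;
	return BLK_MQ_RQ_QUEUE_OK;
}
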
 block/blk-mq.c                    | 13 ++++++-------
 drivers/block/mtip32xx/mtip32xx.c |  2 ++
 drivers/block/null_blk.c          |  2 ++
 drivers/block/virtio_blk.c        |  2 ++
 drivers/scsi/scsi_lib.c           |  1 +
 include/linux/blk-mq.h            |  1 +
 6 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 42c94c8..c3333ab 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -380,7 +380,7 @@ void blk_mq_complete_request(struct request *rq)
 }
 EXPORT_SYMBOL(blk_mq_complete_request);
 
-static void blk_mq_start_request(struct request *rq)
+void blk_mq_start_request(struct request *rq)
 {
 	struct request_queue *q = rq->q;
 
@@ -412,16 +412,18 @@ static void blk_mq_start_request(struct request *rq)
 		rq->nr_phys_segments++;
 	}
 }
+EXPORT_SYMBOL(blk_mq_start_request);
 
 static void __blk_mq_requeue_request(struct request *rq)
 {
 	struct request_queue *q = rq->q;
 
 	trace_block_rq_requeue(q, rq);
-	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 
-	if (q->dma_drain_size && blk_rq_bytes(rq))
-		rq->nr_phys_segments--;
+	if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
+		if (q->dma_drain_size && blk_rq_bytes(rq))
+			rq->nr_phys_segments--;
+	}
 }
 
 void blk_mq_requeue_request(struct request *rq)
@@ -729,8 +731,6 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 		rq = list_first_entry(&rq_list, struct request, queuelist);
 		list_del_init(&rq->queuelist);
 
-		blk_mq_start_request(rq);
-
 		ret = q->mq_ops->queue_rq(hctx, rq, list_empty(&rq_list));
 		switch (ret) {
 		case BLK_MQ_RQ_QUEUE_OK:
@@ -1177,7 +1177,6 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		int ret;
 
 		blk_mq_bio_to_request(rq, bio);
-		blk_mq_start_request(rq);
 
 		/*
 		 * For OK queue, we are done. For error, kill it. Any other
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 0e2084f..4042440 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -3783,6 +3783,8 @@ static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq,
 	if (unlikely(mtip_check_unal_depth(hctx, rq)))
 		return BLK_MQ_RQ_QUEUE_BUSY;
 
+	blk_mq_start_request(rq);
+
 	ret = mtip_submit_request(hctx, rq);
 	if (likely(!ret))
 		return BLK_MQ_RQ_QUEUE_OK;
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index c5b7315..332ce20 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -321,6 +321,8 @@ static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq,
 	cmd->rq = rq;
 	cmd->nq = hctx->driver_data;
 
+	blk_mq_start_request(rq);
+
 	null_handle_cmd(cmd);
 	return BLK_MQ_RQ_QUEUE_OK;
 }
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 13756e0..83816bf 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -205,6 +205,8 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req,
 		}
 	}
 
+	blk_mq_start_request(req);
+
 	num = blk_rq_map_sg(hctx->queue, vbr->req, vbr->sg);
 	if (num) {
 		if (rq_data_dir(vbr->req) == WRITE)
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 1ec00ba..b06a355 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1890,6 +1890,7 @@ static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req,
 	scsi_init_cmd_errh(cmd);
 	cmd->scsi_done = scsi_mq_done;
 
+	blk_mq_start_request(req);
 	reason = scsi_dispatch_cmd(cmd);
 	if (reason) {
 		scsi_set_blocked(cmd, reason);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 9c4e306..878b6f7 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -159,6 +159,7 @@ struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
 struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
 struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int);
 
+void blk_mq_start_request(struct request *rq);
 void blk_mq_end_io(struct request *rq, int error);
 void __blk_mq_end_io(struct request *rq, int error);
 
-- 
1.9.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 3/6] blk-mq: rename blk_mq_end_io to blk_mq_end_request
  2014-09-13 23:40 blk-mq timeout handling fixes Christoph Hellwig
  2014-09-13 23:40 ` [PATCH 1/6] blk-mq: remove REQ_END Christoph Hellwig
  2014-09-13 23:40 ` [PATCH 2/6] blk-mq: call blk_mq_start_request from ->queue_rq Christoph Hellwig
@ 2014-09-13 23:40 ` Christoph Hellwig
  2014-09-13 23:40 ` [PATCH 4/6] blk-mq: fix and simplify tag iteration for the timeout handler Christoph Hellwig
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 11+ messages in thread
From: Christoph Hellwig @ 2014-09-13 23:40 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Robert Elliott, linux-scsi, linux-kernel

Now that we've changed the driver API on the submission side, use the
opportunity to fix up the name on the completion side to fit into the
general scheme.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-flush.c                 |  4 ++--
 block/blk-mq.c                    | 16 ++++++++--------
 drivers/block/mtip32xx/mtip32xx.c |  4 ++--
 drivers/block/null_blk.c          |  2 +-
 drivers/block/virtio_blk.c        |  2 +-
 drivers/scsi/scsi_lib.c           |  4 ++--
 include/linux/blk-mq.h            |  4 ++--
 7 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 3cb5e9e..698e692 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -202,7 +202,7 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
 		list_del_init(&rq->flush.list);
 		blk_flush_restore_request(rq);
 		if (q->mq_ops)
-			blk_mq_end_io(rq, error);
+			blk_mq_end_request(rq, error);
 		else
 			__blk_end_request_all(rq, error);
 		break;
@@ -378,7 +378,7 @@ void blk_insert_flush(struct request *rq)
 	 */
 	if (!policy) {
 		if (q->mq_ops)
-			blk_mq_end_io(rq, 0);
+			blk_mq_end_request(rq, 0);
 		else
 			__blk_end_bidi_request(rq, 0, 0, 0);
 		return;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c3333ab..d1230ea 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -296,7 +296,7 @@ void blk_mq_clone_flush_request(struct request *flush_rq,
 		hctx->cmd_size);
 }
 
-inline void __blk_mq_end_io(struct request *rq, int error)
+inline void __blk_mq_end_request(struct request *rq, int error)
 {
 	blk_account_io_done(rq);
 
@@ -308,15 +308,15 @@ inline void __blk_mq_end_io(struct request *rq, int error)
 		blk_mq_free_request(rq);
 	}
 }
-EXPORT_SYMBOL(__blk_mq_end_io);
+EXPORT_SYMBOL(__blk_mq_end_request);
 
-void blk_mq_end_io(struct request *rq, int error)
+void blk_mq_end_request(struct request *rq, int error)
 {
 	if (blk_update_request(rq, error, blk_rq_bytes(rq)))
 		BUG();
-	__blk_mq_end_io(rq, error);
+	__blk_mq_end_request(rq, error);
 }
-EXPORT_SYMBOL(blk_mq_end_io);
+EXPORT_SYMBOL(blk_mq_end_request);
 
 static void __blk_mq_complete_request_remote(void *data)
 {
@@ -356,7 +356,7 @@ void __blk_mq_complete_request(struct request *rq)
 	struct request_queue *q = rq->q;
 
 	if (!q->softirq_done_fn)
-		blk_mq_end_io(rq, rq->errors);
+		blk_mq_end_request(rq, rq->errors);
 	else
 		blk_mq_ipi_complete_request(rq);
 }
@@ -744,7 +744,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 			pr_err("blk-mq: bad return on queue: %d\n", ret);
 		case BLK_MQ_RQ_QUEUE_ERROR:
 			rq->errors = -EIO;
-			blk_mq_end_io(rq, rq->errors);
+			blk_mq_end_request(rq, rq->errors);
 			break;
 		}
 
@@ -1191,7 +1191,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 
 			if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
 				rq->errors = -EIO;
-				blk_mq_end_io(rq, rq->errors);
+				blk_mq_end_request(rq, rq->errors);
 				goto done;
 			}
 		}
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 4042440..6b7e8d0 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -247,7 +247,7 @@ static void mtip_async_complete(struct mtip_port *port,
 	if (unlikely(cmd->unaligned))
 		up(&port->cmd_slot_unal);
 
-	blk_mq_end_io(rq, status ? -EIO : 0);
+	blk_mq_end_request(rq, status ? -EIO : 0);
 }
 
 /*
@@ -3739,7 +3739,7 @@ static int mtip_submit_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
 		int err;
 
 		err = mtip_send_trim(dd, blk_rq_pos(rq), blk_rq_sectors(rq));
-		blk_mq_end_io(rq, err);
+		blk_mq_end_request(rq, err);
 		return 0;
 	}
 
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 332ce20..ac50a29 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -177,7 +177,7 @@ static void end_cmd(struct nullb_cmd *cmd)
 {
 	switch (queue_mode)  {
 	case NULL_Q_MQ:
-		blk_mq_end_io(cmd->rq, 0);
+		blk_mq_end_request(cmd->rq, 0);
 		return;
 	case NULL_Q_RQ:
 		INIT_LIST_HEAD(&cmd->rq->queuelist);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 83816bf..f751fc3 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -135,7 +135,7 @@ static inline void virtblk_request_done(struct request *req)
 		req->errors = (error != 0);
 	}
 
-	blk_mq_end_io(req, error);
+	blk_mq_end_request(req, error);
 }
 
 static void virtblk_done(struct virtqueue *vq)
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index b06a355..e95adaa 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -713,7 +713,7 @@ static bool scsi_end_request(struct request *req, int error,
 
 	if (req->mq_ctx) {
 		/*
-		 * In the MQ case the command gets freed by __blk_mq_end_io,
+		 * In the MQ case the command gets freed by __blk_mq_end_request,
 		 * so we have to do all cleanup that depends on it earlier.
 		 *
 		 * We also can't kick the queues from irq context, so we
@@ -721,7 +721,7 @@ static bool scsi_end_request(struct request *req, int error,
 		 */
 		scsi_mq_uninit_cmd(cmd);
 
-		__blk_mq_end_io(req, error);
+		__blk_mq_end_request(req, error);
 
 		if (scsi_target(sdev)->single_lun ||
 		    !list_empty(&sdev->host->starved_list))
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 878b6f7..cb217c1 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -160,8 +160,8 @@ struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_ind
 struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int);
 
 void blk_mq_start_request(struct request *rq);
-void blk_mq_end_io(struct request *rq, int error);
-void __blk_mq_end_io(struct request *rq, int error);
+void blk_mq_end_request(struct request *rq, int error);
+void __blk_mq_end_request(struct request *rq, int error);
 
 void blk_mq_requeue_request(struct request *rq);
 void blk_mq_add_to_requeue_list(struct request *rq, bool at_head);
-- 
1.9.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 4/6] blk-mq: fix and simplify tag iteration for the timeout handler
  2014-09-13 23:40 blk-mq timeout handling fixes Christoph Hellwig
                   ` (2 preceding siblings ...)
  2014-09-13 23:40 ` [PATCH 3/6] blk-mq: rename blk_mq_end_io to blk_mq_end_request Christoph Hellwig
@ 2014-09-13 23:40 ` Christoph Hellwig
  2014-09-13 23:40 ` [PATCH 5/6] blk-mq: unshared " Christoph Hellwig
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 11+ messages in thread
From: Christoph Hellwig @ 2014-09-13 23:40 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Robert Elliott, linux-scsi, linux-kernel

Don't do a kmalloc from the timer handler to deal with timeouts; chances
are we could be under heavy load or similar and thus just miss out on the
timeouts.  Fortunately it is very easy to just iterate over all in-use
tags, and doing this properly actually cleans up the blk_mq_tag_busy_iter
API as well, and prepares us for the next patch by passing a reserved
argument to the iterator.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
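For illustration, a minimal sketch of the reworked iterator from a caller's
point of view: the callback runs for every tag currently in use on the hctx
(reserved tags first), and callers that only care about started requests
check REQ_ATOM_STARTED themselves, as the timeout code below does.  The
foo_* names are hypothetical:

static void foo_count_busy(struct blk_mq_hw_ctx *hctx, struct request *rq,
		void *priv, bool reserved)
{
	unsigned int *busy = priv;

	(*busy)++;		/* rq currently holds a tag on this hctx */
}

static unsigned int foo_busy_tags(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int busy = 0;
	int i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_tag_busy_iter(hctx, foo_count_busy, &busy);

	return busy;
}
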
 block/blk-mq-tag.c      | 46 +++++++++++---------------
 block/blk-mq.c          | 85 +++++++++++++++----------------------------------
 include/linux/blk-mq.h  |  6 +++-
 include/scsi/scsi_tcq.h |  2 +-
 4 files changed, 50 insertions(+), 89 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index c1b9242..d6ea458 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -392,45 +392,37 @@ void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag,
 		__blk_mq_put_reserved_tag(tags, tag);
 }
 
-static void bt_for_each_free(struct blk_mq_bitmap_tags *bt,
-			     unsigned long *free_map, unsigned int off)
+static void bt_for_each(struct blk_mq_hw_ctx *hctx,
+		struct blk_mq_bitmap_tags *bt, unsigned int off,
+		busy_iter_fn *fn, void *data, bool reserved)
 {
-	int i;
+	struct request *rq;
+	int bit, i;
 
 	for (i = 0; i < bt->map_nr; i++) {
 		struct blk_align_bitmap *bm = &bt->map[i];
-		int bit = 0;
-
-		do {
-			bit = find_next_zero_bit(&bm->word, bm->depth, bit);
-			if (bit >= bm->depth)
-				break;
 
-			__set_bit(bit + off, free_map);
-			bit++;
-		} while (1);
+		for (bit = find_first_bit(&bm->word, bm->depth);
+		     bit < bm->depth;
+		     bit = find_next_bit(&bm->word, bm->depth, bit + 1)) {
+		     	rq = blk_mq_tag_to_rq(hctx->tags, off + bit);
+			if (rq->q == hctx->queue)
+				fn(hctx, rq, data, reserved);
+		}
 
 		off += (1 << bt->bits_per_word);
 	}
 }
 
-void blk_mq_tag_busy_iter(struct blk_mq_tags *tags,
-			  void (*fn)(void *, unsigned long *), void *data)
+void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
+		void *priv)
 {
-	unsigned long *tag_map;
-	size_t map_size;
-
-	map_size = ALIGN(tags->nr_tags, BITS_PER_LONG) / BITS_PER_LONG;
-	tag_map = kzalloc(map_size * sizeof(unsigned long), GFP_ATOMIC);
-	if (!tag_map)
-		return;
-
-	bt_for_each_free(&tags->bitmap_tags, tag_map, tags->nr_reserved_tags);
-	if (tags->nr_reserved_tags)
-		bt_for_each_free(&tags->breserved_tags, tag_map, 0);
+	struct blk_mq_tags *tags = hctx->tags;
 
-	fn(data, tag_map);
-	kfree(tag_map);
+ 	if (tags->nr_reserved_tags)
+		bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true);
+	bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv,
+			false);
 }
 EXPORT_SYMBOL(blk_mq_tag_busy_iter);
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index d1230ea..2d96b0d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -511,58 +511,6 @@ struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 }
 EXPORT_SYMBOL(blk_mq_tag_to_rq);
 
-struct blk_mq_timeout_data {
-	struct blk_mq_hw_ctx *hctx;
-	unsigned long *next;
-	unsigned int *next_set;
-};
-
-static void blk_mq_timeout_check(void *__data, unsigned long *free_tags)
-{
-	struct blk_mq_timeout_data *data = __data;
-	struct blk_mq_hw_ctx *hctx = data->hctx;
-	unsigned int tag;
-
-	 /* It may not be in flight yet (this is where
-	 * the REQ_ATOMIC_STARTED flag comes in). The requests are
-	 * statically allocated, so we know it's always safe to access the
-	 * memory associated with a bit offset into ->rqs[].
-	 */
-	tag = 0;
-	do {
-		struct request *rq;
-
-		tag = find_next_zero_bit(free_tags, hctx->tags->nr_tags, tag);
-		if (tag >= hctx->tags->nr_tags)
-			break;
-
-		rq = blk_mq_tag_to_rq(hctx->tags, tag++);
-		if (rq->q != hctx->queue)
-			continue;
-		if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
-			continue;
-
-		blk_rq_check_expired(rq, data->next, data->next_set);
-	} while (1);
-}
-
-static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx,
-					unsigned long *next,
-					unsigned int *next_set)
-{
-	struct blk_mq_timeout_data data = {
-		.hctx		= hctx,
-		.next		= next,
-		.next_set	= next_set,
-	};
-
-	/*
-	 * Ask the tagging code to iterate busy requests, so we can
-	 * check them for timeout.
-	 */
-	blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data);
-}
-
 static enum blk_eh_timer_return blk_mq_rq_timed_out(struct request *rq)
 {
 	struct request_queue *q = rq->q;
@@ -584,13 +532,30 @@ static enum blk_eh_timer_return blk_mq_rq_timed_out(struct request *rq)
 
 	return q->mq_ops->timeout(rq);
 }
+		
+struct blk_mq_timeout_data {
+	unsigned long next;
+	unsigned int next_set;
+};
+
+static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
+		struct request *rq, void *priv, bool reserved)
+{
+	struct blk_mq_timeout_data *data = priv;
+
+	if (test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
+		blk_rq_check_expired(rq, &data->next, &data->next_set);
+}
 
-static void blk_mq_rq_timer(unsigned long data)
+static void blk_mq_rq_timer(unsigned long priv)
 {
-	struct request_queue *q = (struct request_queue *) data;
+	struct request_queue *q = (struct request_queue *)priv;
+	struct blk_mq_timeout_data data = {
+		.next		= 0,
+		.next_set	= 0,
+	};
 	struct blk_mq_hw_ctx *hctx;
-	unsigned long next = 0;
-	int i, next_set = 0;
+	int i;
 
 	queue_for_each_hw_ctx(q, hctx, i) {
 		/*
@@ -600,12 +565,12 @@ static void blk_mq_rq_timer(unsigned long data)
 		if (!hctx->nr_ctx || !hctx->tags)
 			continue;
 
-		blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);
+		blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data);
 	}
 
-	if (next_set) {
-		next = blk_rq_timeout(round_jiffies_up(next));
-		mod_timer(&q->timeout, next);
+	if (data.next_set) {
+		data.next = blk_rq_timeout(round_jiffies_up(data.next));
+		mod_timer(&q->timeout, data.next);
 	} else {
 		queue_for_each_hw_ctx(q, hctx, i)
 			blk_mq_tag_idle(hctx);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index cb217c1..0eb0f64 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -86,6 +86,9 @@ typedef int (init_request_fn)(void *, struct request *, unsigned int,
 typedef void (exit_request_fn)(void *, struct request *, unsigned int,
 		unsigned int);
 
+typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
+		bool);
+
 struct blk_mq_ops {
 	/*
 	 * Queue request
@@ -174,7 +177,8 @@ void blk_mq_stop_hw_queues(struct request_queue *q);
 void blk_mq_start_hw_queues(struct request_queue *q);
 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
-void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data);
+void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
+		void *priv);
 
 /*
  * Driver command data is immediately after the request. So subtract request
diff --git a/include/scsi/scsi_tcq.h b/include/scsi/scsi_tcq.h
index cdcc90b..e645835 100644
--- a/include/scsi/scsi_tcq.h
+++ b/include/scsi/scsi_tcq.h
@@ -68,7 +68,7 @@ static inline void scsi_activate_tcq(struct scsi_device *sdev, int depth)
 		return;
 
 	if (!shost_use_blk_mq(sdev->host) &&
-	    blk_queue_tagged(sdev->request_queue))
+	    !blk_queue_tagged(sdev->request_queue))
 		blk_queue_init_tags(sdev->request_queue, depth,
 				    sdev->host->bqt);
 
-- 
1.9.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 5/6] blk-mq: unshared timeout handler
  2014-09-13 23:40 blk-mq timeout handling fixes Christoph Hellwig
                   ` (3 preceding siblings ...)
  2014-09-13 23:40 ` [PATCH 4/6] blk-mq: fix and simplify tag iteration for the timeout handler Christoph Hellwig
@ 2014-09-13 23:40 ` Christoph Hellwig
  2014-09-13 23:40 ` [PATCH 6/6] blk-mq: pass a reserved argument to the " Christoph Hellwig
  2014-09-17 21:53 ` blk-mq timeout handling fixes Elliott, Robert (Server Storage)
  6 siblings, 0 replies; 11+ messages in thread
From: Christoph Hellwig @ 2014-09-13 23:40 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Robert Elliott, linux-scsi, linux-kernel

Duplicate the (small) timeout handler in blk-mq so that we can pass
arguments more easily to the driver timeout handler.  This enables
the next patch.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-mq.c      | 53 +++++++++++++++++++++++++++++++++++++----------------
 block/blk-timeout.c |  8 ++------
 block/blk.h         |  2 --
 3 files changed, 39 insertions(+), 24 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2d96b0d..c14d397 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -511,9 +511,15 @@ struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 }
 EXPORT_SYMBOL(blk_mq_tag_to_rq);
 
-static enum blk_eh_timer_return blk_mq_rq_timed_out(struct request *rq)
+struct blk_mq_timeout_data {
+	unsigned long next;
+	unsigned int next_set;
+};
+
+static void blk_mq_rq_timed_out(struct request *req)
 {
-	struct request_queue *q = rq->q;
+	struct blk_mq_ops *ops = req->q->mq_ops;
+	enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
 
 	/*
 	 * We know that complete is set at this point. If STARTED isn't set
@@ -524,27 +530,43 @@ static enum blk_eh_timer_return blk_mq_rq_timed_out(struct request *rq)
 	 * we both flags will get cleared. So check here again, and ignore
 	 * a timeout event with a request that isn't active.
 	 */
-	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
-		return BLK_EH_NOT_HANDLED;
-
-	if (!q->mq_ops->timeout)
-		return BLK_EH_RESET_TIMER;
+	if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
+		return;
 
-	return q->mq_ops->timeout(rq);
+	if (ops->timeout)
+		ret = ops->timeout(req);
+
+	switch (ret) {
+	case BLK_EH_HANDLED:
+		__blk_mq_complete_request(req);
+		break;
+	case BLK_EH_RESET_TIMER:
+		blk_add_timer(req);
+		blk_clear_rq_complete(req);
+		break;
+	case BLK_EH_NOT_HANDLED:
+		break;
+	default:
+		printk(KERN_ERR "block: bad eh return: %d\n", ret);
+		break;
+	}
 }
 		
-struct blk_mq_timeout_data {
-	unsigned long next;
-	unsigned int next_set;
-};
-
 static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
 		struct request *rq, void *priv, bool reserved)
 {
 	struct blk_mq_timeout_data *data = priv;
 
-	if (test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
-		blk_rq_check_expired(rq, &data->next, &data->next_set);
+	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
+		return;
+
+	if (time_after_eq(jiffies, rq->deadline)) {
+		if (!blk_mark_rq_complete(rq))
+			blk_mq_rq_timed_out(rq);
+	} else if (!data->next_set || time_after(data->next, rq->deadline)) {
+		data->next = rq->deadline;
+		data->next_set = 1;
+	}
 }
 
 static void blk_mq_rq_timer(unsigned long priv)
@@ -1770,7 +1792,6 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 	else
 		blk_queue_make_request(q, blk_sq_make_request);
 
-	blk_queue_rq_timed_out(q, blk_mq_rq_timed_out);
 	if (set->timeout)
 		blk_queue_rq_timeout(q, set->timeout);
 
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 95a0959..4d44825 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -7,7 +7,6 @@
 #include <linux/fault-inject.h>
 
 #include "blk.h"
-#include "blk-mq.h"
 
 #ifdef CONFIG_FAIL_IO_TIMEOUT
 
@@ -90,10 +89,7 @@ static void blk_rq_timed_out(struct request *req)
 	switch (ret) {
 	case BLK_EH_HANDLED:
 		/* Can we use req->errors here? */
-		if (q->mq_ops)
-			__blk_mq_complete_request(req);
-		else
-			__blk_complete_request(req);
+		__blk_complete_request(req);
 		break;
 	case BLK_EH_RESET_TIMER:
 		blk_add_timer(req);
@@ -113,7 +109,7 @@ static void blk_rq_timed_out(struct request *req)
 	}
 }
 
-void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
+static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
 			  unsigned int *next_set)
 {
 	if (time_after_eq(jiffies, rq->deadline)) {
diff --git a/block/blk.h b/block/blk.h
index 6748c4f..e515a28 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -38,8 +38,6 @@ bool __blk_end_bidi_request(struct request *rq, int error,
 			    unsigned int nr_bytes, unsigned int bidi_bytes);
 
 void blk_rq_timed_out_timer(unsigned long data);
-void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
-			  unsigned int *next_set);
 unsigned long blk_rq_timeout(unsigned long timeout);
 void blk_add_timer(struct request *req);
 void blk_delete_timer(struct request *);
-- 
1.9.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 6/6] blk-mq: pass a reserved argument to the timeout handler
  2014-09-13 23:40 blk-mq timeout handling fixes Christoph Hellwig
                   ` (4 preceding siblings ...)
  2014-09-13 23:40 ` [PATCH 5/6] blk-mq: unshared " Christoph Hellwig
@ 2014-09-13 23:40 ` Christoph Hellwig
  2014-09-17 21:53 ` blk-mq timeout handling fixes Elliott, Robert (Server Storage)
  6 siblings, 0 replies; 11+ messages in thread
From: Christoph Hellwig @ 2014-09-13 23:40 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Robert Elliott, linux-scsi, linux-kernel

Allow blk-mq to pass an argument to the timeout handler to indicate
whether we're timing out a reserved or a regular command.  For many
drivers those need to be handled differently.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-mq.c          |  6 +++---
 drivers/scsi/scsi_lib.c | 10 +++++++++-
 include/linux/blk-mq.h  |  3 ++-
 3 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index c14d397..85eb540 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -516,7 +516,7 @@ struct blk_mq_timeout_data {
 	unsigned int next_set;
 };
 
-static void blk_mq_rq_timed_out(struct request *req)
+static void blk_mq_rq_timed_out(struct request *req, bool reserved)
 {
 	struct blk_mq_ops *ops = req->q->mq_ops;
 	enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
@@ -534,7 +534,7 @@ static void blk_mq_rq_timed_out(struct request *req)
 		return;
 
 	if (ops->timeout)
-		ret = ops->timeout(req);
+		ret = ops->timeout(req, reserved);
 
 	switch (ret) {
 	case BLK_EH_HANDLED:
@@ -562,7 +562,7 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
 
 	if (time_after_eq(jiffies, rq->deadline)) {
 		if (!blk_mark_rq_complete(rq))
-			blk_mq_rq_timed_out(rq);
+			blk_mq_rq_timed_out(rq, reserved);
 	} else if (!data->next_set || time_after(data->next, rq->deadline)) {
 		data->next = rq->deadline;
 		data->next_set = 1;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index e95adaa..dc01ab2 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1932,6 +1932,14 @@ out:
 	return ret;
 }
 
+static enum blk_eh_timer_return scsi_timeout(struct request *req,
+		bool reserved)
+{
+	if (reserved)
+		return BLK_EH_RESET_TIMER;
+	return scsi_times_out(req);
+}
+
 static int scsi_init_request(void *data, struct request *rq,
 		unsigned int hctx_idx, unsigned int request_idx,
 		unsigned int numa_node)
@@ -2043,7 +2051,7 @@ static struct blk_mq_ops scsi_mq_ops = {
 	.map_queue	= blk_mq_map_queue,
 	.queue_rq	= scsi_queue_rq,
 	.complete	= scsi_softirq_done,
-	.timeout	= scsi_times_out,
+	.timeout	= scsi_timeout,
 	.init_request	= scsi_init_request,
 	.exit_request	= scsi_exit_request,
 };
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 0eb0f64..3253495 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -79,6 +79,7 @@ struct blk_mq_tag_set {
 
 typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *, bool);
 typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int);
+typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool);
 typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
 typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
 typedef int (init_request_fn)(void *, struct request *, unsigned int,
@@ -103,7 +104,7 @@ struct blk_mq_ops {
 	/*
 	 * Called on request timeout
 	 */
-	rq_timed_out_fn		*timeout;
+	timeout_fn		*timeout;
 
 	softirq_done_fn		*complete;
 
-- 
1.9.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH 2/6] blk-mq: call blk_mq_start_request from ->queue_rq
  2014-09-13 23:40 ` [PATCH 2/6] blk-mq: call blk_mq_start_request from ->queue_rq Christoph Hellwig
@ 2014-09-15  6:34   ` Ming Lei
  2014-09-15  7:27   ` Ming Lei
  1 sibling, 0 replies; 11+ messages in thread
From: Ming Lei @ 2014-09-15  6:34 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Robert Elliott, Linux SCSI List, Linux Kernel Mailing List

On Sun, Sep 14, 2014 at 7:40 AM, Christoph Hellwig <hch@lst.de> wrote:
> When we call blk_mq_start_request from the core blk-mq code before calling into
> ->queue_rq there is a racy window where the timeout handler can hit before we've
> fully set up the driver specific part of the command.

This is quite difficult to hit since the default timeout is 30s, which
is long enough.

Suppose it does happen: what is the problem with the request's pdu not
being set up yet, given that the request can only be completed once
thanks to the REQ_ATOM_COMPLETE flag?

>
> Move the call to blk_mq_start_request into the driver so the driver can start
> the request only once it is fully set up.

The change isn't friendly to drivers, since every driver now needs to do
this, and it should have been done in the blk-mq core code.

Also, the timeout handler may still see the request's pdu not set up,
because the writes to the request in blk_mq_start_request() and the
writes to the pdu may be reordered.
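
(For reference, the exclusion relied on here is blk_mark_rq_complete(), a
test_and_set_bit() on REQ_ATOM_COMPLETE, so only one of the timeout path
and the normal completion path can act on the request, as in
blk_rq_check_expired():)

	if (time_after_eq(jiffies, rq->deadline)) {
		if (!blk_mark_rq_complete(rq))
			blk_rq_timed_out(rq);
	}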

>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  block/blk-mq.c                    | 13 ++++++-------
>  drivers/block/mtip32xx/mtip32xx.c |  2 ++
>  drivers/block/null_blk.c          |  2 ++
>  drivers/block/virtio_blk.c        |  2 ++
>  drivers/scsi/scsi_lib.c           |  1 +
>  include/linux/blk-mq.h            |  1 +
>  6 files changed, 14 insertions(+), 7 deletions(-)
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 42c94c8..c3333ab 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -380,7 +380,7 @@ void blk_mq_complete_request(struct request *rq)
>  }
>  EXPORT_SYMBOL(blk_mq_complete_request);
>
> -static void blk_mq_start_request(struct request *rq)
> +void blk_mq_start_request(struct request *rq)
>  {
>         struct request_queue *q = rq->q;
>
> @@ -412,16 +412,18 @@ static void blk_mq_start_request(struct request *rq)
>                 rq->nr_phys_segments++;
>         }
>  }
> +EXPORT_SYMBOL(blk_mq_start_request);
>
>  static void __blk_mq_requeue_request(struct request *rq)
>  {
>         struct request_queue *q = rq->q;
>
>         trace_block_rq_requeue(q, rq);
> -       clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
>
> -       if (q->dma_drain_size && blk_rq_bytes(rq))
> -               rq->nr_phys_segments--;
> +       if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
> +               if (q->dma_drain_size && blk_rq_bytes(rq))
> +                       rq->nr_phys_segments--;
> +       }
>  }
>
>  void blk_mq_requeue_request(struct request *rq)
> @@ -729,8 +731,6 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
>                 rq = list_first_entry(&rq_list, struct request, queuelist);
>                 list_del_init(&rq->queuelist);
>
> -               blk_mq_start_request(rq);
> -
>                 ret = q->mq_ops->queue_rq(hctx, rq, list_empty(&rq_list));
>                 switch (ret) {
>                 case BLK_MQ_RQ_QUEUE_OK:
> @@ -1177,7 +1177,6 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
>                 int ret;
>
>                 blk_mq_bio_to_request(rq, bio);
> -               blk_mq_start_request(rq);
>
>                 /*
>                  * For OK queue, we are done. For error, kill it. Any other
> diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
> index 0e2084f..4042440 100644
> --- a/drivers/block/mtip32xx/mtip32xx.c
> +++ b/drivers/block/mtip32xx/mtip32xx.c
> @@ -3783,6 +3783,8 @@ static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq,
>         if (unlikely(mtip_check_unal_depth(hctx, rq)))
>                 return BLK_MQ_RQ_QUEUE_BUSY;
>
> +       blk_mq_start_request(rq);
> +
>         ret = mtip_submit_request(hctx, rq);
>         if (likely(!ret))
>                 return BLK_MQ_RQ_QUEUE_OK;
> diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
> index c5b7315..332ce20 100644
> --- a/drivers/block/null_blk.c
> +++ b/drivers/block/null_blk.c
> @@ -321,6 +321,8 @@ static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq,
>         cmd->rq = rq;
>         cmd->nq = hctx->driver_data;
>
> +       blk_mq_start_request(rq);
> +
>         null_handle_cmd(cmd);
>         return BLK_MQ_RQ_QUEUE_OK;
>  }
> diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
> index 13756e0..83816bf 100644
> --- a/drivers/block/virtio_blk.c
> +++ b/drivers/block/virtio_blk.c
> @@ -205,6 +205,8 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req,
>                 }
>         }
>
> +       blk_mq_start_request(req);
> +
>         num = blk_rq_map_sg(hctx->queue, vbr->req, vbr->sg);
>         if (num) {
>                 if (rq_data_dir(vbr->req) == WRITE)
> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> index 1ec00ba..b06a355 100644
> --- a/drivers/scsi/scsi_lib.c
> +++ b/drivers/scsi/scsi_lib.c
> @@ -1890,6 +1890,7 @@ static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req,
>         scsi_init_cmd_errh(cmd);
>         cmd->scsi_done = scsi_mq_done;
>
> +       blk_mq_start_request(req);
>         reason = scsi_dispatch_cmd(cmd);
>         if (reason) {
>                 scsi_set_blocked(cmd, reason);
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 9c4e306..878b6f7 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -159,6 +159,7 @@ struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
>  struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
>  struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int);
>
> +void blk_mq_start_request(struct request *rq);
>  void blk_mq_end_io(struct request *rq, int error);
>  void __blk_mq_end_io(struct request *rq, int error);
>
> --
> 1.9.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/



-- 
Ming Lei

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 2/6] blk-mq: call blk_mq_start_request from ->queue_rq
  2014-09-13 23:40 ` [PATCH 2/6] blk-mq: call blk_mq_start_request from ->queue_rq Christoph Hellwig
  2014-09-15  6:34   ` Ming Lei
@ 2014-09-15  7:27   ` Ming Lei
  1 sibling, 0 replies; 11+ messages in thread
From: Ming Lei @ 2014-09-15  7:27 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Robert Elliott, Linux SCSI List, Linux Kernel Mailing List

On Sun, Sep 14, 2014 at 7:40 AM, Christoph Hellwig <hch@lst.de> wrote:
> When we call blk_mq_start_request from the core blk-mq code before calling into
> ->queue_rq there is a racy window where the timeout handler can hit before we've
> fully set up the driver specific part of the command.

One problem I thought of is that the write to rq->deadline and the writes
to the following two atomic flags can be reordered, so the timeout may hit
this request too early because the previous deadline of the rq can still be
read in blk_rq_check_expired().

That is, an explicit memory barrier is required in blk_mq_start_request().
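
(As an illustration of the ordering concern; the barrier placement below is
only a sketch, not an actual patch:)

	/* writer side, e.g. in blk_mq_start_request(): */
	rq->deadline = jiffies + rq->q->rq_timeout;
	smp_wmb();				/* order deadline before STARTED */
	set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);

	/* reader side, e.g. in the timeout path: */
	bool expired = false;

	if (test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
		smp_rmb();			/* pairs with the smp_wmb() above */
		expired = time_after_eq(jiffies, rq->deadline);
	}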

>
> Move the call to blk_mq_start_request into the driver so the driver can start
> the request only once it is fully set up.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  block/blk-mq.c                    | 13 ++++++-------
>  drivers/block/mtip32xx/mtip32xx.c |  2 ++
>  drivers/block/null_blk.c          |  2 ++
>  drivers/block/virtio_blk.c        |  2 ++
>  drivers/scsi/scsi_lib.c           |  1 +
>  include/linux/blk-mq.h            |  1 +
>  6 files changed, 14 insertions(+), 7 deletions(-)
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 42c94c8..c3333ab 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -380,7 +380,7 @@ void blk_mq_complete_request(struct request *rq)
>  }
>  EXPORT_SYMBOL(blk_mq_complete_request);
>
> -static void blk_mq_start_request(struct request *rq)
> +void blk_mq_start_request(struct request *rq)
>  {
>         struct request_queue *q = rq->q;
>
> @@ -412,16 +412,18 @@ static void blk_mq_start_request(struct request *rq)
>                 rq->nr_phys_segments++;
>         }
>  }
> +EXPORT_SYMBOL(blk_mq_start_request);
>
>  static void __blk_mq_requeue_request(struct request *rq)
>  {
>         struct request_queue *q = rq->q;
>
>         trace_block_rq_requeue(q, rq);
> -       clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
>
> -       if (q->dma_drain_size && blk_rq_bytes(rq))
> -               rq->nr_phys_segments--;
> +       if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
> +               if (q->dma_drain_size && blk_rq_bytes(rq))
> +                       rq->nr_phys_segments--;
> +       }
>  }
>
>  void blk_mq_requeue_request(struct request *rq)
> @@ -729,8 +731,6 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
>                 rq = list_first_entry(&rq_list, struct request, queuelist);
>                 list_del_init(&rq->queuelist);
>
> -               blk_mq_start_request(rq);
> -
>                 ret = q->mq_ops->queue_rq(hctx, rq, list_empty(&rq_list));
>                 switch (ret) {
>                 case BLK_MQ_RQ_QUEUE_OK:
> @@ -1177,7 +1177,6 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
>                 int ret;
>
>                 blk_mq_bio_to_request(rq, bio);
> -               blk_mq_start_request(rq);
>
>                 /*
>                  * For OK queue, we are done. For error, kill it. Any other
> diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
> index 0e2084f..4042440 100644
> --- a/drivers/block/mtip32xx/mtip32xx.c
> +++ b/drivers/block/mtip32xx/mtip32xx.c
> @@ -3783,6 +3783,8 @@ static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq,
>         if (unlikely(mtip_check_unal_depth(hctx, rq)))
>                 return BLK_MQ_RQ_QUEUE_BUSY;
>
> +       blk_mq_start_request(rq);
> +
>         ret = mtip_submit_request(hctx, rq);
>         if (likely(!ret))
>                 return BLK_MQ_RQ_QUEUE_OK;
> diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
> index c5b7315..332ce20 100644
> --- a/drivers/block/null_blk.c
> +++ b/drivers/block/null_blk.c
> @@ -321,6 +321,8 @@ static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq,
>         cmd->rq = rq;
>         cmd->nq = hctx->driver_data;
>
> +       blk_mq_start_request(rq);
> +
>         null_handle_cmd(cmd);
>         return BLK_MQ_RQ_QUEUE_OK;
>  }
> diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
> index 13756e0..83816bf 100644
> --- a/drivers/block/virtio_blk.c
> +++ b/drivers/block/virtio_blk.c
> @@ -205,6 +205,8 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req,
>                 }
>         }
>
> +       blk_mq_start_request(req);
> +
>         num = blk_rq_map_sg(hctx->queue, vbr->req, vbr->sg);
>         if (num) {
>                 if (rq_data_dir(vbr->req) == WRITE)
> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> index 1ec00ba..b06a355 100644
> --- a/drivers/scsi/scsi_lib.c
> +++ b/drivers/scsi/scsi_lib.c
> @@ -1890,6 +1890,7 @@ static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req,
>         scsi_init_cmd_errh(cmd);
>         cmd->scsi_done = scsi_mq_done;
>
> +       blk_mq_start_request(req);
>         reason = scsi_dispatch_cmd(cmd);
>         if (reason) {
>                 scsi_set_blocked(cmd, reason);
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 9c4e306..878b6f7 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -159,6 +159,7 @@ struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
>  struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
>  struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int);
>
> +void blk_mq_start_request(struct request *rq);
>  void blk_mq_end_io(struct request *rq, int error);
>  void __blk_mq_end_io(struct request *rq, int error);
>
> --
> 1.9.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/



-- 
Ming Lei

^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: blk-mq timeout handling fixes
  2014-09-13 23:40 blk-mq timeout handling fixes Christoph Hellwig
                   ` (5 preceding siblings ...)
  2014-09-13 23:40 ` [PATCH 6/6] blk-mq: pass a reserved argument to the " Christoph Hellwig
@ 2014-09-17 21:53 ` Elliott, Robert (Server Storage)
  2014-09-17 21:56   ` Jens Axboe
  6 siblings, 1 reply; 11+ messages in thread
From: Elliott, Robert (Server Storage) @ 2014-09-17 21:53 UTC (permalink / raw)
  To: Christoph Hellwig, Jens Axboe; +Cc: linux-scsi, linux-kernel



> -----Original Message-----
> From: Christoph Hellwig [mailto:hch@lst.de]
> Sent: Saturday, 13 September, 2014 6:40 PM
> To: Jens Axboe
> Cc: Elliott, Robert (Server Storage); linux-scsi@vger.kernel.org; linux-
> kernel@vger.kernel.org
> Subject: blk-mq timeout handling fixes
> 
> This series fixes various issues with timeout handling that Robert
> ran into when testing scsi-mq heavily.  He tested an earlier version
> and couldn't reproduce the issues anymore, although the series has
> changed quite significantly since then and should probably be retested.
> 
> In summary, we now not only start the blk-mq timer inside the driver's
> ->queue_rq method after the request has been fully set up, but we
> also tell the driver whether we're timing out a reserved (internal)
> request or a real one.  Many drivers will need to handle those
> internal ones differently, e.g. for scsi-mq we don't even have a
> scsi command structure allocated for the reserved commands.

I have rerun a variety of tests on:
* Jens' for-next tree that went into 3.17rc5
* plus this series
* plus two patches for infinite recursion on flushes from 
  Ming and then Christoph

and have not been able to trigger the scsi_times_out req->special
NULL pointer dereference that prompted this series.

Testing includes:
* concurrent heavy workload generators:
  * fio high iodepth direct 512 byte random reads (> 1M IOPS)
  * programs generating large bursts of paged writes
    * mkfs.ext4 (followed by e2fsck)
    * mkfs.xfs (followed by xfs_check)
    * ddpt
  * watch -n 0 sync to generate flushes
* scsi_logging_level MLCOMPLETE set to 0 or 1
  * scsi_lib.c patched to put all the ACTION_FAIL messages
    under level 1 so they can be squelched (massive error 
    prints cause more timeouts themselves)
* 4 hpsa and 16 mpt3sas devices (all made from SAS SSDs)
  * lockless hpsa driver
* injecting errors
  * device removal
  * device generating infinite errors
  * device generating a brief number of errors

The filesystems don't always recover properly, but nothing in 
the block or scsi midlayers crashed.

So, you may add this to the series:
Tested-by: Robert Elliott <elliott@hp.com>

---
Rob Elliott    HP Server Storage





^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: blk-mq timeout handling fixes
  2014-09-17 21:53 ` blk-mq timeout handling fixes Elliott, Robert (Server Storage)
@ 2014-09-17 21:56   ` Jens Axboe
  0 siblings, 0 replies; 11+ messages in thread
From: Jens Axboe @ 2014-09-17 21:56 UTC (permalink / raw)
  To: Elliott, Robert (Server Storage), Christoph Hellwig
  Cc: linux-scsi, linux-kernel

On 09/17/2014 03:53 PM, Elliott, Robert (Server Storage) wrote:
> 
> 
>> -----Original Message-----
>> From: Christoph Hellwig [mailto:hch@lst.de]
>> Sent: Saturday, 13 September, 2014 6:40 PM
>> To: Jens Axboe
>> Cc: Elliott, Robert (Server Storage); linux-scsi@vger.kernel.org; linux-
>> kernel@vger.kernel.org
>> Subject: blk-mq timeout handling fixes
>>
>> This series fixes various issues with timeout handling that Robert
>> ran into when testing scsi-mq heavily.  He tested an earlier version
>> and couldn't reproduce the issues anymore, although the series has
>> changed quite significantly since then and should probably be retested.
>>
>> In summary, we now not only start the blk-mq timer inside the driver's
>> ->queue_rq method after the request has been fully set up, but we
>> also tell the driver whether we're timing out a reserved (internal)
>> request or a real one.  Many drivers will need to handle those
>> internal ones differently, e.g. for scsi-mq we don't even have a
>> scsi command structure allocated for the reserved commands.
> 
> I have rerun a variety of tests on:
> * Jens' for-next tree that went into 3.17rc5
> * plus this series
> * plus two patches for infinite recursion on flushes from 
>   Ming and then Christoph

This is pretty much what is queued up for 3.17 as well. It's bigger than
I'd like at this point, but these are real fixes.

> and have not been able to trigger the scsi_times_out req->special
> NULL pointer dereference that prompted this series.

Great!!

> Testing includes:
> * concurrent heavy workload generators:
>   * fio high iodepth direct 512 byte random reads (> 1M IOPS)
>   * programs generating large bursts of paged writes
>     * mkfs.ext4 (followed by e2fsck)
>     * mkfs.xfs (followed by xfs_check)
>     * ddpt
>   * watch -n 0 sync to generate flushes
> * scsi_logging_level MLCOMPLETE set to 0 or 1
>   * scsi_lib.c patched to put all the ACTION_FAIL messages
>     under level 1 so they can be squelched (massive error 
>     prints cause more timeouts themselves)
> * 4 hpsa and 16 mpt3sas devices (all made from SAS SSDs)
>   * lockless hpsa driver
> * injecting errors
>   * device removal
>   * device generating infinite errors
>   * device generating a brief number of errors
> 
> The filesystems don't always recover properly, but nothing in 
> the block or scsi midlayers crashed.
> 
> So, you may add this to the series:
> Tested-by: Robert Elliott <elliott@hp.com>

Thanks a lot for your (continued) testing, Robert. It's a great help.


-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 11+ messages in thread

Thread overview: 11+ messages
2014-09-13 23:40 blk-mq timeout handling fixes Christoph Hellwig
2014-09-13 23:40 ` [PATCH 1/6] blk-mq: remove REQ_END Christoph Hellwig
2014-09-13 23:40 ` [PATCH 2/6] blk-mq: call blk_mq_start_request from ->queue_rq Christoph Hellwig
2014-09-15  6:34   ` Ming Lei
2014-09-15  7:27   ` Ming Lei
2014-09-13 23:40 ` [PATCH 3/6] blk-mq: rename blk_mq_end_io to blk_mq_end_request Christoph Hellwig
2014-09-13 23:40 ` [PATCH 4/6] blk-mq: fix and simplify tag iteration for the timeout handler Christoph Hellwig
2014-09-13 23:40 ` [PATCH 5/6] blk-mq: unshared " Christoph Hellwig
2014-09-13 23:40 ` [PATCH 6/6] blk-mq: pass a reserved argument to the " Christoph Hellwig
2014-09-17 21:53 ` blk-mq timeout handling fixes Elliott, Robert (Server Storage)
2014-09-17 21:56   ` Jens Axboe
