* [PATCH 0/1] block: rework flush sequencing for blk-mq
@ 2014-01-30 13:26 Christoph Hellwig
  2014-01-30 13:26 ` [PATCH 1/1] " Christoph Hellwig
From: Christoph Hellwig @ 2014-01-30 13:26 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Shaohua Li, linux-kernel

This patch aims to make the flush sequence for blk-mq work the same
as for the old request path, that is, to set aside a special request
for the sequenced flushes, which only gets submitted while the parent
request(s) are blocked.  It kinda reverts two earlier patches from me
and Shaohua that tried to hack around the issues we had before.

There are still a few differences from the legacy flush code.  The first
one is that we have to submit blk-mq requests from user context and
need a workqueue for it.  For now the code for that is kept in the flush
code, but once blk-mq grows requeueing code, as needed e.g. for a proper
SCSI conversion, it might be time to lift that to common code.  The
second one is that the old code has a way to defer flushes depending on
queue busy status (the flush_queue_delayed flag), but I have no idea
how to create something similar for blk-mq.

Last but not least it might make sense to allocate the special flush
request per hardware context and on the right node, but I'd like to
defer that until someone has actual high performance hardware that
requires flushes; before that the law of premature optimization applies.
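
For reference, the sequencing itself still follows the existing
REQ_FSEQ_* state machine in block/blk-flush.c; in outline (my summary,
not code from the patch below):

	REQ_FSEQ_PREFLUSH:  issue the preallocated flush_rq (cache flush)
	REQ_FSEQ_DATA:      issue the data part of the original request
	REQ_FSEQ_POSTFLUSH: issue flush_rq again if the device lacks FUA
	REQ_FSEQ_DONE:      complete the original request

The original request is held back while flush_rq is in flight, which is
why a single preallocated request per queue is enough.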


* [PATCH 1/1] block: rework flush sequencing for blk-mq
  2014-01-30 13:26 [PATCH 0/1] block: rework flush sequencing for blk-mq Christoph Hellwig
@ 2014-01-30 13:26 ` Christoph Hellwig
  2014-02-07  1:18   ` Shaohua Li
  2014-03-07 20:45   ` Jeff Moyer
From: Christoph Hellwig @ 2014-01-30 13:26 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Shaohua Li, linux-kernel

Switch to using a preallocated flush_rq for blk-mq similar to what's done
with the old request path.  This allows us to set up the request properly
with a tag from the actually allowed range and ->rq_disk as needed by
some drivers.  To make life easier we also switch to dynamic allocation
of ->flush_rq for the old path.

This effectively reverts most of

    "blk-mq: fix for flush deadlock"

and

    "blk-mq: Don't reserve a tag for flush request"

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-core.c       |   15 +++++--
 block/blk-flush.c      |  105 ++++++++++++++++++------------------------------
 block/blk-mq.c         |   54 +++++++++----------------
 block/blk-mq.h         |    1 +
 block/blk-sysfs.c      |    2 +
 include/linux/blk-mq.h |    5 +--
 include/linux/blkdev.h |   11 ++---
 7 files changed, 76 insertions(+), 117 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index c00e0bd..d3eb330 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -693,11 +693,20 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
 	if (!uninit_q)
 		return NULL;
 
+	uninit_q->flush_rq = kzalloc(sizeof(struct request), GFP_KERNEL);
+	if (!uninit_q->flush_rq)
+		goto out_cleanup_queue;
+
 	q = blk_init_allocated_queue(uninit_q, rfn, lock);
 	if (!q)
-		blk_cleanup_queue(uninit_q);
-
+		goto out_free_flush_rq;
 	return q;
+
+out_free_flush_rq:
+	kfree(uninit_q->flush_rq);
+out_cleanup_queue:
+	blk_cleanup_queue(uninit_q);
+	return NULL;
 }
 EXPORT_SYMBOL(blk_init_queue_node);
 
@@ -1127,7 +1136,7 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw,
 struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
 {
 	if (q->mq_ops)
-		return blk_mq_alloc_request(q, rw, gfp_mask, false);
+		return blk_mq_alloc_request(q, rw, gfp_mask);
 	else
 		return blk_old_get_request(q, rw, gfp_mask);
 }
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 9143e85..66e2b69 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -130,20 +130,26 @@ static void blk_flush_restore_request(struct request *rq)
 	blk_clear_rq_complete(rq);
 }
 
-static void mq_flush_data_run(struct work_struct *work)
+static void mq_flush_run(struct work_struct *work)
 {
 	struct request *rq;
 
-	rq = container_of(work, struct request, mq_flush_data);
+	rq = container_of(work, struct request, mq_flush_work);
 
 	memset(&rq->csd, 0, sizeof(rq->csd));
 	blk_mq_run_request(rq, true, false);
 }
 
-static void blk_mq_flush_data_insert(struct request *rq)
+static bool blk_flush_queue_rq(struct request *rq)
 {
-	INIT_WORK(&rq->mq_flush_data, mq_flush_data_run);
-	kblockd_schedule_work(rq->q, &rq->mq_flush_data);
+	if (rq->q->mq_ops) {
+		INIT_WORK(&rq->mq_flush_work, mq_flush_run);
+		kblockd_schedule_work(rq->q, &rq->mq_flush_work);
+		return false;
+	} else {
+		list_add_tail(&rq->queuelist, &rq->q->queue_head);
+		return true;
+	}
 }
 
 /**
@@ -187,12 +193,7 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
 
 	case REQ_FSEQ_DATA:
 		list_move_tail(&rq->flush.list, &q->flush_data_in_flight);
-		if (q->mq_ops)
-			blk_mq_flush_data_insert(rq);
-		else {
-			list_add(&rq->queuelist, &q->queue_head);
-			queued = true;
-		}
+		queued = blk_flush_queue_rq(rq);
 		break;
 
 	case REQ_FSEQ_DONE:
@@ -216,9 +217,6 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
 	}
 
 	kicked = blk_kick_flush(q);
-	/* blk_mq_run_flush will run queue */
-	if (q->mq_ops)
-		return queued;
 	return kicked | queued;
 }
 
@@ -230,10 +228,9 @@ static void flush_end_io(struct request *flush_rq, int error)
 	struct request *rq, *n;
 	unsigned long flags = 0;
 
-	if (q->mq_ops) {
-		blk_mq_free_request(flush_rq);
+	if (q->mq_ops)
 		spin_lock_irqsave(&q->mq_flush_lock, flags);
-	}
+
 	running = &q->flush_queue[q->flush_running_idx];
 	BUG_ON(q->flush_pending_idx == q->flush_running_idx);
 
@@ -263,48 +260,14 @@ static void flush_end_io(struct request *flush_rq, int error)
 	 * kblockd.
 	 */
 	if (queued || q->flush_queue_delayed) {
-		if (!q->mq_ops)
-			blk_run_queue_async(q);
-		else
-		/*
-		 * This can be optimized to only run queues with requests
-		 * queued if necessary.
-		 */
-			blk_mq_run_queues(q, true);
+		WARN_ON(q->mq_ops);
+		blk_run_queue_async(q);
 	}
 	q->flush_queue_delayed = 0;
 	if (q->mq_ops)
 		spin_unlock_irqrestore(&q->mq_flush_lock, flags);
 }
 
-static void mq_flush_work(struct work_struct *work)
-{
-	struct request_queue *q;
-	struct request *rq;
-
-	q = container_of(work, struct request_queue, mq_flush_work);
-
-	rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ,
-		__GFP_WAIT|GFP_ATOMIC, false);
-	rq->cmd_type = REQ_TYPE_FS;
-	rq->end_io = flush_end_io;
-
-	blk_mq_run_request(rq, true, false);
-}
-
-/*
- * We can't directly use q->flush_rq, because it doesn't have tag and is not in
- * hctx->rqs[]. so we must allocate a new request, since we can't sleep here,
- * so offload the work to workqueue.
- *
- * Note: we assume a flush request finished in any hardware queue will flush
- * the whole disk cache.
- */
-static void mq_run_flush(struct request_queue *q)
-{
-	kblockd_schedule_work(q, &q->mq_flush_work);
-}
-
 /**
  * blk_kick_flush - consider issuing flush request
  * @q: request_queue being kicked
@@ -339,19 +302,31 @@ static bool blk_kick_flush(struct request_queue *q)
 	 * different from running_idx, which means flush is in flight.
 	 */
 	q->flush_pending_idx ^= 1;
+
 	if (q->mq_ops) {
-		mq_run_flush(q);
-		return true;
+		struct blk_mq_ctx *ctx = first_rq->mq_ctx;
+		struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
+
+		blk_mq_rq_init(hctx, q->flush_rq);
+		q->flush_rq->mq_ctx = ctx;
+
+		/*
+	 * Reuse the tag value from the first waiting request,
+		 * with blk-mq the tag is generated during request
+		 * allocation and drivers can rely on it being inside
+		 * the range they asked for.
+		 */
+		q->flush_rq->tag = first_rq->tag;
+	} else {
+		blk_rq_init(q, q->flush_rq);
 	}
 
-	blk_rq_init(q, &q->flush_rq);
-	q->flush_rq.cmd_type = REQ_TYPE_FS;
-	q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
-	q->flush_rq.rq_disk = first_rq->rq_disk;
-	q->flush_rq.end_io = flush_end_io;
+	q->flush_rq->cmd_type = REQ_TYPE_FS;
+	q->flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
+	q->flush_rq->rq_disk = first_rq->rq_disk;
+	q->flush_rq->end_io = flush_end_io;
 
-	list_add_tail(&q->flush_rq.queuelist, &q->queue_head);
-	return true;
+	return blk_flush_queue_rq(q->flush_rq);
 }
 
 static void flush_data_end_io(struct request *rq, int error)
@@ -407,11 +382,8 @@ void blk_insert_flush(struct request *rq)
 	/*
 	 * @policy now records what operations need to be done.  Adjust
 	 * REQ_FLUSH and FUA for the driver.
-	 * We keep REQ_FLUSH for mq to track flush requests. For !FUA,
-	 * we never dispatch the request directly.
 	 */
-	if (rq->cmd_flags & REQ_FUA)
-		rq->cmd_flags &= ~REQ_FLUSH;
+	rq->cmd_flags &= ~REQ_FLUSH;
 	if (!(fflags & REQ_FUA))
 		rq->cmd_flags &= ~REQ_FUA;
 
@@ -560,5 +532,4 @@ EXPORT_SYMBOL(blkdev_issue_flush);
 void blk_mq_init_flush(struct request_queue *q)
 {
 	spin_lock_init(&q->mq_flush_lock);
-	INIT_WORK(&q->mq_flush_work, mq_flush_work);
 }
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9072d0a..5c3073f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -194,27 +194,9 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
 }
 
 static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
-					      gfp_t gfp, bool reserved,
-					      int rw)
+					      gfp_t gfp, bool reserved)
 {
-	struct request *req;
-	bool is_flush = false;
-	/*
-	 * flush need allocate a request, leave at least one request for
-	 * non-flush IO to avoid deadlock
-	 */
-	if ((rw & REQ_FLUSH) && !(rw & REQ_FLUSH_SEQ)) {
-		if (atomic_inc_return(&hctx->pending_flush) >=
-		    hctx->queue_depth - hctx->reserved_tags - 1) {
-			atomic_dec(&hctx->pending_flush);
-			return NULL;
-		}
-		is_flush = true;
-	}
-	req = blk_mq_alloc_rq(hctx, gfp, reserved);
-	if (!req && is_flush)
-		atomic_dec(&hctx->pending_flush);
-	return req;
+	return blk_mq_alloc_rq(hctx, gfp, reserved);
 }
 
 static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
@@ -227,7 +209,7 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
 		struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
 		struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
 
-		rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved, rw);
+		rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved);
 		if (rq) {
 			blk_mq_rq_ctx_init(q, ctx, rq, rw);
 			break;
@@ -244,15 +226,14 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
 	return rq;
 }
 
-struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
-		gfp_t gfp, bool reserved)
+struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp)
 {
 	struct request *rq;
 
 	if (blk_mq_queue_enter(q))
 		return NULL;
 
-	rq = blk_mq_alloc_request_pinned(q, rw, gfp, reserved);
+	rq = blk_mq_alloc_request_pinned(q, rw, gfp, false);
 	if (rq)
 		blk_mq_put_ctx(rq->mq_ctx);
 	return rq;
@@ -276,7 +257,7 @@ EXPORT_SYMBOL(blk_mq_alloc_reserved_request);
 /*
  * Re-init and set pdu, if we have it
  */
-static void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq)
+void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq)
 {
 	blk_rq_init(hctx->queue, rq);
 
@@ -290,9 +271,6 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
 	const int tag = rq->tag;
 	struct request_queue *q = rq->q;
 
-	if ((rq->cmd_flags & REQ_FLUSH) && !(rq->cmd_flags & REQ_FLUSH_SEQ))
-		atomic_dec(&hctx->pending_flush);
-
 	blk_mq_rq_init(hctx, rq);
 	blk_mq_put_tag(hctx->tags, tag);
 
@@ -921,14 +899,14 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	hctx = q->mq_ops->map_queue(q, ctx->cpu);
 
 	trace_block_getrq(q, bio, rw);
-	rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false, bio->bi_rw);
+	rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false);
 	if (likely(rq))
-		blk_mq_rq_ctx_init(q, ctx, rq, bio->bi_rw);
+		blk_mq_rq_ctx_init(q, ctx, rq, rw);
 	else {
 		blk_mq_put_ctx(ctx);
 		trace_block_sleeprq(q, bio, rw);
-		rq = blk_mq_alloc_request_pinned(q, bio->bi_rw,
-				__GFP_WAIT|GFP_ATOMIC, false);
+		rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC,
+							false);
 		ctx = rq->mq_ctx;
 		hctx = q->mq_ops->map_queue(q, ctx->cpu);
 	}
@@ -1205,9 +1183,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
 		hctx->queue_num = i;
 		hctx->flags = reg->flags;
 		hctx->queue_depth = reg->queue_depth;
-		hctx->reserved_tags = reg->reserved_tags;
 		hctx->cmd_size = reg->cmd_size;
-		atomic_set(&hctx->pending_flush, 0);
 
 		blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
 						blk_mq_hctx_notify, hctx);
@@ -1382,9 +1358,14 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
 	blk_mq_init_flush(q);
 	blk_mq_init_cpu_queues(q, reg->nr_hw_queues);
 
-	if (blk_mq_init_hw_queues(q, reg, driver_data))
+	q->flush_rq = kzalloc(round_up(sizeof(struct request) + reg->cmd_size,
+				cache_line_size()), GFP_KERNEL);
+	if (!q->flush_rq)
 		goto err_hw;
 
+	if (blk_mq_init_hw_queues(q, reg, driver_data))
+		goto err_flush_rq;
+
 	blk_mq_map_swqueue(q);
 
 	mutex_lock(&all_q_mutex);
@@ -1392,6 +1373,9 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
 	mutex_unlock(&all_q_mutex);
 
 	return q;
+
+err_flush_rq:
+	kfree(q->flush_rq);
 err_hw:
 	kfree(q->mq_map);
 err_map:
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 5c39179..b771080 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -29,6 +29,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
 void blk_mq_init_flush(struct request_queue *q);
 void blk_mq_drain_queue(struct request_queue *q);
 void blk_mq_free_queue(struct request_queue *q);
+void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq);
 
 /*
  * CPU hotplug helpers
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 8095c4a..7500f87 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -549,6 +549,8 @@ static void blk_release_queue(struct kobject *kobj)
 	if (q->mq_ops)
 		blk_mq_free_queue(q);
 
+	kfree(q->flush_rq);
+
 	blk_trace_shutdown(q);
 
 	bdi_destroy(&q->backing_dev_info);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 1e8f16f..c1684ec 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -36,15 +36,12 @@ struct blk_mq_hw_ctx {
 	struct list_head	page_list;
 	struct blk_mq_tags	*tags;
 
-	atomic_t		pending_flush;
-
 	unsigned long		queued;
 	unsigned long		run;
 #define BLK_MQ_MAX_DISPATCH_ORDER	10
 	unsigned long		dispatched[BLK_MQ_MAX_DISPATCH_ORDER];
 
 	unsigned int		queue_depth;
-	unsigned int		reserved_tags;
 	unsigned int		numa_node;
 	unsigned int		cmd_size;	/* per-request extra data */
 
@@ -126,7 +123,7 @@ void blk_mq_insert_request(struct request_queue *, struct request *, bool);
 void blk_mq_run_queues(struct request_queue *q, bool async);
 void blk_mq_free_request(struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
-struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, bool reserved);
+struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp);
 struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, gfp_t gfp);
 struct request *blk_mq_rq_from_tag(struct request_queue *q, unsigned int tag);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0375654..b2d25ec 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -101,7 +101,7 @@ struct request {
 	};
 	union {
 		struct call_single_data csd;
-		struct work_struct mq_flush_data;
+		struct work_struct mq_flush_work;
 	};
 
 	struct request_queue *q;
@@ -451,13 +451,8 @@ struct request_queue {
 	unsigned long		flush_pending_since;
 	struct list_head	flush_queue[2];
 	struct list_head	flush_data_in_flight;
-	union {
-		struct request	flush_rq;
-		struct {
-			spinlock_t mq_flush_lock;
-			struct work_struct mq_flush_work;
-		};
-	};
+	struct request		*flush_rq;
+	spinlock_t		mq_flush_lock;
 
 	struct mutex		sysfs_lock;
 
-- 
1.7.10.4



* Re: [PATCH 1/1] block: rework flush sequencing for blk-mq
  2014-01-30 13:26 ` [PATCH 1/1] " Christoph Hellwig
@ 2014-02-07  1:18   ` Shaohua Li
  2014-02-07 14:19     ` Christoph Hellwig
  2014-03-07 20:45   ` Jeff Moyer
From: Shaohua Li @ 2014-02-07  1:18 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Jens Axboe, linux-kernel

On Thu, Jan 30, 2014 at 05:26:30AM -0800, Christoph Hellwig wrote:
> Switch to using a preallocated flush_rq for blk-mq similar to what's done
> with the old request path.  This allows us to set up the request properly
> with a tag from the actually allowed range and ->rq_disk as needed by
> some drivers.  To make life easier we also switch to dynamic allocation
> of ->flush_rq for the old path.
> 
> This effectively reverts most of
> 
>     "blk-mq: fix for flush deadlock"
> 
> and
> 
>     "blk-mq: Don't reserve a tag for flush request"

Reusing the tag for the flush request was considered before.  The problem is
that drivers need to get a request from a tag, and reusing the tag breaks
this.  A possible solution is to provide a blk_mq_tag_to_request() helper and
force drivers to use it; in that function, if the tag equals the flush_rq
tag, we return the flush request.
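
Something like this (rough, untested sketch; the helper name and the
flush check are illustration only, not from the patch):

struct request *blk_mq_tag_to_request(struct request_queue *q,
				      unsigned int tag)
{
	/*
	 * The preallocated flush request borrows the tag of the
	 * request it stands in for, so resolve it explicitly before
	 * falling back to the normal tag lookup.  (Races with flush
	 * completion are ignored here.)
	 */
	if (q->flush_rq && q->flush_rq->tag == tag &&
	    (q->flush_rq->cmd_flags & REQ_FLUSH_SEQ))
		return q->flush_rq;

	return blk_mq_rq_from_tag(q, tag);
}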

Thanks,
Shaohua


* Re: [PATCH 1/1] block: rework flush sequencing for blk-mq
  2014-02-07  1:18   ` Shaohua Li
@ 2014-02-07 14:19     ` Christoph Hellwig
  2014-02-08  0:55       ` Shaohua Li
From: Christoph Hellwig @ 2014-02-07 14:19 UTC (permalink / raw)
  To: Shaohua Li; +Cc: Christoph Hellwig, Jens Axboe, linux-kernel

On Fri, Feb 07, 2014 at 09:18:27AM +0800, Shaohua Li wrote:
> Reusing the tag for the flush request was considered before.  The problem is
> that drivers need to get a request from a tag, and reusing the tag breaks
> this.  A possible solution is to provide a blk_mq_tag_to_request() helper and
> force drivers to use it; in that function, if the tag equals the flush_rq
> tag, we return the flush request.

If we want to support tag to request reverse mapping we definitely
need to do this in the core, at which point special-casing flush_rq
there is easy.

Which driver needs the reverse mapping? Neither virtio_blk nor null_blk
seem to and I've not seen any other conversions submitted except for the
scsi work.



* Re: [PATCH 1/1] block: rework flush sequencing for blk-mq
  2014-02-07 14:19     ` Christoph Hellwig
@ 2014-02-08  0:55       ` Shaohua Li
  2014-02-10 10:33         ` Christoph Hellwig
From: Shaohua Li @ 2014-02-08  0:55 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Jens Axboe, linux-kernel

On Fri, Feb 07, 2014 at 06:19:15AM -0800, Christoph Hellwig wrote:
> On Fri, Feb 07, 2014 at 09:18:27AM +0800, Shaohua Li wrote:
> > Reusing the tag for the flush request was considered before.  The problem is
> > that drivers need to get a request from a tag, and reusing the tag breaks
> > this.  A possible solution is to provide a blk_mq_tag_to_request() helper and
> > force drivers to use it; in that function, if the tag equals the flush_rq
> > tag, we return the flush request.
> 
> If we want to support tag to request reverse mapping we definitely
> need to do this in the core, at which point special-casing flush_rq
> there is easy.
> 
> Which driver needs the reverse mapping? Neither virtio_blk nor null_blk
> seem to and I've not seen any other conversions submitted except for the
> scsi work.

Yep, no driver needs it yet.  But reverse mapping is one of the reasons we
use tags, so we'd better prepare for it.

Thanks,
Shaohua


* Re: [PATCH 1/1] block: rework flush sequencing for blk-mq
  2014-02-08  0:55       ` Shaohua Li
@ 2014-02-10 10:33         ` Christoph Hellwig
From: Christoph Hellwig @ 2014-02-10 10:33 UTC (permalink / raw)
  To: Shaohua Li; +Cc: Christoph Hellwig, Jens Axboe, linux-kernel

On Sat, Feb 08, 2014 at 08:55:07AM +0800, Shaohua Li wrote:
> Yep, no driver needs it yet.  But reverse mapping is one of the reasons we
> use tags, so we'd better prepare for it.

I don't think adding unused code to the tree for a future driver is a good
idea, although I'm happy to help out with the functionality as soon
as the need arises.

In the meantime we really need something like this fix to allow
non-trivial drivers to use blk-mq.


* Re: [PATCH 1/1] block: rework flush sequencing for blk-mq
  2014-01-30 13:26 ` [PATCH 1/1] " Christoph Hellwig
  2014-02-07  1:18   ` Shaohua Li
@ 2014-03-07 20:45   ` Jeff Moyer
  2014-03-08 15:52     ` Christoph Hellwig
From: Jeff Moyer @ 2014-03-07 20:45 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Jens Axboe, Shaohua Li, linux-kernel, msnitzer

Christoph Hellwig <hch@infradead.org> writes:

> Switch to using a preallocated flush_rq for blk-mq similar to what's done
> with the old request path.  This allows us to set up the request properly
> with a tag from the actually allowed range and ->rq_disk as needed by
> some drivers.  To make life easier we also switch to dynamic allocation
> of ->flush_rq for the old path.
>
> This effectively reverts most of
>
>     "blk-mq: fix for flush deadlock"
>
> and
>
>     "blk-mq: Don't reserve a tag for flush request"
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>

[snip]

> -static void blk_mq_flush_data_insert(struct request *rq)
> +static bool blk_flush_queue_rq(struct request *rq)
>  {
> -	INIT_WORK(&rq->mq_flush_data, mq_flush_data_run);
> -	kblockd_schedule_work(rq->q, &rq->mq_flush_data);
> +	if (rq->q->mq_ops) {
> +		INIT_WORK(&rq->mq_flush_work, mq_flush_run);
> +		kblockd_schedule_work(rq->q, &rq->mq_flush_work);
> +		return false;
> +	} else {
> +		list_add_tail(&rq->queuelist, &rq->q->queue_head);
> +		return true;
> +	}
>  }
>  
>  /**
> @@ -187,12 +193,7 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
>  
>  	case REQ_FSEQ_DATA:
>  		list_move_tail(&rq->flush.list, &q->flush_data_in_flight);
> -		if (q->mq_ops)
> -			blk_mq_flush_data_insert(rq);
> -		else {
> -			list_add(&rq->queuelist, &q->queue_head);
> -			queued = true;
> -		}
> +		queued = blk_flush_queue_rq(rq);
>  		break;

Hi, Christoph,

Did you mean to switch from list_add to list_add_tail?  That seems like
a change that warrants mention.

Cheers,
Jeff


* Re: [PATCH 1/1] block: rework flush sequencing for blk-mq
  2014-03-07 20:45   ` Jeff Moyer
@ 2014-03-08 15:52     ` Christoph Hellwig
  2014-03-08 17:33       ` Mike Snitzer
From: Christoph Hellwig @ 2014-03-08 15:52 UTC (permalink / raw)
  To: Jeff Moyer
  Cc: Jens Axboe, Shaohua Li, linux-kernel, msnitzer, Hannes Reinecke

On Fri, Mar 07, 2014 at 03:45:09PM -0500, Jeff Moyer wrote:
> Hi, Christoph,
> 
> Did you mean to switch from list_add to list_add_tail?  That seems like
> a change that warrants mention.

No, that wasn't intentional and should be fixed.  Btw, there was another
issue with that commit, in that dm-multipath also needs to allocate
->flush_rq.  I saw a patch from Hannes fixing it in the SuSE tree, and
would really love to see him submit that for mainline as well.

Unfortunately SuSE seems to have lots of block and dm fixes and even
features that they don't submit upstream.



* Re: [PATCH 1/1] block: rework flush sequencing for blk-mq
  2014-03-08 15:52     ` Christoph Hellwig
@ 2014-03-08 17:33       ` Mike Snitzer
  2014-03-08 19:51         ` Hannes Reinecke
From: Mike Snitzer @ 2014-03-08 17:33 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jeff Moyer, Jens Axboe, Shaohua Li, linux-kernel, msnitzer,
	Hannes Reinecke

On Sat, Mar 8, 2014 at 10:52 AM, Christoph Hellwig <hch@infradead.org> wrote:
> On Fri, Mar 07, 2014 at 03:45:09PM -0500, Jeff Moyer wrote:
>> Hi, Christoph,
>>
>> Did you mean to switch from list_add to list_add_tail?  That seems like
>> a change that warrants mention.
>
> No, that wasn't intentional and should be fixed.  Btw, there was another
> issue with that commit, in that dm-multipath also needs to allocate
> ->flush_rq.  I saw a patch from Hannes fixing it in the SuSE tree, and
> would really love to see him submit that for mainline as well.

Ugh, rq-based DM calls blk_init_allocated_queue.. (ah, looks like best
to move q->flush_rq allocation from blk_init_queue_node to
blk_alloc_queue_node?).  Anyway, this all makes sense given the
crashes we've been dealing with.. we couldn't immediately understand
how the q->flush_rq would be NULL... grr.  I guess I should've checked
with you sooner.  I reverted commit 1874198 "blk-mq: rework flush
sequencing logic" from RHEL7 just yesterday because we were seeing
crashes on flush with dm-mpath.  But can easily re-apply for RHEL7.1
(since the request_queue's embedded flush_rq takes up so much space we
get ample kABI padding).

Not overly proud of the revert, but I deemed it easier to revert than
to hunt down the fix given RHEL7 won't actually be providing any blk-mq
enabled drivers.  That'll change for RHEL7.1.

> Unfortunately SuSE seems to have lots of block and dm fixes and even
> features that they don't submit upstream.

Yeah, it is certainly disturbing.  No excuse for sitting on fixes like this.

Hannes, _please_ get this dm-mpath flush_rq fix for 3.14 posted ASAP.
Jens or I will need to get it to Linus next week.

Thanks,
Mike


* Re: [PATCH 1/1] block: rework flush sequencing for blk-mq
  2014-03-08 19:51         ` Hannes Reinecke
@ 2014-03-08 18:13           ` Mike Snitzer
  2014-03-08 21:33             ` Hannes Reinecke
  2014-03-12 10:28           ` [PATCH 1/1] block: rework flush sequencing for blk-mq Christoph Hellwig
From: Mike Snitzer @ 2014-03-08 18:13 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Christoph Hellwig, Jeff Moyer, Jens Axboe, Shaohua Li,
	linux-kernel, msnitzer

On Sat, Mar 8, 2014 at 2:51 PM, Hannes Reinecke <hare@suse.de> wrote:
> On 03/08/2014 06:33 PM, Mike Snitzer wrote:
>>
>> On Sat, Mar 8, 2014 at 10:52 AM, Christoph Hellwig <hch@infradead.org>
>> wrote:
>>>
>>> On Fri, Mar 07, 2014 at 03:45:09PM -0500, Jeff Moyer wrote:
>>>>
>>>> Hi, Christoph,
>>>>
>>>> Did you mean to switch from list_add to list_add_tail?  That seems like
>>>> a change that warrants mention.
>>>
>>>
>>> No, that wasn't intentional and should be fixed.  Btw, there was another
>>> issue with that commit, in that dm-multipath also needs to allocate
>>> ->flush_rq.  I saw a patch from Hannes fixing it in the SuSE tree, and
>>> would really love to see him submit that for mainline as well.
>>
>>
>> Ugh, rq-based DM calls blk_init_allocated_queue.. (ah, looks like best
>> to move q->flush_rq allocation from blk_init_queue_node to
>> blk_alloc_queue_node?).

The above suggestion would work.  But we'd lose the side-effect
benefit of bio-based DM not needing q->flush_rq allocated at all.

But I'm not immediately seeing a clean way to get that benefit (of not
allocating for a bio-based request_queue) while always allocating it for
request-based queues.

Actually, how about moving the flush_rq allocation to
blk_init_allocated_queue()?  It'd accomplish both.  What do others
think of that?

>>> Unfortunately SuSE seems to have lots of block and dm fixes and even
>>> features that they don't submit upstream.
>>
>>
>> Yeah, it is certainly disturbing.  No excuse for sitting on fixes like
>> this.
>>
>> Hannes, _please_ get this dm-mpath flush_rq fix for 3.14 posted ASAP.
>> Jens or I will need to get it to Linus next week.
>>
> Hey, calm down.

I'm calm.. was just a bit frustrated.  But this isn't a big deal.
I'll make an effort to reach out to relevant people sooner when
similar stuff is reported against recently upstreamed code.  Would be
cool if you did the same.  I can relate to needing to have the distro
vendor hat on (first needing to determine/answer "is this issue
specific to our hacked distro kernel?", etc).

> I made the fix just two days ago, and was quite surprised that I've been
> the first to hit it; it should've crashed for everybody using dm-multipath.

Yeah, it is surprising that we haven't had any upstream reports.

> And given the pushback I've gotten recently on patches I would have
> thought that it would work for most users; surely the author would've done due
> diligence on the original patchset ...
> Plus I've gotten the reports from S/390, so I put it down to mainframe
> weirdness.
>
> BTW, it's not _my_ decision to sit on tons of SUSE specific patches.

I'll take the bait.. this isn't a SUSE specific patch we're talking about. ;)

> I really try to get things upstream. But I cannot do more than send
> patches upstream, patiently answer any questions, and redo the patchset.
> Which I did, frequently.  But, alas, it's up to the maintainer to apply them.
> And I can only ask and hope. The usual story...

Guess I'm missing context for which patches you're saying people are
sitting on.  (I doubt you're referring to your dm-mpath patchset on
dm-devel.. since it has gone 9ish iterations.  Now that everything is
sorted out I'll be getting it reviewed and staged for 3.15 next week).

> I'll be sending the patch soon, Monday at latest.

OK, looking forward to seeing it.  Would appreciate your feedback on
the questions/suggestions I posed above.

Thanks,
Mike


* Re: [PATCH 1/1] block: rework flush sequencing for blk-mq
  2014-03-08 17:33       ` Mike Snitzer
@ 2014-03-08 19:51         ` Hannes Reinecke
  2014-03-08 18:13           ` Mike Snitzer
  2014-03-12 10:28           ` [PATCH 1/1] block: rework flush sequencing for blk-mq Christoph Hellwig
From: Hannes Reinecke @ 2014-03-08 19:51 UTC (permalink / raw)
  To: Mike Snitzer, Christoph Hellwig
  Cc: Jeff Moyer, Jens Axboe, Shaohua Li, linux-kernel, msnitzer

On 03/08/2014 06:33 PM, Mike Snitzer wrote:
> On Sat, Mar 8, 2014 at 10:52 AM, Christoph Hellwig <hch@infradead.org> wrote:
>> On Fri, Mar 07, 2014 at 03:45:09PM -0500, Jeff Moyer wrote:
>>> Hi, Christoph,
>>>
>>> Did you mean to switch from list_add to list_add_tail?  That seems like
>>> a change that warrants mention.
>>
>> No, that wasn't intentional and should be fixed.  Btw, there was another
>> issue with that commit, in that dm-multipath also needs to allocate
>> ->flush_rq.  I saw a patch from Hannes fixing it in the SuSE tree, and
>> would really love to see him submit that for mainline as well.
>
> Ugh, rq-based DM calls blk_init_allocated_queue.. (ah, looks like best
> to move q->flush_rq allocation from blk_init_queue_node to
> blk_alloc_queue_node?).  Anyway, this all makes sense given the
> crashes we've been dealing with.. we couldn't immediately understand
> how the q->flush_rq would be NULL... grr.  I guess I should've checked
> with you sooner.  I reverted commit 1874198 "blk-mq: rework flush
> sequencing logic" from RHEL7 just yesterday because we were seeing
> crashes on flush with dm-mpath.  But can easily re-apply for RHEL7.1
> (since the request_queue's embedded flush_rq takes up so much space we
> get ample kABI padding).
>
> Not overly proud of the revert, but I deemed it easier to revert than
> to hunt down the fix given RHEL7 won't actually be providing any blk-mq
> enabled drivers.  That'll change for RHEL7.1.
>
>> Unfortunately SuSE seems to have lots of block and dm fixes and even
>> features that they don't submit upstream.
>
> Yeah, it is certainly disturbing.  No excuse for sitting on fixes like this.
>
> Hannes, _please_ get this dm-mpath flush_rq fix for 3.14 posted ASAP.
> Jens or I will need to get it to Linus next week.
>
Hey, calm down.
I made the fix just two days ago, and was quite surprised that I've
been the first to hit it; it should've crashed for everybody using
dm-multipath.
And given the pushback I've gotten recently on patches I would have
thought that it would work for most users; surely the author would've done
due diligence on the original patchset ...
Plus I've gotten the reports from S/390, so I put it down to mainframe 
weirdness.

BTW, it's not _my_ decision to sit on tons of SUSE specific patches.
I really try to get things upstream. But I cannot do more than send
patches upstream, patiently answer any questions, and redo the patchset.
Which I did, frequently.  But, alas, it's up to the maintainer to apply
them. And I can only ask and hope. The usual story...

I'll be sending the patch soon, Monday at latest.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		      zSeries & Storage
hare@suse.de			      +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)


* Re: [PATCH 1/1] block: rework flush sequencing for blk-mq
  2014-03-08 18:13           ` Mike Snitzer
@ 2014-03-08 21:33             ` Hannes Reinecke
  2014-03-08 22:09               ` [PATCH] block: fix q->flush_rq NULL pointer crash on dm-mpath flush Mike Snitzer
From: Hannes Reinecke @ 2014-03-08 21:33 UTC (permalink / raw)
  To: Mike Snitzer
  Cc: Christoph Hellwig, Jeff Moyer, Jens Axboe, Shaohua Li,
	linux-kernel, msnitzer

On 03/08/2014 07:13 PM, Mike Snitzer wrote:
> On Sat, Mar 8, 2014 at 2:51 PM, Hannes Reinecke <hare@suse.de> wrote:
>> On 03/08/2014 06:33 PM, Mike Snitzer wrote:
>>>
>>> On Sat, Mar 8, 2014 at 10:52 AM, Christoph Hellwig <hch@infradead.org>
>>> wrote:
>>>>
>>>> On Fri, Mar 07, 2014 at 03:45:09PM -0500, Jeff Moyer wrote:
>>>>>
>>>>> Hi, Christoph,
>>>>>
>>>>> Did you mean to switch from list_add to list_add_tail?  That seems like
>>>>> a change that warrants mention.
>>>>
>>>>
>>>> No, that wasn't intentional and should be fixed.  Btw, there was another
>>>> issue with that commit, in that dm-multipath also needs to allocate
>>>> ->flush_rq.  I saw a patch from Hannes fixing it in the SuSE tree, and
>>>> would really love to see him submit that for mainline as well.
>>>
>>>
>>> Ugh, rq-based DM calls blk_init_allocated_queue.. (ah, looks like best
>>> to move q->flush_rq allocation from blk_init_queue_node to
>>> blk_alloc_queue_node?).
>
> The above suggestion would work.  But we'd lose the side-effect
> benefit of bio-based DM not needing q->flush_rq allocated at all.
>
> But I'm not immediately seeing a clean way to get that benefit (of not
> allocating for a bio-based request_queue) while always allocating it for
> request-based queues.
>
> Actually, how about moving the flush_rq allocation to
> blk_init_allocated_queue()?  It'd accomplish both.  What do others
> think of that?
>
>>>> Unfortunately SuSE seems to have lots of block and dm fixes and even
>>>> features that they don't submit upstream.
>>>
>>>
>>> Yeah, it is certainly disturbing.  No excuse for sitting on fixes like
>>> this.
>>>
>>> Hannes, _please_ get this dm-mpath flush_rq fix for 3.14 posted ASAP.
>>> Jens or I will need to get it to Linus next week.
>>>
>> Hey, calm down.
>
> I'm calm.. was just a bit frustrated.  But this isn't a big deal.
> I'll make an effort to reach out to relevant people sooner when
> similar stuff is reported against recently upstreamed code.  Would be
> cool if you did the same.  I can relate to needing to have the distro
> vendor hat on (first needing to determine/answer "is this issue
> specific to our hacked distro kernel?", etc).
>
The patch I made wasn't in the context of 'recently upstreamed code', it 
was due to a backport Jan Kara did for our next distro kernels (3.12-based).
I then made a fix just by code inspection, which _looked_ as if it should
work, and then asked the reporter to test.
I figured that the same issue would be upstream, too; that's right.
But as of now I'm still waiting for feedback on that patch.

I was about to test the patch next week and check for upstream 
application. But surely _NOT_ before I've got any proof that the patch 
fixes the issue.

Due diligence and all that.

I've just applied the code to our kernel tree because it
a) looks as if it would fix the issue
and
b) we're still in beta testing, so having the patch in-kernel makes
things easier for testing.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		      zSeries & Storage
hare@suse.de			      +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)


* [PATCH] block: fix q->flush_rq NULL pointer crash on dm-mpath flush
  2014-03-08 21:33             ` Hannes Reinecke
@ 2014-03-08 22:09               ` Mike Snitzer
  2014-03-09  0:24                 ` Jens Axboe
From: Mike Snitzer @ 2014-03-08 22:09 UTC (permalink / raw)
  To: Hannes Reinecke, Jens Axboe
  Cc: Mike Snitzer, Christoph Hellwig, Jeff Moyer, Shaohua Li, linux-kernel

On Sat, Mar 08 2014 at  4:33pm -0500,
Hannes Reinecke <hare@suse.de> wrote:

> On 03/08/2014 07:13 PM, Mike Snitzer wrote:
> >
> >I'm calm.. was just a bit frustrated.  But this isn't a big deal.
> >I'll make an effort to reach out to relevant people sooner when
> >similar stuff is reported against recently upstreamed code.  Would be
> >cool if you did the same.  I can relate to needing to have the distro
> >vendor hat on (first needing to determine/answer "is this issue
> >specific to our hacked distro kernel?", etc).
> >
> The patch I made wasn't in the context of 'recently upstreamed
> code', it was due to a backport Jan Kara did for our next distro
> kernels (3.12-based).

"3.12-based" means nothing given all the backporting for SLES, much like
"3.10-based" means nothing in the context of RHEL7.

The only way this fix is applicable is in the context of "recently
upstreamed code", commit 1874198 ("blk-mq: rework flush sequencing
logic") went upstream for v3.14-rc3.

Jens, please feel free to queue this tested fix for 3.14-rc:

From: Mike Snitzer <snitzer@redhat.com>
Subject: block: fix q->flush_rq NULL pointer crash on dm-mpath flush

Commit 1874198 ("blk-mq: rework flush sequencing logic") switched
->flush_rq from being an embedded member of the request_queue structure
to being dynamically allocated in blk_init_queue_node().

Request-based DM multipath doesn't use blk_init_queue_node(), instead it
uses blk_alloc_queue_node() + blk_init_allocated_queue().  Because
commit 1874198 placed the dynamic allocation of ->flush_rq in
blk_init_queue_node() any flush issued to a dm-mpath device would crash
with a NULL pointer, e.g.:

BUG: unable to handle kernel NULL pointer dereference at           (null)
IP: [<ffffffff8125037e>] blk_rq_init+0x1e/0xb0
PGD bb3c7067 PUD bb01d067 PMD 0
Oops: 0002 [#1] SMP
...
CPU: 5 PID: 5028 Comm: dt Tainted: G        W  O 3.14.0-rc3.snitm+ #10
...
task: ffff88032fb270e0 ti: ffff880079564000 task.ti: ffff880079564000
RIP: 0010:[<ffffffff8125037e>]  [<ffffffff8125037e>] blk_rq_init+0x1e/0xb0
RSP: 0018:ffff880079565c98  EFLAGS: 00010046
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000030
RDX: ffff880260c74048 RSI: 0000000000000000 RDI: 0000000000000000
RBP: ffff880079565ca8 R08: ffff880260aa1e98 R09: 0000000000000001
R10: ffff88032fa78500 R11: 0000000000000246 R12: 0000000000000000
R13: ffff880260aa1de8 R14: 0000000000000650 R15: 0000000000000000
FS:  00007f8d36a2a700(0000) GS:ffff88033fca0000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 0000000079b36000 CR4: 00000000000007e0
Stack:
 0000000000000000 ffff880260c74048 ffff880079565cd8 ffffffff81257a47
 ffff880260aa1de8 ffff880260c74048 0000000000000001 0000000000000000
 ffff880079565d08 ffffffff81257c2d 0000000000000000 ffff880260aa1de8
Call Trace:
 [<ffffffff81257a47>] blk_flush_complete_seq+0x2d7/0x2e0
 [<ffffffff81257c2d>] blk_insert_flush+0x1dd/0x210
 [<ffffffff8124ec59>] __elv_add_request+0x1f9/0x320
 [<ffffffff81250681>] ? blk_account_io_start+0x111/0x190
 [<ffffffff81253a4b>] blk_queue_bio+0x25b/0x330
 [<ffffffffa0020bf5>] dm_request+0x35/0x40 [dm_mod]
 [<ffffffff812530c0>] generic_make_request+0xc0/0x100
 [<ffffffff81253173>] submit_bio+0x73/0x140
 [<ffffffff811becdd>] submit_bio_wait+0x5d/0x80
 [<ffffffff81257528>] blkdev_issue_flush+0x78/0xa0
 [<ffffffff811c1f6f>] blkdev_fsync+0x3f/0x60
 [<ffffffff811b7fde>] vfs_fsync_range+0x1e/0x20
 [<ffffffff811b7ffc>] vfs_fsync+0x1c/0x20
 [<ffffffff811b81f1>] do_fsync+0x41/0x80
 [<ffffffff8118874e>] ? SyS_lseek+0x7e/0x80
 [<ffffffff811b8260>] SyS_fsync+0x10/0x20
 [<ffffffff8154c2d2>] system_call_fastpath+0x16/0x1b

Fix this by moving the ->flush_rq allocation from blk_init_queue_node()
to blk_init_allocated_queue().  blk_init_queue_node() also calls
blk_init_allocated_queue(), so this change is functionally equivalent
for all blk_init_queue_node() callers.

Reported-by: Hannes Reinecke <hare@suse.de>
Reported-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 block/blk-core.c |   17 ++++++-----------
 1 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 853f927..4cd5ffc 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -693,20 +693,11 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
 	if (!uninit_q)
 		return NULL;
 
-	uninit_q->flush_rq = kzalloc(sizeof(struct request), GFP_KERNEL);
-	if (!uninit_q->flush_rq)
-		goto out_cleanup_queue;
-
 	q = blk_init_allocated_queue(uninit_q, rfn, lock);
 	if (!q)
-		goto out_free_flush_rq;
-	return q;
+		blk_cleanup_queue(uninit_q);
 
-out_free_flush_rq:
-	kfree(uninit_q->flush_rq);
-out_cleanup_queue:
-	blk_cleanup_queue(uninit_q);
-	return NULL;
+	return q;
 }
 EXPORT_SYMBOL(blk_init_queue_node);
 
@@ -717,6 +708,10 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
 	if (!q)
 		return NULL;
 
+	q->flush_rq = kzalloc(sizeof(struct request), GFP_KERNEL);
+	if (!q->flush_rq)
+		return NULL;
+
 	if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
 		return NULL;
 


* Re: [PATCH] block: fix q->flush_rq NULL pointer crash on dm-mpath flush
  2014-03-08 22:09               ` [PATCH] block: fix q->flush_rq NULL pointer crash on dm-mpath flush Mike Snitzer
@ 2014-03-09  0:24                 ` Jens Axboe
  2014-03-09  0:57                   ` Mike Snitzer
From: Jens Axboe @ 2014-03-09  0:24 UTC (permalink / raw)
  To: Mike Snitzer, Hannes Reinecke
  Cc: Mike Snitzer, Christoph Hellwig, Jeff Moyer, Shaohua Li, linux-kernel

On 2014-03-08 15:09, Mike Snitzer wrote:
> On Sat, Mar 08 2014 at  4:33pm -0500,
> Hannes Reinecke <hare@suse.de> wrote:
>
>> On 03/08/2014 07:13 PM, Mike Snitzer wrote:
>>>
>>> I'm calm.. was just a bit frustrated.  But this isn't a big deal.
>>> I'll make an effort to reach out to relevant people sooner when
>>> similar stuff is reported against recently upstreamed code.  Would be
>>> cool if you did the same.  I can relate to needing to have the distro
>>> vendor hat on (first needing to determine/answer "is this issue
>>> specific to our hacked distro kernel?", etc).
>>>
>> The patch I made wasn't in the context of 'recently upstreamed
>> code', it was due to a backport Jan Kara did for our next distro
>> kernels (3.12-based).
>
> "3.12-based" means nothing given all the backporting for SLES, much like
> "3.10-based" means nothing in the context of RHEL7.
>
> The only way this fix is applicable is in the context of "recently
> upstreamed code", commit 1874198 ("blk-mq: rework flush sequencing
> logic") went upstream for v3.14-rc3.
>
> Jens, please feel free to queue this tested fix for 3.14-rc:

Thanks Mike, queued up. Also queued up the list addition reversal change.

-- 
Jens Axboe



* Re: block: fix q->flush_rq NULL pointer crash on dm-mpath flush
  2014-03-09  0:24                 ` Jens Axboe
@ 2014-03-09  0:57                   ` Mike Snitzer
  2014-03-09  3:18                     ` Jens Axboe
From: Mike Snitzer @ 2014-03-09  0:57 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Hannes Reinecke, Mike Snitzer, Christoph Hellwig, Jeff Moyer,
	Shaohua Li, linux-kernel

On Sat, Mar 08 2014 at  7:24pm -0500,
Jens Axboe <axboe@kernel.dk> wrote:

> On 2014-03-08 15:09, Mike Snitzer wrote:
> >On Sat, Mar 08 2014 at  4:33pm -0500,
> >Hannes Reinecke <hare@suse.de> wrote:
> >
> >>On 03/08/2014 07:13 PM, Mike Snitzer wrote:
> >>>
> >>>I'm calm.. was just a bit frustrated.  But this isn't a big deal.
> >>>I'll make an effort to reach out to relevant people sooner when
> >>>similar stuff is reported against recently upstreamed code.  Would be
> >>>cool if you did the same.  I can relate to needing to have the distro
> >>>vendor hat on (first needing to determine/answer "is this issue
> >>>specific to our hacked distro kernel?", etc).
> >>>
> >>The patch I made wasn't in the context of 'recently upstreamed
> >>code', it was due to a backport Jan Kara did for our next distro
> >>kernels (3.12-based).
> >
> >"3.12-based" means nothing given all the backporting for SLES, much like
> >"3.10-based" means nothing in the context of RHEL7.
> >
> >The only way this fix is applicable is in the context of "recently
> >upstreamed code", commit 1874198 ("blk-mq: rework flush sequencing
> >logic") went upstream for v3.14-rc3.
> >
> >Jens, please feel free to queue this tested fix for 3.14-rc:
> 
> Thanks Mike, queued up.

Thanks.

> Also queued up the list addition reversal change.

I had a look at what you queued; the thing is, commit 1874198 replaced code
in blk_kick_flush() that did use list_add_tail().  So getting back to
the way the original code was (before 1874198) would need something
like the following patch.

But it isn't clear to me why we'd have the duality of front vs tail
additions for flushes.  Maybe Christoph knows?

diff --git a/block/blk-flush.c b/block/blk-flush.c
index f598f79..43e6b47 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -140,14 +140,17 @@ static void mq_flush_run(struct work_struct *work)
 	blk_mq_insert_request(rq, false, true, false);
 }
 
-static bool blk_flush_queue_rq(struct request *rq)
+static bool blk_flush_queue_rq(struct request *rq, bool add_front)
 {
 	if (rq->q->mq_ops) {
 		INIT_WORK(&rq->mq_flush_work, mq_flush_run);
 		kblockd_schedule_work(rq->q, &rq->mq_flush_work);
 		return false;
 	} else {
-		list_add_tail(&rq->queuelist, &rq->q->queue_head);
+		if (add_front)
+			list_add(&rq->queuelist, &rq->q->queue_head);
+		else
+			list_add_tail(&rq->queuelist, &rq->q->queue_head);
 		return true;
 	}
 }
@@ -193,7 +196,7 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
 
 	case REQ_FSEQ_DATA:
 		list_move_tail(&rq->flush.list, &q->flush_data_in_flight);
-		queued = blk_flush_queue_rq(rq);
+		queued = blk_flush_queue_rq(rq, true);
 		break;
 
 	case REQ_FSEQ_DONE:
@@ -326,7 +329,7 @@ static bool blk_kick_flush(struct request_queue *q)
 	q->flush_rq->rq_disk = first_rq->rq_disk;
 	q->flush_rq->end_io = flush_end_io;
 
-	return blk_flush_queue_rq(q->flush_rq);
+	return blk_flush_queue_rq(q->flush_rq, false);
 }
 
 static void flush_data_end_io(struct request *rq, int error)


* Re: block: fix q->flush_rq NULL pointer crash on dm-mpath flush
  2014-03-09  0:57                   ` Mike Snitzer
@ 2014-03-09  3:18                     ` Jens Axboe
  2014-03-09  3:29                       ` Mike Snitzer
From: Jens Axboe @ 2014-03-09  3:18 UTC (permalink / raw)
  To: Mike Snitzer
  Cc: Hannes Reinecke, Mike Snitzer, Christoph Hellwig, Jeff Moyer,
	Shaohua Li, linux-kernel

On 2014-03-08 17:57, Mike Snitzer wrote:
> On Sat, Mar 08 2014 at  7:24pm -0500,
> Jens Axboe <axboe@kernel.dk> wrote:
>
>> On 2014-03-08 15:09, Mike Snitzer wrote:
>>> On Sat, Mar 08 2014 at  4:33pm -0500,
>>> Hannes Reinecke <hare@suse.de> wrote:
>>>
>>>> On 03/08/2014 07:13 PM, Mike Snitzer wrote:
>>>>>
>>>>> I'm calm.. was just a bit frustrated.  But this isn't a big deal.
>>>>> I'll make an effort to reach out to relevant people sooner when
>>>>> similar stuff is reported against recently upstreamed code.  Would be
>>>>> cool if you did the same.  I can relate to needing to have the distro
>>>>> vendor hat on (first needing to determine/answer "is this issue
>>>>> specific to our hacked distro kernel?", etc).
>>>>>
>>>> The patch I made wasn't in the context of 'recently upstreamed
>>>> code', it was due to a backport Jan Kara did for our next distro
>>>> kernels (3.12-based).
>>>
>>> "3.12-based" means nothing given all the backporting for SLES, much like
>>> "3.10-based" means nothing in the context of RHEL7.
>>>
>>> The only way this fix is applicable is in the context of "recently
>>> upstreamed code", commit 1874198 ("blk-mq: rework flush sequencing
>>> logic") went upstream for v3.14-rc3.
>>>
>>> Jens, please feel free to queue this tested fix for 3.14-rc:
>>
>> Thanks Mike, queued up.
>
> Thanks.
>
>> Also queued up the list addition reversal change.
>
> I had a look at what you queued; the thing is, commit 1874198 replaced code
> in blk_kick_flush() that did use list_add_tail().  So getting back to
> the way the original code was (before 1874198) would need something
> like the following patch.
>
> But it isn't clear to me why we'd have the duality of front vs tail
> additions for flushes.  Maybe Christoph knows?

Not sure it'd even make a difference with the use case, but always adding
at the tail would be broken. But the flushing in general is a bit of a
nightmare, so I'd be inclined to add your full fix too, at least this late
in -rc.

-- 
Jens Axboe



* Re: block: fix q->flush_rq NULL pointer crash on dm-mpath flush
  2014-03-09  3:18                     ` Jens Axboe
@ 2014-03-09  3:29                       ` Mike Snitzer
From: Mike Snitzer @ 2014-03-09  3:29 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Hannes Reinecke, Mike Snitzer, Christoph Hellwig, Jeff Moyer,
	Shaohua Li, linux-kernel

On Sat, Mar 08 2014 at 10:18pm -0500,
Jens Axboe <axboe@kernel.dk> wrote:

> On 2014-03-08 17:57, Mike Snitzer wrote:
> >
> >I had a look at what you queued, thing is commit 1874198 replaced code
> >in blk_kick_flush() that did use list_add_tail().  So getting back to
> >the way the original code  was (before 1874198) would need something
> >like the following patch.
> >
> >But it isn't clear to me why we'd have the duality of front vs tail
> >additions for flushes.  Maybe Christoph knows?
> 
> Not sure it'd even make a difference with the use case, but always
> adding at the tail would be broken. But the flushing in general is a
> bit of a nightmare, so I'd be inclined to add your full fix too, at
> least this late in -rc.

OK, please feel free to add my Signed-off-by.


* Re: [PATCH 1/1] block: rework flush sequencing for blk-mq
  2014-03-08 19:51         ` Hannes Reinecke
  2014-03-08 18:13           ` Mike Snitzer
@ 2014-03-12 10:28           ` Christoph Hellwig
  2014-03-12 10:50             ` Hannes Reinecke
  2014-03-13 16:13             ` [PATCH 1/1] block: rework flush sequencing for blk-mq Mike Snitzer
From: Christoph Hellwig @ 2014-03-12 10:28 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Mike Snitzer, Jeff Moyer, Jens Axboe, Shaohua Li, linux-kernel, msnitzer

On Sat, Mar 08, 2014 at 08:51:18PM +0100, Hannes Reinecke wrote:
> Hey, calm down.
> I made the fix just two days ago, and was quite surprised that
> I've been the first to hit it; it should've crashed for everybody
> using dm-multipath.
> And given the pushback I've gotten recently on patches I would
> have thought that it would work for most users; surely the author
> would've done due diligence on the original patchset ...

There's very little testing of dm-multipath for upstream work, as I've
seen tons of avoidable breakage.  Doesn't help that it uses a special
code path that no one else uses.

> BTW, it's not _my_ decision to sit on tons of SUSE specific patches.
> I really try to get things upstream. But I cannot do more than
> send patches upstream, patiently answer any questions, and redo
> the patchset.
> Which I did, frequently.  But, alas, it's up to the maintainer to
> apply them. And I can only ask and hope. The usual story...

Let's make this a little less personal.  Fact is that the SuSE trees
have tons of patches in there that never have even been sent upstream.
There's also tons that have been posted once or twice.  While I feel
your frustration with the SCSI process fully and we'll need to work on
that somehow, how about you do another round of dumping the DM patches
on the dm-devel list and Mike?

I'll ping some of the other worst offenders as time permits.



* Re: [PATCH 1/1] block: rework flush sequencing for blk-mq
  2014-03-12 10:28           ` [PATCH 1/1] block: rework flush sequencing for blk-mq Christoph Hellwig
@ 2014-03-12 10:50             ` Hannes Reinecke
  2014-03-12 10:55               ` Christoph Hellwig
  2014-03-12 11:00                 ` Christoph Hellwig
  2014-03-13 16:13             ` [PATCH 1/1] block: rework flush sequencing for blk-mq Mike Snitzer
From: Hannes Reinecke @ 2014-03-12 10:50 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Mike Snitzer, Jeff Moyer, Jens Axboe, Shaohua Li, linux-kernel, msnitzer

On 03/12/2014 11:28 AM, Christoph Hellwig wrote:
> On Sat, Mar 08, 2014 at 08:51:18PM +0100, Hannes Reinecke wrote:
>> Hey, calm down.
>> I made the fix just two days ago, and was quite surprised that
>> I've been the first to hit it; it should've crashed for everybody
>> using dm-multipath.
>> And given the pushback I've gotten recently on patches I would
>> have thought that it would work for most users; surely the author
>> would've done due diligence on the original patchset ...
> 
> There's very little testing of dm-multipath for upstream work, as I've
> seen tons of avoidable breakage.  Doesn't help that it uses a special
> code path that no one else uses.
> 
>> BTW, it's not _my_ decision to sit on tons of SUSE specific patches.
>> I really try to get things upstream. But I cannot do more than
>> send patches upstream, patiently answer any questions, and redo
>> the patchset.
>> Which I did, frequently.  But, alas, it's up to the maintainer to
>> apply them. And I can only ask and hope. The usual story...
> 
> Let's make this a little less personal.  Fact is that the SuSE trees
> have tons of patches in there that never have even been sent upstream.
> There's also tons that have been posted once or twice.  While I feel
> your frustration with the SCSI process fully and we'll need to work on
> that somehow, how about you do another round of dumping the DM patches
> on the dm-devel list and Mike?
> 
> I'll ping some of the other worst offenders as time permits.
> 

Ok, down to the grubby details:

From the device-mapper side we have these patches which are not
included upstream:

dm-mpath-accept-failed-paths:
  -> has been posted to dm-devel, and Mike Snitzer promised
     to review/check it.

dm-multipath-Improve-logging.patch
-> Already sent as part of the 'noqueue' patchset

dm-mpath-no-activate-for-offlined-paths
-> bugfix to 'dm-mpath-accept-failed-paths'

dm-table-switch-to-readonly:
-> Catch 'EROFS' errors during table creation and set
   the 'read-only' flag on the device-mapper device
   accordingly. Should be sent upstream, correct.

dm-mpath-no-partitions-feature
-> This adds a new feature 'no_partitions' to dm-multipath
   devices, which then cause 'kpartx' to _not_ create
   partitions on that device. That is required for
   virtual images, which you just want to pass to
   the guest as-is.
   Patch has been discussed at dm-devel, but got
   rejected/ignored on the grounds that there should be
   a different way of doing so. I'll happily restart
   discussion here ...

dm-initialize-flush_rq.patch:
-> Has been discussed already, will be refreshed with
   the patch from Mike.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		      zSeries & Storage
hare@suse.de			      +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 1/1] block: rework flush sequencing for blk-mq
  2014-03-12 10:50             ` Hannes Reinecke
@ 2014-03-12 10:55               ` Christoph Hellwig
  2014-03-12 11:07                 ` Hannes Reinecke
  2014-03-12 11:00                 ` Christoph Hellwig
  1 sibling, 1 reply; 40+ messages in thread
From: Christoph Hellwig @ 2014-03-12 10:55 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Christoph Hellwig, Mike Snitzer, Jeff Moyer, Jens Axboe,
	Shaohua Li, linux-kernel, msnitzer

On Wed, Mar 12, 2014 at 11:50:13AM +0100, Hannes Reinecke wrote:
> From the device-mapper side we have these patches which are not
> included upstream:

Another one on the dm side that seems useful is:

dm-emulate-blkrrpart-ioctl:
  Partitions on device-mapper devices are managed by kpartx (if at
  all). So if we were just to send out a 'change' event if someone
  called BLKRRPART on these devices, kpartx would be triggered via udev
  and could manage the partitions accordingly.
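
The core of the idea is small; a minimal sketch, with a made-up
function name and dm's ioctl dispatch glossed over:

	/* On BLKRRPART, emit a change uevent so udev triggers kpartx,
	 * instead of rescanning the partition table in-kernel.
	 * (dm_blk_rrpart is hypothetical; only the uevent call matters) */
	static int dm_blk_rrpart(struct mapped_device *md)
	{
		kobject_uevent(&disk_to_dev(dm_disk(md))->kobj, KOBJ_CHANGE);
		return 0;
	}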


^ permalink raw reply	[flat|nested] 40+ messages in thread

* SuSE O_DIRECT|O_NONBLOCK overload
@ 2014-03-12 11:00                 ` Christoph Hellwig
  0 siblings, 0 replies; 40+ messages in thread
From: Christoph Hellwig @ 2014-03-12 11:00 UTC (permalink / raw)
  To: NeilBrown
  Cc: Jens Axboe, Alexander Viro, Linus Torvalds, linux-kernel, linux-man

The SLES12 tree has various patches to implement special
O_DIRECT|O_NONBLOCK semantics for block devices:

	https://gitorious.org/opensuse/kernel-source/source/806eab3e4b02e798c1ae942440051f81c822ca35:patches.suse/block-nonblock-causes-failfast

This seems genuinely useful and I'd be really happy if people would do
this work upstream for two reasons:

 a) implementing different semantics only in a vendor kernel is a
    nightmare.  No proper way to document it in the man pages for
    example, and silent breakage of applications that expect it to be
    present, or, even more nasty, not present.
 b) Which brings us to: we had various issues with adding O_NONBLOCK to
    files that didn't support it before.  How well was this whole feature
    tested?
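
For concreteness, an application would opt in roughly like this (a
sketch only; the flags themselves are standard, O_DIRECT needs
_GNU_SOURCE):

	/* ask for the vendor fail-fast semantics described above;
	 * a plain O_NONBLOCK open of a block device stays a no-op */
	int fd = open("/dev/sdX", O_RDONLY | O_DIRECT | O_NONBLOCK);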


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 1/1] block: rework flush sequencing for blk-mq
  2014-03-12 10:55               ` Christoph Hellwig
@ 2014-03-12 11:07                 ` Hannes Reinecke
  0 siblings, 0 replies; 40+ messages in thread
From: Hannes Reinecke @ 2014-03-12 11:07 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Mike Snitzer, Jeff Moyer, Jens Axboe, Shaohua Li, linux-kernel, msnitzer

On 03/12/2014 11:55 AM, Christoph Hellwig wrote:
> On Wed, Mar 12, 2014 at 11:50:13AM +0100, Hannes Reinecke wrote:
>> From the device-mapper side we have these patches which are not
>> included upstream:
> 
> Another one on the dm side that seems useful is:
> 
> dm-emulate-blkrrpart-ioctl:
>   Partitions on device-mapper devices are managed by kpartx (if at
>   all). So if we were just to send out a 'change' event if someone
>   called BLKRRPART on these devices, kpartx would be triggered via udev
>   and could manage the partitions accordingly.
> 
Which I've omitted for a reason; it looks like these kinds of things
are handled in userspace nowadays.
It will also cause 'parted' to emit 'change' events when you just
display a device. Which is _not_ what we want.
So this patch will be pulled, even from SUSE.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		      zSeries & Storage
hare@suse.de			      +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: SuSE O_DIRECT|O_NONBLOCK overload
@ 2014-03-13  0:15                   ` NeilBrown
  0 siblings, 0 replies; 40+ messages in thread
From: NeilBrown @ 2014-03-13  0:15 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Alexander Viro, Linus Torvalds, linux-kernel, linux-man

On Wed, 12 Mar 2014 04:00:15 -0700 Christoph Hellwig <hch@infradead.org>
wrote:

> The SLES12 tree has various patches to implement special
> O_DIRECT|O_NONBLOCK semantics for block devices:
> 
> 	https://gitorious.org/opensuse/kernel-source/source/806eab3e4b02e798c1ae942440051f81c822ca35:patches.suse/block-nonblock-causes-failfast
> 
> This seems genuinely useful and I'd be really happy if people would do
> this work upstream for two reasons:
> 
>  a) implementing different semantics only in a vendor kernel is a
>     nightmare.  No proper way to document it in the man pages for
>     example, and silent breakage of applications that expect it to be
>     present, or, even more nasty, not present.
>  b) Which brings us to: we had various issues with adding O_NONBLOCK to
>     files that didn't support it before.  How well was this whole feature
>     tested?


This "feature" was really just a hack because a particular customer needed
something in a particular situation.

At the core of this in my thinking is the 'failfast' BIO flag ... or 'flags'
really because there are now three of them.

They don't seem to be documented or uniformly supported or used much at
all.  dm-multipath uses one, and btrfs uses another.  There could be value in
using one or more of them in md, but as they aren't documented and could
mean almost anything I have stayed away.
I tried adding some sort of 'failfast' support to md once and I would get
occasional failures from regular sata devices which otherwise appeared to be
working perfectly well.  So it seemed that "fast" was altogether *too* fast.

For a particular customer with some particular hardware there were issues
where that hardware could choose not to respond for extended periods.  So we
modified the driver to accept a 'timeout' module parameter and to cause
REQ_FAILFAST_DEV (I think) requests to fail with -ETIMEDOUT if they could not
be serviced in that time.

We then modified md to cope with that particular well-defined semantic. And
hacked "O_NONBLOCK" support in so that mdadm could access the device without
the risk of hanging indefinitely.

I would be happy to bring at least some of this functionality into mainline,
but I would need a "FAILFAST" flag that actually meant something useful and
was sufficiently well documented so that if some driver got it wrong, I would
be justified in blaming the driver for not meeting the expectations that I
encoded into md.

I think that the FAILFAST flag that I need would do some error recovery but
would be time limited.  Maybe a software TLER (Time Limited Error Recovery).

I also think there should probably be just one FAILFAST flag.  Where it was
the DEV or the TRANSPORT or the DRIVER that failed could be returned in the
error code for any caller that cared.  But as I don't know why the one became
three I could well be missing something important.
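
For reference, this is about all a submitter can do today; a sketch
using the current flag names (picking among the three is exactly the
unclear part):

	/* ask for fast failure at the device, transport and driver
	 * level; one well-defined flag would replace all three */
	bio->bi_rw |= REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT |
		      REQ_FAILFAST_DRIVER;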


As for testing, only basic "does it function as expected" testing.
Part of the reason for only modifying O_NONBLOCK behaviour where O_DIRECT was
also set was to make it extremely unlikely that any code would use this
feature except code that specifically needed it.

NeilBrown

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 1/1] block: rework flush sequencing for blk-mq
  2014-03-12 10:28           ` [PATCH 1/1] block: rework flush sequencing for blk-mq Christoph Hellwig
  2014-03-12 10:50             ` Hannes Reinecke
@ 2014-03-13 16:13             ` Mike Snitzer
  2014-03-14  9:25               ` Christoph Hellwig
  1 sibling, 1 reply; 40+ messages in thread
From: Mike Snitzer @ 2014-03-13 16:13 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Hannes Reinecke, Jeff Moyer, Jens Axboe, Shaohua Li, linux-kernel

On Wed, Mar 12 2014 at  6:28am -0400,
Christoph Hellwig <hch@infradead.org> wrote:

> On Sat, Mar 08, 2014 at 08:51:18PM +0100, Hannes Reinecke wrote:
> > Hey, calm down.
> > I made the fix just two days ago. And was quite surprised that
> > I was the first one hitting that; should've crashed for everybody
> > using dm-multipath.
> > And given the pushback I've gotten recently on patches I would
> > have thought that it would work for most users; sure the author
> > would've done due diligence on the original patchset ...
> 
> There's very little testing of dm-multipath for upstream work, as I've
> seen tons of avoidable breakage.  Doesn't help that it uses a special
> code path that no one else uses.

Pretty ironic that in the same email that you ask someone to "Let's make
this a little less personal." you start by asserting upstream
dm-multipath sees very little testing -- and use your commit that
recently broke dm-multipath as the basis.  Anyway, please expand on what
you feel is broken with upstream dm-multipath.

And please be specific about whether it is SCSI/block or dm-multipath
code that has regressed.

> > BTW, it's not _my_ decision to sit on tons of SUSE-specific patches.
> > I really try to get things upstream. But I cannot do more than
> > sending patches upstream, answer patiently any questions, and redo
> > the patchset.
> > Which I did. Frequently. But, alas, it's up to the maintainer to
> > apply them. And I can only ask and hope. The usual story...
> 
> Let's make this a little less personal.  Fact is that the SuSE trees
> have tons of patches in there that never have even been sent upstream.
> There's also tons that have been posted once or twice.  While I feel
> your frustration with the SCSI process fully and we'll need to work on
> that somehow, how about you do another round of dumping the DM patches
> on the dm-devel list and Mike?
> 
> I'll ping some of the other worst offenders as time permits.
> 

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 1/1] block: rework flush sequencing for blk-mq
  2014-03-13 16:13             ` [PATCH 1/1] block: rework flush sequencing for blk-mq Mike Snitzer
@ 2014-03-14  9:25               ` Christoph Hellwig
  2014-03-14  9:30                 ` Hannes Reinecke
                                   ` (2 more replies)
  0 siblings, 3 replies; 40+ messages in thread
From: Christoph Hellwig @ 2014-03-14  9:25 UTC (permalink / raw)
  To: Mike Snitzer
  Cc: Christoph Hellwig, Hannes Reinecke, Jeff Moyer, Jens Axboe,
	Shaohua Li, linux-kernel

On Thu, Mar 13, 2014 at 12:13:47PM -0400, Mike Snitzer wrote:
> Pretty ironic that in the same email that you ask someone to "Let's make
> this a little less personal." you start by asserting upstream
> dm-multipath sees very little testing -- and use your commit that
> > recently broke dm-multipath as the basis.  Anyway, please expand on what
> you feel is broken with upstream dm-multipath.

Getting a little upset, eh?  I didn't say it's broken, I said it gets
very little testing.  The regression from me was found like so many
before only after it was backported to some enterprise kernel.

I think the problem here is two-fold:
 a) the hardware you use with dm-multipath isn't widely available.
 b) it uses a very special code path in the block layer no one else uses

a) might be fixable by having some RDAC or similar emulation in qemu if
someone wants to spend the effort.
b) is a bit harder, but we should think hard about it when rewriting the
multipath code to support blk-mq.  Talking about which, I think trying to
use dm-multipath on any blk-mq device will go horribly crash and boom at
the moment.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 1/1] block: rework flush sequencing for blk-mq
  2014-03-14  9:25               ` Christoph Hellwig
@ 2014-03-14  9:30                 ` Hannes Reinecke
  2014-03-14 12:44                   ` Christoph Hellwig
  2014-03-14  9:34                 ` Christoph Hellwig
  2014-03-14 13:00                 ` Mike Snitzer
  2 siblings, 1 reply; 40+ messages in thread
From: Hannes Reinecke @ 2014-03-14  9:30 UTC (permalink / raw)
  To: Christoph Hellwig, Mike Snitzer
  Cc: Jeff Moyer, Jens Axboe, Shaohua Li, linux-kernel

On 03/14/2014 10:25 AM, Christoph Hellwig wrote:
> On Thu, Mar 13, 2014 at 12:13:47PM -0400, Mike Snitzer wrote:
>> Pretty ironic that in the same email that you ask someone to "Let's make
>> this a little less personal." you start by asserting upstream
>> dm-multipath sees very little testing -- and use your commit that
>> recently broke dm-multipath as the basis.  Anyway, please expand on what
>> you feel is broken with upstream dm-multipath.
> 
> Getting a little upset, eh?  I didn't say it's broken, I said it gets
> very little testing.  The regression from me was found like so many
> before only after it was backported to some enterprise kernel.
> 
> I think the problem here is two-fold:
>  a) the hardware you use with dm-multipath isn't widely available.
>  b) it uses a very special code path in the block layer no one else uses
> 
> a) might be fixable by having some RDAC or similar emulation in qemu if
> someone wants to spend the effort.
> b) is a bit harder, but we should think hard about it when rewriting the
> multipath code to support blk-mq.  Talking about which, I think trying to
> use dm-multipath on any blk-mq device will go horribly crash and boom at
> the moment.
> 
That was actually one of my plans: moving dm-multipath over to use
blk-mq. But then I'd need to discuss with Jens et al how to best
achieve this; the current static hctx allocation doesn't play well
with multipath's dynamic path management.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		      zSeries & Storage
hare@suse.de			      +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 1/1] block: rework flush sequencing for blk-mq
  2014-03-14  9:25               ` Christoph Hellwig
  2014-03-14  9:30                 ` Hannes Reinecke
@ 2014-03-14  9:34                 ` Christoph Hellwig
  2014-03-14  9:52                   ` Hannes Reinecke
  2014-03-14 13:00                 ` Mike Snitzer
  2 siblings, 1 reply; 40+ messages in thread
From: Christoph Hellwig @ 2014-03-14  9:34 UTC (permalink / raw)
  To: Mike Snitzer
  Cc: Hannes Reinecke, Jeff Moyer, Jens Axboe, Shaohua Li, linux-kernel

On Fri, Mar 14, 2014 at 02:25:19AM -0700, Christoph Hellwig wrote:
> b) is a bit harder, but we should think hard about it when rewriting the
> multipath code to support blk-mq.  Talking about which, I think trying to
> use dm-multipath on any blk-mq device will go horribly crash and boom at
> the moment.

Talking about crashing and burning.. Hannes, did you run this patch
past dm-devel and linux-scsi yet?  Don't quite like it but the problem
seems real..


From: Hannes Reinecke <hare@suse.de>
Subject: Kernel bug triggered in multipath
References: bnc#486001
Patch-mainline: not yet

Starting multipath on a cciss device will cause a kernel
warning to be triggered. Problem is that we're using the
->queuedata field of the request_queue to dereference the
scsi device; however, for other (non-SCSI) devices this
points to a totally different structure.
So we should rather be using accessors here which make
sure we're only returning valid SCSI device structures.

Signed-off-by: Hannes Reinecke <hare@suse.de>

---
 drivers/scsi/device_handler/scsi_dh.c |   10 +++++-----
 drivers/scsi/scsi_lib.c               |   11 +++++++++++
 include/scsi/scsi_device.h            |    1 +
 3 files changed, 17 insertions(+), 5 deletions(-)

--- a/drivers/scsi/device_handler/scsi_dh.c
+++ b/drivers/scsi/device_handler/scsi_dh.c
@@ -397,7 +397,7 @@ int scsi_dh_activate(struct request_queu
 	struct device *dev = NULL;
 
 	spin_lock_irqsave(q->queue_lock, flags);
-	sdev = q->queuedata;
+	sdev = scsi_device_from_queue(q);
 	if (!sdev) {
		spin_unlock_irqrestore(q->queue_lock, flags);
		err = SCSI_DH_NOSYS;
@@ -484,7 +484,7 @@ int scsi_dh_attach(struct request_queue
 		return -EINVAL;
 
 	spin_lock_irqsave(q->queue_lock, flags);
-	sdev = q->queuedata;
+	sdev = scsi_device_from_queue(q);
 	if (!sdev || !get_device(&sdev->sdev_gendev))
 		err = -ENODEV;
 	spin_unlock_irqrestore(q->queue_lock, flags);
@@ -512,7 +512,7 @@ void scsi_dh_detach(struct request_queue
 	struct scsi_device_handler *scsi_dh = NULL;
 
 	spin_lock_irqsave(q->queue_lock, flags);
-	sdev = q->queuedata;
+	sdev = scsi_device_from_queue(q);
 	if (!sdev || !get_device(&sdev->sdev_gendev))
 		sdev = NULL;
 	spin_unlock_irqrestore(q->queue_lock, flags);
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1594,6 +1594,17 @@ out:
 	spin_lock_irq(q->queue_lock);
 }
 
+struct scsi_device *scsi_device_from_queue(struct request_queue *q)
+{
+	struct scsi_device *sdev = NULL;
+
+	if (q->request_fn == scsi_request_fn)
+		sdev = q->queuedata;
+
+	return sdev;
+}
+EXPORT_SYMBOL_GPL(scsi_device_from_queue);
+
 u64 scsi_calculate_bounce_limit(struct Scsi_Host *shost)
 {
 	struct device *host_dev;
--- a/include/scsi/scsi_device.h
+++ b/include/scsi/scsi_device.h
@@ -300,6 +300,7 @@ extern void starget_for_each_device(stru
 extern void __starget_for_each_device(struct scsi_target *, void *,
 				      void (*fn)(struct scsi_device *,
 						 void *));
+extern struct scsi_device *scsi_device_from_queue(struct request_queue *);
 
 /* only exposed to implement shost_for_each_device */
 extern struct scsi_device *__scsi_iterate_devices(struct Scsi_Host *,

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 1/1] block: rework flush sequencing for blk-mq
  2014-03-14  9:34                 ` Christoph Hellwig
@ 2014-03-14  9:52                   ` Hannes Reinecke
  2014-03-14 10:58                     ` Christoph Hellwig
  0 siblings, 1 reply; 40+ messages in thread
From: Hannes Reinecke @ 2014-03-14  9:52 UTC (permalink / raw)
  To: Christoph Hellwig, Mike Snitzer
  Cc: Jeff Moyer, Jens Axboe, Shaohua Li, linux-kernel

On 03/14/2014 10:34 AM, Christoph Hellwig wrote:
> On Fri, Mar 14, 2014 at 02:25:19AM -0700, Christoph Hellwig wrote:
>> b) is a bit harder, but we should think hard about it when rewriting the
>> multipath code to support blk-mq.  Talking about which, I think trying to
>> use dm-multipath on any blk-mq device will go horribly crash and boom at
>> the moment.
> 
> Talking about crashing and burning.. Hannes, did you run this patch
> past dm-devel and linux-scsi yet?  Don't quite like it but the problem
> seems real..
> 
No, I haven't. This issue is only exhibited if you try to run
multipath on a non-SCSI device (in this case it was cciss).
But then that project got abandoned, and there never was a machine
with a multipathed cciss controller.

Same issue with DASD; you _could_ potentially run multipath on DASD,
but all recent mainframes have a feature called 'hyperpav', which
essentially implements multipath support within the DASD driver. So
running multipath here won't buy you anything.
(Plus the DASD driver will only _ever_ return an I/O error
after it has had a response from the storage array.
Making it truly pointless to run multipathing ...)
The only valid use case would be xDR. But then RH apparently has
support for xDR even without that patch (otherwise they would have
asked for it, being the good upstream citizen as they claim to be,
right?) so it looks as if it's not needed there, either.

So this patch hasn't had any application in the real world and
I haven't pursued it upstream.

But if Mike feels it'll be a good idea nevertheless I can easily
send an updated version.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		      zSeries & Storage
hare@suse.de			      +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 1/1] block: rework flush sequencing for blk-mq
  2014-03-14  9:52                   ` Hannes Reinecke
@ 2014-03-14 10:58                     ` Christoph Hellwig
  2014-03-14 11:10                       ` Hannes Reinecke
  0 siblings, 1 reply; 40+ messages in thread
From: Christoph Hellwig @ 2014-03-14 10:58 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Christoph Hellwig, Mike Snitzer, Jeff Moyer, Jens Axboe,
	Shaohua Li, linux-kernel

On Fri, Mar 14, 2014 at 10:52:55AM +0100, Hannes Reinecke wrote:
> No, I haven't. This issue is only exhibited if you try to run
> multipath on a non-SCSI device (in this case it was cciss).
> But then that project got abandoned, and there never was a machine
> with a multipathed cciss controller.

We still shouldn't be able to easily crash the kernel because someone
configured the wrong device.  And with scsi-mq that wrong-device case
might become a lot more common..


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 1/1] block: rework flush sequencing for blk-mq
  2014-03-14 10:58                     ` Christoph Hellwig
@ 2014-03-14 11:10                       ` Hannes Reinecke
  0 siblings, 0 replies; 40+ messages in thread
From: Hannes Reinecke @ 2014-03-14 11:10 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Mike Snitzer, Jeff Moyer, Jens Axboe, Shaohua Li, linux-kernel

On 03/14/2014 11:58 AM, Christoph Hellwig wrote:
> On Fri, Mar 14, 2014 at 10:52:55AM +0100, Hannes Reinecke wrote:
>> No, I haven't. This issue is only exhibited if you try to run
>> multipath on a non-SCSI device (in this case it was cciss).
>> But then that project got abandoned, and there never was a machine
>> with a multipathed cciss controller.
> 
> We still shouldn't be able to easily crash the kernel because someone
> configured the wrong device.  And with scsi-mq that wrong-device case
> might become a lot more common..
> 
Ok, will be resending.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		      zSeries & Storage
hare@suse.de			      +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 1/1] block: rework flush sequencing for blk-mq
  2014-03-14  9:30                 ` Hannes Reinecke
@ 2014-03-14 12:44                   ` Christoph Hellwig
  0 siblings, 0 replies; 40+ messages in thread
From: Christoph Hellwig @ 2014-03-14 12:44 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Christoph Hellwig, Mike Snitzer, Jeff Moyer, Jens Axboe,
	Shaohua Li, linux-kernel

On Fri, Mar 14, 2014 at 10:30:53AM +0100, Hannes Reinecke wrote:
> That was actually one of my plans: moving dm-multipath over to use
> blk-mq. But then I'd need to discuss with Jens et al how to best
> achieve this; the current static hctx allocation doesn't play well
> with multipath's dynamic path management.

I'd say it's the other way around: the clone + insert hacks in
dm-multipath don't work well with blk-mq.  Not allowing non-owned
requests is fundamentally part of the blk-mq model to allow things
like the integrated tag allocator, queue depth limiting, and the
preallocated driver-specific data.

Instead dm-multipath should always resubmit the request, as it already
does for the slow path, as a first step.  Longer term we might be able
to operate using a cheaper temporary structure, but I'm not sure that's
going to be worth it.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 1/1] block: rework flush sequencing for blk-mq
  2014-03-14  9:25               ` Christoph Hellwig
  2014-03-14  9:30                 ` Hannes Reinecke
  2014-03-14  9:34                 ` Christoph Hellwig
@ 2014-03-14 13:00                 ` Mike Snitzer
  2014-03-14 13:23                   ` Christoph Hellwig
  2 siblings, 1 reply; 40+ messages in thread
From: Mike Snitzer @ 2014-03-14 13:00 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Hannes Reinecke, Jeff Moyer, Jens Axboe, Shaohua Li,
	linux-kernel, dm-devel

On Fri, Mar 14 2014 at  5:25am -0400,
Christoph Hellwig <hch@infradead.org> wrote:

> On Thu, Mar 13, 2014 at 12:13:47PM -0400, Mike Snitzer wrote:
> > Pretty ironic that in the same email that you ask someone to "Let's make
> > this a little less personal." you start by asserting upstream
> > dm-multipath sees very little testing -- and use your commit that
> > recently broke dm-multipath as the basis.  Anyway, please expand on what
> > you feel is broken with upstream dm-multipath.
> 
> Getting a little upset, eh?  I didn't say it's broken, I said it gets
> very little testing.  The regression from me was found like so many
> before only after it was backported to some enterprise kernel.

Even _really_ basic dm-multipath testing would've uncovered this bug.
 
> I think the problem here is two-fold:
>  a) the hardware you use with dm-multipath isn't widely available.
>  b) it uses a very special code path in the block layer no one else uses
> 
> a) might be fixable by having some RDAC or similar emulation in qemu if
> someone wants to spend the effort.

The regression from the commit in question was easily reproduced/tested
using scsi_debug.  Just start the multipathd service and any scsi_debug
device in the system will get multipath'd.

> b) is a bit harder, but we should think hard about it when rewriting the
> multipath code to support blk-mq.  Talking about which, I think trying to
> use dm-multipath on any blk-mq device will go horribly crash and boom at
> the moment.

If/when blk-mq/scsi-mq is used as the primary mechanism for multipathing
it must (initially anyway) be implemented in terms of a dm-multipath
fork (call it "dm-multiqueue"?).  We cannot take 6+ months of breaking
and then fixing dm-multipath.  When dm-multiqueue is more proven we can
look at the best way forward.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 1/1] block: rework flush sequencing for blk-mq
  2014-03-14 13:00                 ` Mike Snitzer
@ 2014-03-14 13:23                   ` Christoph Hellwig
  2014-03-14 14:13                     ` Mike Snitzer
  0 siblings, 1 reply; 40+ messages in thread
From: Christoph Hellwig @ 2014-03-14 13:23 UTC (permalink / raw)
  To: Mike Snitzer
  Cc: Christoph Hellwig, Hannes Reinecke, Jeff Moyer, Jens Axboe,
	Shaohua Li, linux-kernel, dm-devel

On Fri, Mar 14, 2014 at 09:00:21AM -0400, Mike Snitzer wrote:
> > Getting a little upset, eh?  I didn't say it's broken, I said it gets
> > very little testing.  The regression from me was found like so many
> > before only after it was backported to some enterprise kernel.
> 
> Even _really_ basic dm-multipath testing would've uncovered this bug.

But major testing using dm, md and various low-level drivers didn't.  Do
you really expect everyone to remember to specifically test dm-multipath
for a core block change?  Without a nicely documented way to do it?

That being said I should have remembered that it's special, but even I
didn't and I'm sorry about that.

> > I think the problem here is two-fold:
> >  a) the hardware you use with dm-multipath isn't widely available.
> >  b) it uses a very special code path in the block layer no one else uses
> > 
> > a) might be fixable by having some RDAC or similar emulation in qemu if
> > someone wants to spend the effort.
> 
> The regression from the commit in question was easily reproduced/tested
> using scsi_debug.  Just start the multipathd service and any scsi_debug
> device in the system will get multipath'd.

Really?  That sounds like a bug in the INQUIRY information returned by
scsi_debug.  Either way please write up these things in Documentation so
you can point people at it easily.


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 1/1] block: rework flush sequencing for blk-mq
  2014-03-14 13:23                   ` Christoph Hellwig
@ 2014-03-14 14:13                     ` Mike Snitzer
  2014-03-15 13:28                       ` scsi_debug and multipath, was " Christoph Hellwig
  2014-03-17 11:55                       ` [dm-devel] " Bryn M. Reeves
  0 siblings, 2 replies; 40+ messages in thread
From: Mike Snitzer @ 2014-03-14 14:13 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Hannes Reinecke, Jeff Moyer, Jens Axboe, Shaohua Li,
	linux-kernel, dm-devel, Benjamin Marzinski

On Fri, Mar 14 2014 at  9:23am -0400,
Christoph Hellwig <hch@infradead.org> wrote:

> On Fri, Mar 14, 2014 at 09:00:21AM -0400, Mike Snitzer wrote:
> > > Getting a little upset, eh?  I didn't say it's broken, I said it gets
> > > very little testing.  The regression from me was found like so many
> > > before only after it was backported to some enterprise kernel.
> > 
> > Even _really_ basic dm-multipath testing would've uncovered this bug.
> 
> But major testing using dm, md and various low-level drivers didn't.  Do
> you really expect everyone to remember to specifically test dm-multipath
> for a core block change?  Without a nicely documented way to do it?
> 
> That being said I should have remembered that it's special, but even I
> didn't and I'm sorry about that.

I was more reacting to the assertion you made that multipath regresses
all the time.  I'm not faulting you at all for not having tested
multipath.  Hell, I even forget to test multipath more than I should.
/me says with shame

I'll work to add basic coverage for dm-multipath to the
device-mapper-test-suite.

> > > I think the problem here is two-fold:
> > >  a) the hardware you use with dm-multipath isn't widely available.
> > >  b) it uses a very special code path in the block layer no one else uses
> > > 
> > > a) might be fixable by having some RDAC or similar emulation in qemu if
> > > someone wants to spend the effort.
> > 
> > The regression from the commit in question was easily reproduced/tested
> > using scsi_debug.  Just start the multipathd service and any scsi_debug
> > device in the system will get multipath'd.
> 
> Really?  That sounds like a bug in the INQUIRY information returned by
> scsi_debug.  Either way please write up these things in Documentation so
> you can point people at it easily.

Yeah, not sure why single-path scsi_debug "just works" (maybe it is a
"feature" of the older multipathd I have kicking around?), but for basic
data path testing scsi_debug is a quick means to an end.  I can look
closer at _why_ it gets multipath'd in a bit.  But maybe Ben or Hannes
will have quicker insight?

For me, if multipathd is running and I issue the following, a multipath
device gets layered on top of the associated scsi_debug-created sd
device: modprobe scsi_debug dev_size_mb=1024

I think it useful to have scsi_debug work with multipath.  I know people
in Red Hat's QE organization have even simulated multiple paths with
it.. but I don't recall if they had to hack scsi_debug to do that.  I'll
try to find out.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: SuSE O_DIRECT|O_NONBLOCK overload
@ 2014-03-14 17:46                     ` Mike Christie
  0 siblings, 0 replies; 40+ messages in thread
From: Mike Christie @ 2014-03-14 17:46 UTC (permalink / raw)
  To: NeilBrown
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, Linus Torvalds,
	linux-kernel, linux-man

On 03/12/2014 07:15 PM, NeilBrown wrote:
> I also think there should probably be just one FAILFAST flag.  Where it was
> the DEV or the TRANSPORT or the DRIVER that failed could be returned in the
> error code for any caller that cared.  But as I don't know why the one became
> three I could well be missing something important.

It was for multipath. The problem was that dm-multipath does not know
what to do with low-level device errors, but wants transport errors
returned quickly. Other drivers, like the scsi_dh modules (formerly the
dm hardware handlers), want all errors fast-failed.

I can add documentation or we can just change the code to better suit
your needs.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* scsi_debug and multipath, was Re: [PATCH 1/1] block: rework flush sequencing for blk-mq
  2014-03-14 14:13                     ` Mike Snitzer
@ 2014-03-15 13:28                       ` Christoph Hellwig
  2014-03-17 11:55                       ` [dm-devel] " Bryn M. Reeves
  1 sibling, 0 replies; 40+ messages in thread
From: Christoph Hellwig @ 2014-03-15 13:28 UTC (permalink / raw)
  To: Mike Snitzer
  Cc: Hannes Reinecke, linux-kernel, dm-devel, Benjamin Marzinski,
	Doug Gilbert, linux-scsi

On Fri, Mar 14, 2014 at 10:13:01AM -0400, Mike Snitzer wrote:
> I was more reacting to the assertion you made that multipath regresses
> all the time.  I'm not faulting you at all for not having tested
> multipath.  Hell, I even forget to test multipath more than I should.
> /me says with shame

And I didn't assert that, I just asserted that it gets little testing.

> Yeah, not sure why single-path scsi_debug "just works" (maybe it is a
> "feature" of the older multipathd I have kicking around?), but for basic
> data path testing scsi_debug is a quick means to an end.  I can look
> closer at _why_ it gets multipath'd in a bit.  But maybe Ben or Hannes
> will have quicker insight?
> 
> For me, if multipathd is running and I issue the following, a multipath
> device gets layered on top of the associated scsi_debug-created sd
> device: modprobe scsi_debug dev_size_mb=1024
> 
> I think it useful to have scsi_debug work with multipath.  I know people
> in Red Hat's QE organization have even simulated multiple paths with
> it.. but I don't recall if they had to hack scsi_debug to do that.  I'll
> try to find out.

Looks like it really shouldn't attach as-is.  But scsi_debug should be
easily extendable to export two LUNs for the same backing store to help
multipath testing.
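
A rough sketch of what that could look like from the user side, assuming
the existing add_host and vpd_use_hostno module parameters behave as
documented (all scsi_debug devices share the one ram-backed store, so
two hosts with identical VPD pages should show up as one WWID reachable
over two paths):

	# two pseudo-HBAs with identical inquiry VPD -> same WWID twice
	modprobe scsi_debug dev_size_mb=1024 add_host=2 vpd_use_hostno=0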

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [dm-devel] [PATCH 1/1] block: rework flush sequencing for blk-mq
  2014-03-14 14:13                     ` Mike Snitzer
  2014-03-15 13:28                       ` scsi_debug and multipath, was " Christoph Hellwig
@ 2014-03-17 11:55                       ` Bryn M. Reeves
  1 sibling, 0 replies; 40+ messages in thread
From: Bryn M. Reeves @ 2014-03-17 11:55 UTC (permalink / raw)
  To: device-mapper development
  Cc: Mike Snitzer, Christoph Hellwig, Jens Axboe, Shaohua Li,
	linux-kernel, Jeff Moyer

On 03/14/2014 02:13 PM, Mike Snitzer wrote:
> Yeah, not sure why single-path scsi_debug "just works" (maybe it is a
> "feature" of the older multipathd I have kicking around?), but for basic
> data path testing scsi_debug is a quick means to an end.  I can look
> closer at _why_ it gets multipath'd in a bit.  But maybe Ben or Hannes
> will have quicker insight?

Do you have find_multipaths set in multipath.conf? It defaults to off.
If it's enabled then multipath will only create maps for WWIDs it
already knows about (listed in /etc/multipath/wwids) unless there are at
least two devices with the same WWID or the user forces the operation:

Mar 17 12:26:50 | checking if sdm should be multipathed
Mar 17 12:26:50 | wwid 35333333000000000 not in wwids file, skipping sdm

Turn it off and multipath will happily map a single-path scsi_debug device for me:

# multipath
create: mpathg (35333333000000000) undef Linux,scsi_debug
size=1.0G features='0' hwhandler='0' wp=undef
`-+- policy='round-robin 0' prio=1 status=undef
  `- 8:0:0:0 sdm 8:192 undef ready  running

IIRC this is enabled on at least some current distros by default to
avoid creating maps for USB keys and the like.
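
For testing you can force the old behaviour with a minimal
/etc/multipath.conf along these lines (a sketch; plain defaults-section
syntax), then reload the daemon, e.g. with multipathd -k"reconfigure":

	defaults {
		find_multipaths no
	}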

Regards,
Bryn.


^ permalink raw reply	[flat|nested] 40+ messages in thread

end of thread, other threads:[~2014-03-17 11:55 UTC | newest]

Thread overview: 40+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-01-30 13:26 [PATCH 0/1] block: rework flush sequencing for blk-mq Christoph Hellwig
2014-01-30 13:26 ` [PATCH 1/1] " Christoph Hellwig
2014-02-07  1:18   ` Shaohua Li
2014-02-07 14:19     ` Christoph Hellwig
2014-02-08  0:55       ` Shaohua Li
2014-02-10 10:33         ` Christoph Hellwig
2014-03-07 20:45   ` Jeff Moyer
2014-03-08 15:52     ` Christoph Hellwig
2014-03-08 17:33       ` Mike Snitzer
2014-03-08 19:51         ` Hannes Reinecke
2014-03-08 18:13           ` Mike Snitzer
2014-03-08 21:33             ` Hannes Reinecke
2014-03-08 22:09               ` [PATCH] block: fix q->flush_rq NULL pointer crash on dm-mpath flush Mike Snitzer
2014-03-09  0:24                 ` Jens Axboe
2014-03-09  0:57                   ` Mike Snitzer
2014-03-09  3:18                     ` Jens Axboe
2014-03-09  3:29                       ` Mike Snitzer
2014-03-12 10:28           ` [PATCH 1/1] block: rework flush sequencing for blk-mq Christoph Hellwig
2014-03-12 10:50             ` Hannes Reinecke
2014-03-12 10:55               ` Christoph Hellwig
2014-03-12 11:07                 ` Hannes Reinecke
2014-03-12 11:00               ` SuSE O_DIRECT|O_NONBLOCK overload Christoph Hellwig
2014-03-13  0:15                 ` NeilBrown
2014-03-14 17:46                   ` Mike Christie
2014-03-13 16:13             ` [PATCH 1/1] block: rework flush sequencing for blk-mq Mike Snitzer
2014-03-14  9:25               ` Christoph Hellwig
2014-03-14  9:30                 ` Hannes Reinecke
2014-03-14 12:44                   ` Christoph Hellwig
2014-03-14  9:34                 ` Christoph Hellwig
2014-03-14  9:52                   ` Hannes Reinecke
2014-03-14 10:58                     ` Christoph Hellwig
2014-03-14 11:10                       ` Hannes Reinecke
2014-03-14 13:00                 ` Mike Snitzer
2014-03-14 13:23                   ` Christoph Hellwig
2014-03-14 14:13                     ` Mike Snitzer
2014-03-15 13:28                       ` scsi_debug and multipath, was " Christoph Hellwig
2014-03-17 11:55                       ` [dm-devel] " Bryn M. Reeves
