* [PATCH V3 0/3] blk-mq: driver tag related cleanup
From: Ming Lei @ 2020-06-30 14:03 UTC
  To: Jens Axboe; +Cc: linux-block, Ming Lei, Christoph Hellwig

Hi Jens,

The 1st & 2nd patches move the get/put driver tag helpers into blk-mq.c,
and the 3rd patch centralises the related handling in
blk_mq_get_driver_tag(), so both the flush and blk-mq code get simplified.

V3:
	- rebase on latest for-5.9/block, only 3/3 is changed.

V2:
	- add reviewed-by tag
	- don't put blk_mq_tag_busy() into blk_mq_get_driver_tag
	- use BLK_MQ_NO_TAG in blk-flush.c

Ming Lei (3):
  blk-mq: move blk_mq_get_driver_tag into blk-mq.c
  blk-mq: move blk_mq_put_driver_tag() into blk-mq.c
  blk-mq: centralise related handling into blk_mq_get_driver_tag

 block/blk-flush.c  | 17 +++++-------
 block/blk-mq-tag.c | 58 ----------------------------------------
 block/blk-mq-tag.h | 41 +++++++++++++++++-----------
 block/blk-mq.c     | 66 +++++++++++++++++++++++++++++++++++++++++-----
 block/blk-mq.h     | 20 --------------
 block/blk.h        |  5 ----
 6 files changed, 91 insertions(+), 116 deletions(-)

Cc: Christoph Hellwig <hch@infradead.org>
-- 
2.25.2



* [PATCH V3 1/3] blk-mq: move blk_mq_get_driver_tag into blk-mq.c
From: Ming Lei @ 2020-06-30 14:03 UTC
  To: Jens Axboe
  Cc: linux-block, Ming Lei, Christoph Hellwig, Hannes Reinecke,
	Christoph Hellwig

blk_mq_get_driver_tag() is only used by blk-mq.c and is supposed to
stay in blk-mq.c, so move it there, preparing for the cleanup of the
get/put driver tag code.

Meanwhile hctx_may_queue() is moved to the header file, which is fine
since it is always defined as inline.

No functional change.
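
As an aside for reviewers: the fair-share check in hctx_may_queue()
grants each active queue roughly ceil(depth / users) tags, with a floor
of four. A standalone user-space sketch of just that arithmetic
(illustrative values, not kernel code):

#include <stdio.h>

/* Mirrors the depth computation in hctx_may_queue(). */
static unsigned int fair_share(unsigned int depth, unsigned int users)
{
	unsigned int d = (depth + users - 1) / users; /* ceiling division */

	return d > 4U ? d : 4U; /* "allow at least some tags" */
}

int main(void)
{
	printf("%u\n", fair_share(64, 3));  /* depth 64, 3 queues -> 22 */
	printf("%u\n", fair_share(64, 32)); /* 32 queues -> floor of 4 */
	return 0;
}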

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq-tag.c | 58 ----------------------------------------------
 block/blk-mq-tag.h | 39 ++++++++++++++++++++++++-------
 block/blk-mq.c     | 34 +++++++++++++++++++++++++++
 3 files changed, 65 insertions(+), 66 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 281367b04527..32d82e23b095 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -56,37 +56,6 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
 	blk_mq_tag_wakeup_all(tags, false);
 }
 
-/*
- * For shared tag users, we track the number of currently active users
- * and attempt to provide a fair share of the tag depth for each of them.
- */
-static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
-				  struct sbitmap_queue *bt)
-{
-	unsigned int depth, users;
-
-	if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED))
-		return true;
-	if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
-		return true;
-
-	/*
-	 * Don't try dividing an ant
-	 */
-	if (bt->sb.depth == 1)
-		return true;
-
-	users = atomic_read(&hctx->tags->active_queues);
-	if (!users)
-		return true;
-
-	/*
-	 * Allow at least some tags
-	 */
-	depth = max((bt->sb.depth + users - 1) / users, 4U);
-	return atomic_read(&hctx->nr_active) < depth;
-}
-
 static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
 			    struct sbitmap_queue *bt)
 {
@@ -191,33 +160,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	return tag + tag_offset;
 }
 
-bool __blk_mq_get_driver_tag(struct request *rq)
-{
-	struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
-	unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
-	bool shared = blk_mq_tag_busy(rq->mq_hctx);
-	int tag;
-
-	if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
-		bt = &rq->mq_hctx->tags->breserved_tags;
-		tag_offset = 0;
-	}
-
-	if (!hctx_may_queue(rq->mq_hctx, bt))
-		return false;
-	tag = __sbitmap_queue_get(bt);
-	if (tag == BLK_MQ_NO_TAG)
-		return false;
-
-	rq->tag = tag + tag_offset;
-	if (shared) {
-		rq->rq_flags |= RQF_MQ_INFLIGHT;
-		atomic_inc(&rq->mq_hctx->nr_active);
-	}
-	rq->mq_hctx->tags->rqs[rq->tag] = rq;
-	return true;
-}
-
 void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
 		    unsigned int tag)
 {
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 2e4ef51cdb32..3945c7f5b944 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -51,14 +51,6 @@ enum {
 	BLK_MQ_TAG_MAX		= BLK_MQ_NO_TAG - 1,
 };
 
-bool __blk_mq_get_driver_tag(struct request *rq);
-static inline bool blk_mq_get_driver_tag(struct request *rq)
-{
-	if (rq->tag != BLK_MQ_NO_TAG)
-		return true;
-	return __blk_mq_get_driver_tag(rq);
-}
-
 extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
 extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
 
@@ -78,6 +70,37 @@ static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
 	__blk_mq_tag_idle(hctx);
 }
 
+/*
+ * For shared tag users, we track the number of currently active users
+ * and attempt to provide a fair share of the tag depth for each of them.
+ */
+static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
+				  struct sbitmap_queue *bt)
+{
+	unsigned int depth, users;
+
+	if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED))
+		return true;
+	if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+		return true;
+
+	/*
+	 * Don't try dividing an ant
+	 */
+	if (bt->sb.depth == 1)
+		return true;
+
+	users = atomic_read(&hctx->tags->active_queues);
+	if (!users)
+		return true;
+
+	/*
+	 * Allow at least some tags
+	 */
+	depth = max((bt->sb.depth + users - 1) / users, 4U);
+	return atomic_read(&hctx->nr_active) < depth;
+}
+
 /*
  * This helper should only be used for flush request to share tag
  * with the request cloned from, and both the two requests can't be
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 72d3034fe39d..6ed033b1f1e0 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1103,6 +1103,40 @@ static inline unsigned int queued_to_index(unsigned int queued)
 	return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
 }
 
+static bool __blk_mq_get_driver_tag(struct request *rq)
+{
+	struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
+	unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
+	bool shared = blk_mq_tag_busy(rq->mq_hctx);
+	int tag;
+
+	if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
+		bt = &rq->mq_hctx->tags->breserved_tags;
+		tag_offset = 0;
+	}
+
+	if (!hctx_may_queue(rq->mq_hctx, bt))
+		return false;
+	tag = __sbitmap_queue_get(bt);
+	if (tag == BLK_MQ_NO_TAG)
+		return false;
+
+	rq->tag = tag + tag_offset;
+	if (shared) {
+		rq->rq_flags |= RQF_MQ_INFLIGHT;
+		atomic_inc(&rq->mq_hctx->nr_active);
+	}
+	rq->mq_hctx->tags->rqs[rq->tag] = rq;
+	return true;
+}
+
+static bool blk_mq_get_driver_tag(struct request *rq)
+{
+	if (rq->tag != BLK_MQ_NO_TAG)
+		return true;
+	return __blk_mq_get_driver_tag(rq);
+}
+
 static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
 				int flags, void *key)
 {
-- 
2.25.2



* [PATCH V3 2/3] blk-mq: move blk_mq_put_driver_tag() into blk-mq.c
From: Ming Lei @ 2020-06-30 14:03 UTC
  To: Jens Axboe
  Cc: linux-block, Ming Lei, Christoph Hellwig, Hannes Reinecke,
	Christoph Hellwig

blk_mq_put_driver_tag() is used by blk-mq.c only, so move it to the
source file.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Suggested-by: Christoph Hellwig <hch@infradead.org>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq.c | 20 ++++++++++++++++++++
 block/blk-mq.h | 20 --------------------
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6ed033b1f1e0..e7a2d2d44b51 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -666,6 +666,26 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq)
 	return cpu_online(rq->mq_ctx->cpu);
 }
 
+static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
+		struct request *rq)
+{
+	blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag);
+	rq->tag = BLK_MQ_NO_TAG;
+
+	if (rq->rq_flags & RQF_MQ_INFLIGHT) {
+		rq->rq_flags &= ~RQF_MQ_INFLIGHT;
+		atomic_dec(&hctx->nr_active);
+	}
+}
+
+static inline void blk_mq_put_driver_tag(struct request *rq)
+{
+	if (rq->tag == BLK_MQ_NO_TAG || rq->internal_tag == BLK_MQ_NO_TAG)
+		return;
+
+	__blk_mq_put_driver_tag(rq->mq_hctx, rq);
+}
+
 bool blk_mq_complete_request_remote(struct request *rq)
 {
 	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index c6330335767c..a2b5cf6bac90 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -196,26 +196,6 @@ static inline bool blk_mq_get_dispatch_budget(struct blk_mq_hw_ctx *hctx)
 	return true;
 }
 
-static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
-					   struct request *rq)
-{
-	blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag);
-	rq->tag = BLK_MQ_NO_TAG;
-
-	if (rq->rq_flags & RQF_MQ_INFLIGHT) {
-		rq->rq_flags &= ~RQF_MQ_INFLIGHT;
-		atomic_dec(&hctx->nr_active);
-	}
-}
-
-static inline void blk_mq_put_driver_tag(struct request *rq)
-{
-	if (rq->tag == BLK_MQ_NO_TAG || rq->internal_tag == BLK_MQ_NO_TAG)
-		return;
-
-	__blk_mq_put_driver_tag(rq->mq_hctx, rq);
-}
-
 static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
 {
 	int cpu;
-- 
2.25.2



* [PATCH V3 3/3] blk-mq: centralise related handling into blk_mq_get_driver_tag
From: Ming Lei @ 2020-06-30 14:03 UTC
  To: Jens Axboe; +Cc: linux-block, Ming Lei, Christoph Hellwig

Move the .nr_active update and the request assignment into
blk_mq_get_driver_tag(); both are natural to do while getting the
driver tag.

Meanwhile the blk-flush related code is simplified and the flush
request no longer needs to update the request table manually.
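
For context on the "request table": hctx->tags->rqs[] maps a driver tag
back to its request, and the flush request borrows the tag of the
request it was cloned from, so blk-flush previously had to patch that
table by hand via blk_mq_tag_set_rq(). With the assignment centralised,
the table entry is (re)written whenever blk_mq_get_driver_tag() runs.
A reduced user-space model of that idea (illustrative, not kernel code):

#include <stdio.h>

#define DEPTH 4

struct rq {
	int tag;
	const char *name;
};

static struct rq *rqs[DEPTH]; /* models hctx->tags->rqs[] */

/* Models the centralised table update in blk_mq_get_driver_tag(). */
static void get_driver_tag(struct rq *rq, int tag)
{
	rq->tag = tag;
	rqs[tag] = rq; /* one place updates the table for every user */
}

int main(void)
{
	struct rq data = { -1, "data" }, flush = { -1, "flush" };

	get_driver_tag(&data, 2);  /* normal request takes tag 2 */
	get_driver_tag(&flush, 2); /* flush request reuses the same tag */
	printf("tag 2 -> %s request\n", rqs[2]->name);
	return 0;
}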

Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-flush.c  | 17 ++++++-----------
 block/blk-mq-tag.h | 12 ------------
 block/blk-mq.c     | 30 ++++++++++++++----------------
 block/blk.h        |  5 -----
 4 files changed, 20 insertions(+), 44 deletions(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 21108a550fbf..e756db088d84 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -236,12 +236,10 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
 		error = fq->rq_status;
 
 	hctx = flush_rq->mq_hctx;
-	if (!q->elevator) {
-		blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
-		flush_rq->tag = -1;
-	} else {
-		flush_rq->internal_tag = -1;
-	}
+	if (!q->elevator)
+		flush_rq->tag = BLK_MQ_NO_TAG;
+	else
+		flush_rq->internal_tag = BLK_MQ_NO_TAG;
 
 	running = &fq->flush_queue[fq->flush_running_idx];
 	BUG_ON(fq->flush_pending_idx == fq->flush_running_idx);
@@ -315,13 +313,10 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
 	flush_rq->mq_ctx = first_rq->mq_ctx;
 	flush_rq->mq_hctx = first_rq->mq_hctx;
 
-	if (!q->elevator) {
-		fq->orig_rq = first_rq;
+	if (!q->elevator)
 		flush_rq->tag = first_rq->tag;
-		blk_mq_tag_set_rq(flush_rq->mq_hctx, first_rq->tag, flush_rq);
-	} else {
+	else
 		flush_rq->internal_tag = first_rq->internal_tag;
-	}
 
 	flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
 	flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK);
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 3945c7f5b944..b1acac518c4e 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -101,18 +101,6 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
 	return atomic_read(&hctx->nr_active) < depth;
 }
 
-/*
- * This helper should only be used for flush request to share tag
- * with the request cloned from, and both the two requests can't be
- * in flight at the same time. The caller has to make sure the tag
- * can't be freed.
- */
-static inline void blk_mq_tag_set_rq(struct blk_mq_hw_ctx *hctx,
-		unsigned int tag, struct request *rq)
-{
-	hctx->tags->rqs[tag] = rq;
-}
-
 static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags,
 					  unsigned int tag)
 {
diff --git a/block/blk-mq.c b/block/blk-mq.c
index e7a2d2d44b51..a20647139c95 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -277,26 +277,20 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 {
 	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
 	struct request *rq = tags->static_rqs[tag];
-	req_flags_t rq_flags = 0;
 
 	if (data->q->elevator) {
 		rq->tag = BLK_MQ_NO_TAG;
 		rq->internal_tag = tag;
 	} else {
-		if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
-			rq_flags = RQF_MQ_INFLIGHT;
-			atomic_inc(&data->hctx->nr_active);
-		}
 		rq->tag = tag;
 		rq->internal_tag = BLK_MQ_NO_TAG;
-		data->hctx->tags->rqs[rq->tag] = rq;
 	}
 
 	/* csd/requeue_work/fifo_time is initialized before use */
 	rq->q = data->q;
 	rq->mq_ctx = data->ctx;
 	rq->mq_hctx = data->hctx;
-	rq->rq_flags = rq_flags;
+	rq->rq_flags = 0;
 	rq->cmd_flags = data->cmd_flags;
 	if (data->flags & BLK_MQ_REQ_PREEMPT)
 		rq->rq_flags |= RQF_PREEMPT;
@@ -1127,9 +1121,10 @@ static bool __blk_mq_get_driver_tag(struct request *rq)
 {
 	struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
 	unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
-	bool shared = blk_mq_tag_busy(rq->mq_hctx);
 	int tag;
 
+	blk_mq_tag_busy(rq->mq_hctx);
+
 	if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
 		bt = &rq->mq_hctx->tags->breserved_tags;
 		tag_offset = 0;
@@ -1142,19 +1137,22 @@ static bool __blk_mq_get_driver_tag(struct request *rq)
 		return false;
 
 	rq->tag = tag + tag_offset;
-	if (shared) {
-		rq->rq_flags |= RQF_MQ_INFLIGHT;
-		atomic_inc(&rq->mq_hctx->nr_active);
-	}
-	rq->mq_hctx->tags->rqs[rq->tag] = rq;
 	return true;
 }
 
 static bool blk_mq_get_driver_tag(struct request *rq)
 {
-	if (rq->tag != BLK_MQ_NO_TAG)
-		return true;
-	return __blk_mq_get_driver_tag(rq);
+	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
+
+	if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq))
+		return false;
+
+	if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
+		rq->rq_flags |= RQF_MQ_INFLIGHT;
+		atomic_inc(&hctx->nr_active);
+	}
+	hctx->tags->rqs[rq->tag] = rq;
+	return true;
 }
 
 static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
diff --git a/block/blk.h b/block/blk.h
index 41a50880c94e..0184a31fe4df 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -25,11 +25,6 @@ struct blk_flush_queue {
 	struct list_head	flush_data_in_flight;
 	struct request		*flush_rq;
 
-	/*
-	 * flush_rq shares tag with this rq, both can't be active
-	 * at the same time
-	 */
-	struct request		*orig_rq;
 	struct lock_class_key	key;
 	spinlock_t		mq_flush_lock;
 };
-- 
2.25.2



* Re: [PATCH V3 0/3] blk-mq: driver tag related cleanup
From: Jens Axboe @ 2020-06-30 18:58 UTC
  To: Ming Lei; +Cc: linux-block, Christoph Hellwig

On 6/30/20 8:03 AM, Ming Lei wrote:
> Hi Jens,
> 
> The 1st & 2nd patches move the get/put driver tag helpers into blk-mq.c,
> and the 3rd patch centralises the related handling in
> blk_mq_get_driver_tag(), so both the flush and blk-mq code get simplified.

Applied, thanks.

-- 
Jens Axboe



* Re: [PATCH V3 3/3] blk-mq: centralise related handling into blk_mq_get_driver_tag
From: Qian Cai @ 2020-07-02  4:37 UTC
  To: Ming Lei; +Cc: Jens Axboe, linux-block, Christoph Hellwig, m.szyprowski

On Tue, Jun 30, 2020 at 10:03:57PM +0800, Ming Lei wrote:
> Move the .nr_active update and the request assignment into
> blk_mq_get_driver_tag(); both are natural to do while getting the
> driver tag.
> 
> Meanwhile the blk-flush related code is simplified and the flush
> request no longer needs to update the request table manually.
> 
> Cc: Christoph Hellwig <hch@infradead.org>
> Signed-off-by: Ming Lei <ming.lei@redhat.com>

Reverting this commit on top of next-20200701 fixed an issue where
swapping is unable to make progress for hours, whereas the same
workload takes only 5 minutes after the revert.

# time ./random 0
- start: alloc_mmap
- mmap 10% memory: 13473664204
- mmap 10% memory: 13473664204
- mmap 10% memory: 13473664204
- mmap 10% memory: 13473664204
- mmap 10% memory: 13473664204
- mmap 10% memory: 13473664204
- mmap 10% memory: 13473664204
- mmap 10% memory: 13473664204
- mmap 10% memory: 13473664204
- mmap 10% memory: 13473664204
real	2m12.333s
user	0m1.131s
sys	4m15.689s

# git clone https://github.com/cailca/linux-mm
# cd linux-mm; make
# ./random 0 (just malloc and CoW to trigger swapping)

x86.config is also included there.

The server is:

HP Proliant BL660Gen9
Intel(R) Xeon(R) CPU E5-4650 v3 @ 2.10GHz
131072 MB memory, 1000 GB disk space

The swap device is on LVM:
06:00.0 RAID bus controller: Hewlett-Packard Company Smart Array Gen9 Controllers (rev 01)

# lvs
  LV   VG                     Attr       LSize   Pool Origin Data%  Meta%  Move Log Cpy%Sync Convert
  home rhel_hpe-bl660cgen9-01 -wi-ao---- 875.89g
  root rhel_hpe-bl660cgen9-01 -wi-ao----  50.00g
  swap rhel_hpe-bl660cgen9-01 -wi-ao----   4.00g

With the commit, the system will lock up completely (unable to log in,
etc.) during pageout. After a while, some of these messages appear on
the serial console:

[ 1250.372489][  T352] INFO: task xfsaild/dm-0:795 blocked for more than 122 seconds.
[ 1250.410651][  T352]       Not tainted 5.8.0-rc3-next-20200701 #4
[ 1250.440777][  T352] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 1250.480718][  T352] xfsaild/dm-0    D26184   795      2 0x00004000
[ 1250.509775][  T352] Call Trace:
[ 1250.524445][  T352]  __schedule+0x768/0x1ba0
[ 1250.544935][  T352]  ? kcore_callback+0x1d/0x1d
[ 1250.566408][  T352]  ? lockdep_hardirqs_on_prepare+0x550/0x550
[ 1250.594339][  T352]  schedule+0xc1/0x2b0
[ 1250.612820][  T352]  schedule_timeout+0x390/0x8f0
[ 1250.635556][  T352]  ? usleep_range+0x120/0x120
[ 1250.656769][  T352]  ? lock_downgrade+0x720/0x720
[ 1250.679531][  T352]  ? rcu_read_unlock+0x50/0x50
[ 1250.701174][  T352]  ? lockdep_hardirqs_on_prepare+0x38c/0x550
[ 1250.728811][  T352]  ? _raw_spin_unlock_irq+0x1f/0x30
[ 1250.752964][  T352]  ? trace_hardirqs_on+0x20/0x1b5
[ 1250.776427][  T352]  wait_for_completion+0x15e/0x250
[ 1250.799632][  T352]  ? wait_for_completion_interruptible+0x2f0/0x2f0
[ 1250.830311][  T352]  ? lockdep_hardirqs_on_prepare+0x38c/0x550
[ 1250.858125][  T352]  ? _raw_spin_unlock_irq+0x1f/0x30
[ 1250.881934][  T352]  __flush_work+0x42a/0x930
[ 1250.903256][  T352]  ? queue_delayed_work_on+0x90/0x90
[ 1250.929838][  T352]  ? init_pwq+0x340/0x340
[ 1250.951021][  T352]  ? trace_hardirqs_on+0x20/0x1b5
[ 1250.974344][  T352]  xlog_cil_force_lsn+0x105/0x520
[ 1250.997409][  T352]  ? xfs_log_commit_cil+0x27f0/0x27f0
[ 1251.022702][  T352]  ? xfsaild_push+0x320/0x2170
[ 1251.044523][  T352]  ? rcu_read_lock_sched_held+0xaa/0xd0
[ 1251.070639][  T352]  ? do_raw_spin_lock+0x121/0x290
[ 1251.093099][  T352]  ? rcu_read_lock_bh_held+0xc0/0xc0
[ 1251.116765][  T352]  ? rwlock_bug.part.1+0x90/0x90
[ 1251.138491][  T352]  ? xfsaild_push+0x320/0x2170
[ 1251.160021][  T352]  xfs_log_force+0x283/0x8b0
[ 1251.181024][  T352]  xfsaild_push+0x320/0x2170
[ 1251.201236][  T352]  ? usleep_range+0x120/0x120
[ 1251.222285][  T352]  ? lock_downgrade+0x720/0x720
[ 1251.244945][  T352]  ? rcu_read_unlock+0x50/0x50
[ 1251.267111][  T352]  ? xfs_trans_ail_cursor_first+0x180/0x180
[ 1251.294879][  T352]  xfsaild+0x172/0x870
[ 1251.313497][  T352]  ? rwlock_bug.part.1+0x90/0x90
[ 1251.338272][  T352]  ? xfsaild_push+0x2170/0x2170
[ 1251.361313][  T352]  ? __kthread_parkme+0x4d/0x1a0
[ 1251.384386][  T352]  ? lockdep_hardirqs_on_prepare+0x38c/0x550
[ 1251.413272][  T352]  ? _raw_spin_unlock_irqrestore+0x39/0x40
[ 1251.440768][  T352]  ? trace_hardirqs_on+0x20/0x1b5
[ 1251.464454][  T352]  ? __kthread_parkme+0xcc/0x1a0
[ 1251.487684][  T352]  ? xfsaild_push+0x2170/0x2170
[ 1251.511076][  T352]  kthread+0x358/0x420
[ 1251.529882][  T352]  ? kthread_create_worker_on_cpu+0xc0/0xc0
[ 1251.556883][  T352]  ret_from_fork+0x22/0x30
[ 1251.577642][  T352] INFO: task systemd-logind:1255 blocked for more than 124 seconds.
[ 1251.614691][  T352]       Not tainted 5.8.0-rc3-next-20200701 #4
[ 1251.643014][  T352] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 1251.682935][  T352] systemd-logind  D21280  1255      1 0x00004320
[ 1251.711905][  T352] Call Trace:
[ 1251.726589][  T352]  __schedule+0x768/0x1ba0
[ 1251.747076][  T352]  ? kcore_callback+0x1d/0x1d
[ 1251.768197][  T352]  ? lockdep_hardirqs_on_prepare+0x550/0x550
[ 1251.797042][  T352]  schedule+0xc1/0x2b0
[ 1251.816384][  T352]  schedule_timeout+0x390/0x8f0
[ 1251.839135][  T352]  ? print_irqtrace_events+0x270/0x270
[ 1251.864164][  T352]  ? usleep_range+0x120/0x120
[ 1251.886086][  T352]  ? mark_held_locks+0xb0/0x110
[ 1251.908092][  T352]  ? lockdep_hardirqs_on_prepare+0x38c/0x550
[ 1251.936499][  T352]  ? _raw_spin_unlock_irq+0x1f/0x30
[ 1251.961908][  T352]  ? trace_hardirqs_on+0x20/0x1b5
[ 1251.986191][  T352]  wait_for_completion+0x15e/0x250
[ 1252.011073][  T352]  ? wait_for_completion_interruptible+0x2f0/0x2f0
[ 1252.041028][  T352]  ? lockdep_hardirqs_on_prepare+0x38c/0x550
[ 1252.068947][  T352]  ? _raw_spin_unlock_irq+0x1f/0x30
[ 1252.092838][  T352]  __flush_work+0x42a/0x930
[ 1252.113605][  T352]  ? queue_delayed_work_on+0x90/0x90
[ 1252.138605][  T352]  ? mark_lock+0x147/0x1800
[ 1252.159065][  T352]  ? init_pwq+0x340/0x340
[ 1252.178604][  T352]  ? __lock_acquire+0x1920/0x4da0
[ 1252.202405][  T352]  xlog_cil_force_lsn+0x105/0x520
[ 1252.225340][  T352]  ? xfs_log_commit_cil+0x27f0/0x27f0
[ 1252.250481][  T352]  ? xfs_iunpin_wait+0x25d/0x570
[ 1252.275021][  T352]  ? rcu_read_lock_sched_held+0xaa/0xd0
[ 1252.301548][  T352]  ? rcu_read_lock_bh_held+0xc0/0xc0
[ 1252.326485][  T352]  ? xfs_iunpin_wait+0x25d/0x570
[ 1252.350295][  T352]  xfs_log_force_lsn+0x237/0x590
[ 1252.372860][  T352]  xfs_iunpin_wait+0x25d/0x570
[ 1252.395136][  T352]  ? xfs_reclaim_inode+0x1d6/0x860
[ 1252.418578][  T352]  ? xfs_inactive+0x350/0x350
[ 1252.439965][  T352]  ? rcu_read_unlock+0x50/0x50
[ 1252.462896][  T352]  ? var_wake_function+0x150/0x150
[ 1252.488702][  T352]  ? xfs_reclaim_inode_grab+0xbf/0x100
[ 1252.513940][  T352]  xfs_reclaim_inode+0x1d6/0x860
[ 1252.536360][  T352]  ? xfs_inode_clear_reclaim_tag+0xa0/0xa0
[ 1252.562673][  T352]  ? do_raw_spin_unlock+0x4f/0x250
[ 1252.587647][  T352]  xfs_reclaim_inodes_ag+0x505/0xb00
[ 1252.612344][  T352]  ? xfs_reclaim_inode+0x860/0x860
[ 1252.635670][  T352]  ? mark_held_locks+0xb0/0x110
[ 1252.657425][  T352]  ? lockdep_hardirqs_on_prepare+0x38c/0x550
[ 1252.685355][  T352]  ? _raw_spin_unlock_irqrestore+0x39/0x40
[ 1252.712212][  T352]  ? trace_hardirqs_on+0x20/0x1b5
[ 1252.735256][  T352]  ? try_to_wake_up+0x6d1/0xf40
[ 1252.757809][  T352]  ? migrate_swap_stop+0xbf0/0xbf0
[ 1252.780846][  T352]  ? do_raw_spin_unlock+0x4f/0x250
[ 1252.803908][  T352]  xfs_reclaim_inodes_nr+0x93/0xd0
[ 1252.828191][  T352]  ? xfs_reclaim_inodes+0x90/0x90
[ 1252.851060][  T352]  ? list_lru_count_one+0x177/0x300
[ 1252.875228][  T352]  super_cache_scan+0x2fd/0x430
[ 1252.897411][  T352]  do_shrink_slab+0x317/0x990
[ 1252.919083][  T352]  shrink_slab+0x3a8/0x4b0
[ 1252.939077][  T352]  ? do_shrink_slab+0x990/0x990
[ 1252.962308][  T352]  ? mem_cgroup_protected+0x228/0x470
[ 1252.988350][  T352]  shrink_node+0x49c/0x17b0
[ 1253.010025][  T352]  do_try_to_free_pages+0x348/0x1250
[ 1253.036410][  T352]  ? shrink_node+0x17b0/0x17b0
[ 1253.058428][  T352]  ? rcu_read_lock_sched_held+0xaa/0xd0
[ 1253.083861][  T352]  ? rcu_read_lock_bh_held+0xc0/0xc0
[ 1253.108752][  T352]  try_to_free_pages+0x244/0x690
[ 1253.131224][  T352]  ? free_unref_page_prepare.part.42+0x160/0x170
[ 1253.161131][  T352]  ? reclaim_pages+0x9a0/0x9a0
[ 1253.183202][  T352]  ? lockdep_hardirqs_on_prepare+0x38c/0x550
[ 1253.211059][  T352]  __alloc_pages_slowpath.constprop.73+0x7b8/0x2410
[ 1253.241737][  T352]  ? warn_alloc+0x120/0x120
[ 1253.262497][  T352]  ? __alloc_pages_nodemask+0x3b7/0x7c0
[ 1253.288529][  T352]  ? lock_downgrade+0x720/0x720
[ 1253.310739][  T352]  ? __isolate_free_page+0x570/0x570
[ 1253.336306][  T352]  ? rcu_read_lock_sched_held+0xaa/0xd0
[ 1253.361726][  T352]  __alloc_pages_nodemask+0x658/0x7c0
[ 1253.386922][  T352]  ? __alloc_pages_slowpath.constprop.73+0x2410/0x2410
[ 1253.418811][  T352]  allocate_slab+0x378/0x680
[ 1253.439933][  T352]  new_slab+0x3c/0x60
[ 1253.458006][  T352]  ___slab_alloc+0x49b/0x7b0
[ 1253.479756][  T352]  ? getname_kernel+0x49/0x290
[ 1253.502868][  T352]  ? getname_kernel+0x49/0x290
[ 1253.525982][  T352]  ? __slab_alloc+0x35/0x70
[ 1253.547838][  T352]  __slab_alloc+0x35/0x70
[ 1253.567937][  T352]  ? getname_kernel+0x49/0x290
[ 1253.590116][  T352]  kmem_cache_alloc+0x230/0x2a0
[ 1253.612752][  T352]  getname_kernel+0x49/0x290
[ 1253.634286][  T352]  kern_path+0xc/0x30
[ 1253.652442][  T352]  unix_find_other+0xab/0x620
[ 1253.673847][  T352]  ? unix_stream_recvmsg+0xc0/0xc0
[ 1253.697111][  T352]  ? find_held_lock+0x33/0x1c0
[ 1253.719156][  T352]  ? __might_fault+0xea/0x1a0
[ 1253.740036][  T352]  unix_dgram_sendmsg+0x79f/0xe70
[ 1253.763719][  T352]  ? unix_dgram_connect+0x650/0x650
[ 1253.787501][  T352]  ? _copy_from_user+0xbe/0x100
[ 1253.810230][  T352]  ? rw_copy_check_uvector+0x5e/0x380
[ 1253.834712][  T352]  ? move_addr_to_kernel.part.25+0x28/0x40
[ 1253.861330][  T352]  ? __x64_sys_shutdown+0x70/0x70
[ 1253.883957][  T352]  ? dup_iter+0x240/0x240
[ 1253.903962][  T352]  ? unix_dgram_connect+0x650/0x650
[ 1253.927796][  T352]  ____sys_sendmsg+0x5f5/0x820
[ 1253.949832][  T352]  ? sock_create_kern+0x10/0x10
[ 1253.972027][  T352]  ? __copy_msghdr_from_user+0x3e0/0x3e0
[ 1253.999411][  T352]  ? sock_setsockopt+0x37c/0x1f60
[ 1254.024662][  T352]  ? sock_setsockopt+0x37c/0x1f60
[ 1254.048119][  T352]  ___sys_sendmsg+0xe5/0x170
[ 1254.069009][  T352]  ? sock_getsockopt+0x577/0x1ba3
[ 1254.092402][  T352]  ? copy_msghdr_from_user+0xe0/0xe0
[ 1254.116532][  T352]  ? sk_get_meminfo+0x4b0/0x4b0
[ 1254.139128][  T352]  ? rcu_read_lock_bh_held+0xc0/0xc0
[ 1254.162917][  T352]  ? ___bpf_prog_run+0x30c6/0x76a0
[ 1254.186532][  T352]  ? lockdep_hardirqs_on_prepare+0x550/0x550
[ 1254.214390][  T352]  ? lockdep_init_map_waits+0x267/0x7b0
[ 1254.239719][  T352]  ? __bpf_prog_run64+0xc0/0xc0
[ 1254.261805][  T352]  ? __sys_setsockopt+0x39a/0x430
[ 1254.285143][  T352]  ? __fget_light+0x4c/0x220
[ 1254.306399][  T352]  ? sockfd_lookup_light+0x17/0x140
[ 1254.330417][  T352]  __sys_sendmsg+0xba/0x150
[ 1254.351957][  T352]  ? __sys_sendmsg_sock+0xa0/0xa0
[ 1254.375087][  T352]  ? seccomp_do_user_notification.isra.14+0xa60/0xa60
[ 1254.406754][  T352]  ? sockfd_lookup_light+0x140/0x140
[ 1254.431071][  T352]  ? lockdep_hardirqs_on_prepare+0x38c/0x550
[ 1254.459401][  T352]  ? do_syscall_64+0x24/0x310
[ 1254.480731][  T352]  do_syscall_64+0x5f/0x310
[ 1254.502045][  T352]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 1254.530295][  T352] RIP: 0033:0x7fd5b4dbdad5
[ 1254.551521][  T352] Code: Bad RIP value.
[ 1254.570837][  T352] RSP: 002b:00007ffd4294cdb8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
[ 1254.610502][  T352] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fd5b4dbdad5
[ 1254.647167][  T352] RDX: 0000000000004000 RSI: 00007ffd4294cdf0 RDI: 0000000000000010
[ 1254.684282][  T352] RBP: 00007ffd4294cef0 R08: 000000000000006c R09: 0000000000017be2
[ 1254.721279][  T352] R10: 00007ffd4294cd90 R11: 0000000000000246 R12: 0000000000000010
[ 1254.758535][  T352] R13: 00007ffd4294cfc0 R14: 00007ffd4294ce30 R15: 0000000000000000
[ 1254.794642][  T352] INFO: task random:1774 blocked for more than 127 seconds.
[ 1254.827372][  T352]       Not tainted 5.8.0-rc3-next-20200701 #4
[ 1254.854803][  T352] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 1254.893736][  T352] random          D22488  1774   1765 0x00000000
[ 1254.921952][  T352] Call Trace:
[ 1254.936322][  T352]  __schedule+0x768/0x1ba0
[ 1254.955756][  T352]  ? kcore_callback+0x1d/0x1d
[ 1254.976489][  T352]  schedule+0xc1/0x2b0
[ 1254.994120][  T352]  schedule_preempt_disabled+0xc/0x20
[ 1255.019851][  T352]  __mutex_lock+0xa1e/0x1390
[ 1255.042999][  T352]  ? xfs_reclaim_inodes_ag+0x68c/0xb00
[ 1255.068241][  T352]  ? mutex_lock_io_nested+0x1270/0x1270
[ 1255.092922][  T352]  ? rcu_read_lock_sched_held+0xaa/0xd0
[ 1255.117517][  T352]  ? rcu_read_lock_bh_held+0xc0/0xc0
[ 1255.141068][  T352]  ? xfs_perag_get_tag+0x463/0x550
[ 1255.163710][  T352]  ? __lock_acquire+0x1920/0x4da0
[ 1255.186068][  T352]  ? xfs_reclaim_inodes_ag+0x68c/0xb00
[ 1255.210506][  T352]  xfs_reclaim_inodes_ag+0x68c/0xb00
[ 1255.233939][  T352]  ? lockdep_hardirqs_on_prepare+0x550/0x550
[ 1255.260536][  T352]  ? xfs_reclaim_inode+0x860/0x860
[ 1255.283185][  T352]  ? print_irqtrace_events+0x270/0x270
[ 1255.307491][  T352]  ? find_held_lock+0x33/0x1c0
[ 1255.328782][  T352]  ? xfs_ail_push_all+0x86/0xd0
[ 1255.350848][  T352]  ? lock_downgrade+0x720/0x720
[ 1255.372513][  T352]  ? rcu_read_unlock+0x50/0x50
[ 1255.393928][  T352]  ? do_raw_spin_lock+0x121/0x290
[ 1255.416127][  T352]  ? rwlock_bug.part.1+0x90/0x90
[ 1255.437997][  T352]  ? do_raw_spin_unlock+0x4f/0x250
[ 1255.460633][  T352]  xfs_reclaim_inodes_nr+0x93/0xd0
[ 1255.483981][  T352]  ? xfs_reclaim_inodes+0x90/0x90
[ 1255.507264][  T352]  ? list_lru_count_one+0x177/0x300
[ 1255.532061][  T352]  super_cache_scan+0x2fd/0x430
[ 1255.556560][  T352]  do_shrink_slab+0x317/0x990
[ 1255.579834][  T352]  shrink_slab+0x141/0x4b0
[ 1255.599330][  T352]  ? do_shrink_slab+0x990/0x990
[ 1255.620934][  T352]  ? mem_cgroup_iter+0x3c3/0x800
[ 1255.642930][  T352]  shrink_node+0x49c/0x17b0
[ 1255.662858][  T352]  do_try_to_free_pages+0x348/0x1250
[ 1255.687403][  T352]  ? shrink_node+0x17b0/0x17b0
[ 1255.710188][  T352]  ? rcu_read_lock_sched_held+0xaa/0xd0
[ 1255.736054][  T352]  ? rcu_read_lock_bh_held+0xc0/0xc0
[ 1255.760640][  T352]  try_to_free_pages+0x244/0x690
[ 1255.784242][  T352]  ? free_unref_page_prepare.part.42+0x160/0x170
[ 1255.813938][  T352]  ? reclaim_pages+0x9a0/0x9a0
[ 1255.835127][  T352]  ? lockdep_hardirqs_on_prepare+0x38c/0x550
[ 1255.861708][  T352]  __alloc_pages_slowpath.constprop.73+0x7b8/0x2410
[ 1255.891312][  T352]  ? lockdep_hardirqs_on_prepare+0x38c/0x550
[ 1255.918103][  T352]  ? get_page_from_freelist+0x3b8/0x3930
[ 1255.943221][  T352]  ? __lock_acquire+0xc57/0x4da0
[ 1255.965030][  T352]  ? warn_alloc+0x120/0x120
[ 1255.984934][  T352]  ? __isolate_free_page+0x570/0x570
[ 1256.008474][  T352]  __alloc_pages_nodemask+0x658/0x7c0
[ 1256.032319][  T352]  ? handle_mm_fault+0x1ec/0x32f0
[ 1256.056694][  T352]  ? __alloc_pages_slowpath.constprop.73+0x2410/0x2410
[ 1256.089900][  T352]  ? mark_held_locks+0xb0/0x110
[ 1256.111356][  T352]  alloc_pages_vma+0xa5/0x4c0
[ 1256.132001][  T352]  handle_mm_fault+0x1b1e/0x32f0
[ 1256.153941][  T352]  ? copy_page_range+0x6b0/0x6b0
[ 1256.175764][  T352]  do_user_addr_fault+0x2ba/0x90d
[ 1256.198277][  T352]  exc_page_fault+0x56/0xa0
[ 1256.218256][  T352]  ? asm_exc_page_fault+0x8/0x30
[ 1256.240143][  T352]  asm_exc_page_fault+0x1e/0x30
[ 1256.261573][  T352] RIP: 0033:0x402a91
[ 1256.278694][  T352] Code: Bad RIP value.
[ 1256.296570][  T352] RSP: 002b:00007f2470abeec0 EFLAGS: 00010206
[ 1256.323772][  T352] RAX: 00007f2381af2000 RBX: 0000000000000000 RCX: 00007f3a69f4e8c7
[ 1256.360556][  T352] RDX: 00007f214d118000 RSI: 00000003231a6ccc RDI: 0000000000000000
[ 1256.396710][  T352] RBP: 00007f2470abeef0 R08: 00000000ffffffff R09: 0000000000000000
[ 1256.432457][  T352] R10: 0000000000000022 R11: 0000000000000246 R12: 00007ffd23f27bbe
[ 1256.468370][  T352] R13: 00007ffd23f27bbf R14: 0000000000000000 R15: 00007f2470abefc0
[ 1256.505273][  T352] INFO: task kworker/47:0:1815 blocked for more than 129 seconds.
[ 1256.541387][  T352]       Not tainted 5.8.0-rc3-next-20200701 #4
[ 1256.571909][  T352] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 1256.614949][  T352] kworker/47:0    D27512  1815      2 0x00004000
[ 1256.642844][  T352] Workqueue: xfs-sync/dm-0 xfs_log_worker
[ 1256.668278][  T352] Call Trace:
[ 1256.682654][  T352]  __schedule+0x768/0x1ba0
[ 1256.702137][  T352]  ? kcore_callback+0x1d/0x1d
[ 1256.722824][  T352]  ? lockdep_hardirqs_on_prepare+0x38c/0x550
[ 1256.749518][  T352]  ? _raw_spin_unlock_irq+0x1f/0x30
[ 1256.772548][  T352]  ? trace_hardirqs_on+0x20/0x1b5
[ 1256.794901][  T352]  schedule+0xc1/0x2b0
[ 1256.812834][  T352]  schedule_timeout+0x390/0x8f0
[ 1256.834068][  T352]  ? print_irqtrace_events+0x270/0x270
[ 1256.858477][  T352]  ? usleep_range+0x120/0x120
[ 1256.879398][  T352]  ? mark_held_locks+0xb0/0x110
[ 1256.900894][  T352]  ? lockdep_hardirqs_on_prepare+0x38c/0x550
[ 1256.927501][  T352]  ? _raw_spin_unlock_irq+0x1f/0x30
[ 1256.950506][  T352]  ? trace_hardirqs_on+0x20/0x1b5
[ 1256.972875][  T352]  wait_for_completion+0x15e/0x250
[ 1256.995628][  T352]  ? wait_for_completion_interruptible+0x2f0/0x2f0
[ 1257.025057][  T352]  ? lockdep_hardirqs_on_prepare+0x38c/0x550
[ 1257.051673][  T352]  ? _raw_spin_unlock_irq+0x1f/0x30
[ 1257.076481][  T352]  __flush_work+0x42a/0x930
[ 1257.099817][  T352]  ? queue_delayed_work_on+0x90/0x90
[ 1257.124652][  T352]  ? init_pwq+0x340/0x340
[ 1257.143723][  T352]  ? lockdep_hardirqs_on_prepare+0x550/0x550
[ 1257.170447][  T352]  xlog_cil_force_lsn+0x105/0x520
[ 1257.193048][  T352]  ? xfs_log_commit_cil+0x27f0/0x27f0
[ 1257.216868][  T352]  ? xfs_log_worker+0x70/0x200
[ 1257.237854][  T352]  ? rcu_read_lock_sched_held+0xaa/0xd0
[ 1257.262124][  T352]  ? do_raw_spin_lock+0x121/0x290
[ 1257.284432][  T352]  ? rcu_read_lock_bh_held+0xc0/0xc0
[ 1257.307963][  T352]  ? rwlock_bug.part.1+0x90/0x90
[ 1257.329862][  T352]  ? xfs_log_worker+0x70/0x200
[ 1257.351537][  T352]  xfs_log_force+0x283/0x8b0
[ 1257.372051][  T352]  xfs_log_worker+0x70/0x200
[ 1257.392452][  T352]  process_one_work+0x8ee/0x17c0
[ 1257.414642][  T352]  ? lock_acquire+0x1ac/0xaf0
[ 1257.435337][  T352]  ? pwq_dec_nr_in_flight+0x310/0x310
[ 1257.459227][  T352]  ? do_raw_spin_lock+0x121/0x290
[ 1257.481653][  T352]  worker_thread+0x82/0xb30
[ 1257.501638][  T352]  ? __kthread_parkme+0xcc/0x1a0
[ 1257.523429][  T352]  ? process_one_work+0x17c0/0x17c0
[ 1257.546492][  T352]  kthread+0x358/0x420
[ 1257.564555][  T352]  ? kthread_create_worker_on_cpu+0xc0/0xc0
[ 1257.593534][  T352]  ret_from_fork+0x22/0x30
[ 1257.614307][  T352] 
[ 1257.614307][  T352] Showing all locks held in the system:
[ 1257.653989][  T352] 2 locks held by systemd/1:
[ 1257.674581][  T352]  #0: ffff88a033931658 (&mm->mmap_lock#2){++++}-{3:3}, at: do_user_addr_fault+0x167/0x90d
[ 1257.719434][  T352]  #1: ffffffff906d07c0 (fs_reclaim){+.+.}-{0:0}, at: fs_reclaim_acquire.part.64+0x0/0x30
[ 1257.764101][  T352] 2 locks held by kworker/u97:9/270:
[ 1257.787648][  T352]  #0: ffff88a035e6cd38 ((wq_completion)xfs-cil/dm-0){+.+.}-{0:0}, at: process_one_work+0x7c8/0x17c0
[ 1257.837506][  T352]  #1: ffffc900077d7e00 ((work_completion)(&cil->xc_push_work)){+.+.}-{0:0}, at: process_one_work+0x7fc/0x17c0
[ 1257.892615][  T352] 1 lock held by khungtaskd/352:
[ 1257.915428][  T352]  #0: ffffffff905887e0 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire.constprop.29+0x0/0x30
[ 1257.964163][  T352] 1 lock held by khugepaged/360:
[ 1257.986155][  T352]  #0: ffffffff906d07c0 (fs_reclaim){+.+.}-{0:0}, at: fs_reclaim_acquire.part.64+0x0/0x30
[ 1258.030582][  T352] 1 lock held by kswapd0/366:
[ 1258.051264][  T352]  #0: ffffffff906d07c0 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x0/0x30
[ 1258.094152][  T352] 1 lock held by kswapd1/367:
[ 1258.117184][  T352]  #0: ffffffff906d07c0 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x0/0x30
[ 1258.161954][  T352] 1 lock held by kswapd2/368:
[ 1258.182634][  T352]  #0: ffffffff906d07c0 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x0/0x30
[ 1258.224967][  T352] 1 lock held by kswapd3/369:
[ 1258.245638][  T352]  #0: ffffffff906d07c0 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x0/0x30
[ 1258.287356][  T352] 1 lock held by systemd-journal/896:
[ 1258.311276][  T352]  #0: ffffffff906d07c0 (fs_reclaim){+.+.}-{0:0}, at: fs_reclaim_acquire.part.64+0x0/0x30
[ 1258.357096][  T352] 1 lock held by systemd-udevd/934:
[ 1258.380380][  T352]  #0: ffffffff906d07c0 (fs_reclaim){+.+.}-{0:0}, at: fs_reclaim_acquire.part.64+0x0/0x30
[ 1258.425357][  T352] 1 lock held by gmain/1221:
[ 1258.445231][  T352]  #0: ffffffff906d07c0 (fs_reclaim){+.+.}-{0:0}, at: fs_reclaim_acquire.part.64+0x0/0x30
[ 1258.489822][  T352] 1 lock held by irqbalance/1163:
[ 1258.512203][  T352]  #0: ffffffff906d07c0 (fs_reclaim){+.+.}-{0:0}, at: fs_reclaim_acquire.part.64+0x0/0x30
[ 1258.556285][  T352] 1 lock held by sshd/1232:
[ 1258.576151][  T352]  #0: ffffffff906d07c0 (fs_reclaim){+.+.}-{0:0}, at: fs_reclaim_acquire.part.64+0x0/0x30
[ 1258.622963][  T352] 5 locks held by systemd-logind/1255:
[ 1258.650245][  T352]  #0: ffffffff906d07c0 (fs_reclaim){+.+.}-{0:0}, at: fs_reclaim_acquire.part.64+0x0/0x30
[ 1258.696041][  T352]  #1: ffffffff90676850 (shrinker_rwsem){++++}-{3:3}, at: shrink_slab+0x219/0x4b0
[ 1258.738489][  T352]  #2: ffff88a005f040e0 (&type->s_umount_key#28){++++}-{3:3}, at: trylock_super+0x11/0xb0
[ 1258.783155][  T352]  #3: ffff88a00cd3d228 (&pag->pag_ici_reclaim_lock){+.+.}-{3:3}, at: xfs_reclaim_inodes_ag+0x135/0xb00
[ 1258.833254][  T352]  #4: ffff8897c3b57828 (&xfs_nondir_ilock_class){++++}-{3:3}, at: xfs_reclaim_inode+0xdf/0x860
[ 1258.880242][  T352] 2 locks held by agetty/1264:
[ 1258.901291][  T352]  #0: ffff889ff8159378 (&sig->cred_guard_mutex){+.+.}-{3:3}, at: __do_execve_file+0x2f2/0x1a50
[ 1258.947820][  T352]  #1: ffffffff906d07c0 (fs_reclaim){+.+.}-{0:0}, at: fs_reclaim_acquire.part.64+0x0/0x30
[ 1258.992410][  T352] 1 lock held by crond/1509:
[ 1259.012721][  T352]  #0: ffffffff906d07c0 (fs_reclaim){+.+.}-{0:0}, at: fs_reclaim_acquire.part.64+0x0/0x30
[ 1259.057395][  T352] 1 lock held by in:imjournal/1523:
[ 1259.080494][  T352]  #0: ffff889fdeaee7c0 (&(&ip->i_mmaplock)->mr_lock){++++}-{3:3}, at: xfs_ilock+0xf3/0x370
[ 1259.127855][  T352] 3 locks held by restraintd/1520:
[ 1259.153828][  T352]  #0: ffff88a00bf4ea30 (&f->f_pos_lock){+.+.}-{3:3}, at: __fdget_pos+0x9c/0xb0
[ 1259.194996][  T352]  #1: ffff88a005f04430 (sb_writers#8){.+.+}-{0:0}, at: vfs_write+0x371/0x4d0
[ 1259.234636][  T352]  #2: ffff88838caed050 (&sb->s_type->i_mutex_key#12){++++}-{3:3}, at: xfs_ilock+0x27c/0x370
[ 1259.280335][  T352] 1 lock held by runtest.sh/1567:
[ 1259.302796][  T352]  #0: ffffffff906d07c0 (fs_reclaim){+.+.}-{0:0}, at: fs_reclaim_acquire.part.64+0x0/0x30
[ 1259.347545][  T352] 2 locks held by random/1768:
[ 1259.369261][  T352]  #0: ffff888fbda3ce58 (&mm->mmap_lock#2){++++}-{3:3}, at: do_user_addr_fault+0x167/0x90d
[ 1259.414349][  T352]  #1: ffffffff906d07c0 (fs_reclaim){+.+.}-{0:0}, at: fs_reclaim_acquire.part.64+0x0/0x30
[ 1259.458977][  T352] 2 locks held by random/1769:
[ 1259.480409][  T352]  #0: ffff888fbda3ce58 (&mm->mmap_lock#2){++++}-{3:3}, at: do_user_addr_fault+0x167/0x90d
[ 1259.525545][  T352]  #1: ffffffff906d07c0 (fs_reclaim){+.+.}-{0:0}, at: fs_reclaim_acquire.part.64+0x0/0x30
[ 1259.571368][  T352] 2 locks held by random/1770:
[ 1259.592131][  T352]  #0: ffff888fbda3ce58 (&mm->mmap_lock#2){++++}-{3:3}, at: do_user_addr_fault+0x167/0x90d
[ 1259.641941][  T352]  #1: ffffffff906d07c0 (fs_reclaim){+.+.}-{0:0}, at: fs_reclaim_acquire.part.64+0x0/0x30
[ 1259.689771][  T352] 2 locks held by random/1771:
[ 1259.710979][  T352]  #0: ffff888fbda3ce58 (&mm->mmap_lock#2){++++}-{3:3}, at: do_user_addr_fault+0x167/0x90d
[ 1259.757004][  T352]  #1: ffffffff906d07c0 (fs_reclaim){+.+.}-{0:0}, at: fs_reclaim_acquire.part.64+0x0/0x30
[ 1259.801644][  T352] 2 locks held by random/1772:
[ 1259.822791][  T352]  #0: ffff888fbda3ce58 (&mm->mmap_lock#2){++++}-{3:3}, at: do_user_addr_fault+0x167/0x90d
[ 1259.867692][  T352]  #1: ffffffff906d07c0 (fs_reclaim){+.+.}-{0:0}, at: fs_reclaim_acquire.part.64+0x0/0x30
[ 1259.912339][  T352] 2 locks held by random/1773:
[ 1259.933571][  T352]  #0: ffff888fbda3ce58 (&mm->mmap_lock#2){++++}-{3:3}, at: do_user_addr_fault+0x167/0x90d
[ 1259.980366][  T352]  #1: ffffffff906d07c0 (fs_reclaim){+.+.}-{0:0}, at: fs_reclaim_acquire.part.64+0x0/0x30
[ 1260.026630][  T352] 5 locks held by random/1774:
[ 1260.048812][  T352]  #0: ffff888fbda3ce58 (&mm->mmap_lock#2){++++}-{3:3}, at: do_user_addr_fault+0x167/0x90d
[ 1260.094328][  T352]  #1: ffffffff906d07c0 (fs_reclaim){+.+.}-{0:0}, at: fs_reclaim_acquire.part.64+0x0/0x30
[ 1260.138919][  T352]  #2: ffffffff90676850 (shrinker_rwsem){++++}-{3:3}, at: shrink_slab+0xb8/0x4b0
[ 1260.184545][  T352]  #3: ffff88a005f040e0 (&type->s_umount_key#28){++++}-{3:3}, at: trylock_super+0x11/0xb0
[ 1260.228764][  T352]  #4: ffff88a00cd3d228 (&pag->pag_ici_reclaim_lock){+.+.}-{3:3}, at: xfs_reclaim_inodes_ag+0x68c/0xb00
[ 1260.278792][  T352] 2 locks held by random/1775:
[ 1260.300785][  T352]  #0: ffff888fbda3ce58 (&mm->mmap_lock#2){++++}-{3:3}, at: do_user_addr_fault+0x167/0x90d
[ 1260.348139][  T352]  #1: ffffffff906d07c0 (fs_reclaim){+.+.}-{0:0}, at: fs_reclaim_acquire.part.64+0x0/0x30
[ 1260.393623][  T352] 1 lock held by awk/1785:
[ 1260.413577][  T352]  #0: ffff8890186467c0 (&(&ip->i_mmaplock)->mr_lock){++++}-{3:3}, at: xfs_ilock+0xf3/0x370
[ 1260.458941][  T352] 2 locks held by kworker/47:0/1815:
[ 1260.482420][  T352]  #0: ffff88a005d8b938 ((wq_completion)xfs-sync/dm-0){+.+.}-{0:0}, at: process_one_work+0x7c8/0x17c0
[ 1260.531638][  T352]  #1: ffffc9000a64fe00 ((work_completion)(&(&log->l_work)->work)){+.+.}-{0:0}, at: process_one_work+0x7fc/0x17c0
[ 1260.585778][  T352] 
[ 1260.595753][  T352] =============================================
[ 1260.595753][  T352] 

> ---
>  block/blk-flush.c  | 17 ++++++-----------
>  block/blk-mq-tag.h | 12 ------------
>  block/blk-mq.c     | 30 ++++++++++++++----------------
>  block/blk.h        |  5 -----
>  4 files changed, 20 insertions(+), 44 deletions(-)
> 
> diff --git a/block/blk-flush.c b/block/blk-flush.c
> index 21108a550fbf..e756db088d84 100644
> --- a/block/blk-flush.c
> +++ b/block/blk-flush.c
> @@ -236,12 +236,10 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
>  		error = fq->rq_status;
>  
>  	hctx = flush_rq->mq_hctx;
> -	if (!q->elevator) {
> -		blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
> -		flush_rq->tag = -1;
> -	} else {
> -		flush_rq->internal_tag = -1;
> -	}
> +	if (!q->elevator)
> +		flush_rq->tag = BLK_MQ_NO_TAG;
> +	else
> +		flush_rq->internal_tag = BLK_MQ_NO_TAG;
>  
>  	running = &fq->flush_queue[fq->flush_running_idx];
>  	BUG_ON(fq->flush_pending_idx == fq->flush_running_idx);
> @@ -315,13 +313,10 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
>  	flush_rq->mq_ctx = first_rq->mq_ctx;
>  	flush_rq->mq_hctx = first_rq->mq_hctx;
>  
> -	if (!q->elevator) {
> -		fq->orig_rq = first_rq;
> +	if (!q->elevator)
>  		flush_rq->tag = first_rq->tag;
> -		blk_mq_tag_set_rq(flush_rq->mq_hctx, first_rq->tag, flush_rq);
> -	} else {
> +	else
>  		flush_rq->internal_tag = first_rq->internal_tag;
> -	}
>  
>  	flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
>  	flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK);
> diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
> index 3945c7f5b944..b1acac518c4e 100644
> --- a/block/blk-mq-tag.h
> +++ b/block/blk-mq-tag.h
> @@ -101,18 +101,6 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
>  	return atomic_read(&hctx->nr_active) < depth;
>  }
>  
> -/*
> - * This helper should only be used for flush request to share tag
> - * with the request cloned from, and both the two requests can't be
> - * in flight at the same time. The caller has to make sure the tag
> - * can't be freed.
> - */
> -static inline void blk_mq_tag_set_rq(struct blk_mq_hw_ctx *hctx,
> -		unsigned int tag, struct request *rq)
> -{
> -	hctx->tags->rqs[tag] = rq;
> -}
> -
>  static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags,
>  					  unsigned int tag)
>  {
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index e7a2d2d44b51..a20647139c95 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -277,26 +277,20 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
>  {
>  	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
>  	struct request *rq = tags->static_rqs[tag];
> -	req_flags_t rq_flags = 0;
>  
>  	if (data->q->elevator) {
>  		rq->tag = BLK_MQ_NO_TAG;
>  		rq->internal_tag = tag;
>  	} else {
> -		if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
> -			rq_flags = RQF_MQ_INFLIGHT;
> -			atomic_inc(&data->hctx->nr_active);
> -		}
>  		rq->tag = tag;
>  		rq->internal_tag = BLK_MQ_NO_TAG;
> -		data->hctx->tags->rqs[rq->tag] = rq;
>  	}
>  
>  	/* csd/requeue_work/fifo_time is initialized before use */
>  	rq->q = data->q;
>  	rq->mq_ctx = data->ctx;
>  	rq->mq_hctx = data->hctx;
> -	rq->rq_flags = rq_flags;
> +	rq->rq_flags = 0;
>  	rq->cmd_flags = data->cmd_flags;
>  	if (data->flags & BLK_MQ_REQ_PREEMPT)
>  		rq->rq_flags |= RQF_PREEMPT;
> @@ -1127,9 +1121,10 @@ static bool __blk_mq_get_driver_tag(struct request *rq)
>  {
>  	struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
>  	unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
> -	bool shared = blk_mq_tag_busy(rq->mq_hctx);
>  	int tag;
>  
> +	blk_mq_tag_busy(rq->mq_hctx);
> +
>  	if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
>  		bt = &rq->mq_hctx->tags->breserved_tags;
>  		tag_offset = 0;
> @@ -1142,19 +1137,22 @@ static bool __blk_mq_get_driver_tag(struct request *rq)
>  		return false;
>  
>  	rq->tag = tag + tag_offset;
> -	if (shared) {
> -		rq->rq_flags |= RQF_MQ_INFLIGHT;
> -		atomic_inc(&rq->mq_hctx->nr_active);
> -	}
> -	rq->mq_hctx->tags->rqs[rq->tag] = rq;
>  	return true;
>  }
>  
>  static bool blk_mq_get_driver_tag(struct request *rq)
>  {
> -	if (rq->tag != BLK_MQ_NO_TAG)
> -		return true;
> -	return __blk_mq_get_driver_tag(rq);
> +	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
> +
> +	if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq))
> +		return false;
> +
> +	if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
> +		rq->rq_flags |= RQF_MQ_INFLIGHT;
> +		atomic_inc(&hctx->nr_active);
> +	}
> +	hctx->tags->rqs[rq->tag] = rq;
> +	return true;
>  }
>  
>  static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
> diff --git a/block/blk.h b/block/blk.h
> index 41a50880c94e..0184a31fe4df 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -25,11 +25,6 @@ struct blk_flush_queue {
>  	struct list_head	flush_data_in_flight;
>  	struct request		*flush_rq;
>  
> -	/*
> -	 * flush_rq shares tag with this rq, both can't be active
> -	 * at the same time
> -	 */
> -	struct request		*orig_rq;
>  	struct lock_class_key	key;
>  	spinlock_t		mq_flush_lock;
>  };
> -- 
> 2.25.2
> 


* Re: [PATCH V3 3/3] blk-mq: centralise related handling into blk_mq_get_driver_tag
From: Ming Lei @ 2020-07-02  6:20 UTC
  To: Qian Cai; +Cc: Jens Axboe, linux-block, Christoph Hellwig, m.szyprowski

On Thu, Jul 02, 2020 at 12:37:21AM -0400, Qian Cai wrote:
> On Tue, Jun 30, 2020 at 10:03:57PM +0800, Ming Lei wrote:
> > Move the .nr_active update and the request assignment into
> > blk_mq_get_driver_tag(); both are natural to do while getting the
> > driver tag.
> > 
> > Meanwhile the blk-flush related code is simplified and the flush
> > request no longer needs to update the request table manually.
> > 
> > Cc: Christoph Hellwig <hch@infradead.org>
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> 
> Reverting this commit on top of next-20200701 fixed an issue where
> swapping is unable to make progress for hours, whereas the same
> workload takes only 5 minutes after the revert.

Hi Qian,

Could you apply the following patch and see if it makes a difference?

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 65e0846fd065..e89ce9ae51fd 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1147,7 +1147,8 @@ static bool blk_mq_get_driver_tag(struct request *rq)
 	if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq))
 		return false;
 
-	if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
+	if ((hctx->flags & BLK_MQ_F_TAG_SHARED) &&
+			!(rq->rq_flags & RQF_MQ_INFLIGHT)) {
 		rq->rq_flags |= RQF_MQ_INFLIGHT;
 		atomic_inc(&hctx->nr_active);
 	}
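
The theory behind the extra check (my reading; the test will confirm):
blk_mq_get_driver_tag() can be called again for a request that kept its
driver tag, e.g. a requeued request going through dispatch a second
time. Without the RQF_MQ_INFLIGHT test, every such call bumps nr_active
while blk_mq_put_driver_tag() decrements only once, so nr_active drifts
upwards until hctx_may_queue() refuses all tags. A reduced user-space
model of the imbalance (illustrative, not kernel code):

#include <assert.h>

#define RQF_MQ_INFLIGHT 0x1

struct rq { unsigned int flags; };

static int nr_active;

/* Models the shared-tag accounting, with and without the new guard. */
static void get_tag(struct rq *rq, int guarded)
{
	if (!guarded || !(rq->flags & RQF_MQ_INFLIGHT)) {
		rq->flags |= RQF_MQ_INFLIGHT;
		nr_active++;
	}
}

static void put_tag(struct rq *rq)
{
	if (rq->flags & RQF_MQ_INFLIGHT) {
		rq->flags &= ~RQF_MQ_INFLIGHT;
		nr_active--;
	}
}

int main(void)
{
	struct rq r = { 0 };

	get_tag(&r, 0);		/* first dispatch */
	get_tag(&r, 0);		/* re-dispatch, unguarded */
	put_tag(&r);
	assert(nr_active == 1);	/* leaked increment */

	nr_active = 0;
	r.flags = 0;
	get_tag(&r, 1);
	get_tag(&r, 1);		/* re-dispatch, guarded: no-op */
	put_tag(&r);
	assert(nr_active == 0);	/* balanced */
	return 0;
}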


Thanks,
Ming


