All of lore.kernel.org
 help / color / mirror / Atom feed
From: NeilBrown <neilb@suse.com>
To: Shaohua Li <shli@kernel.org>
Cc: Nix <nix@esperi.org.uk>, linux-raid@vger.kernel.org
Subject: [PATCH] md: fix deadlock between mddev_suspend() and md_write_start()
Date: Mon, 05 Jun 2017 16:49:39 +1000	[thread overview]
Message-ID: <87wp8rc4jg.fsf@notabene.neil.brown.name> (raw)
In-Reply-To: <20170530174133.qdj2qsopwug2opp6@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 14893 bytes --]


If mddev_suspend() races with md_write_start() we can deadlock
with mddev_suspend() waiting for the request that is currently
in md_write_start() to complete the ->make_request() call,
and md_write_start() waiting for the metadata to be updated
to mark the array as 'dirty'.
As metadata updates done by md_check_recovery() only happen then
the mddev_lock() can be claimed, and as mddev_suspend() is often
called with the lock held, these threads wait indefinitely for each
other.

We fix this by having md_write_start() abort if mddev_suspend()
is happening, and ->make_request() aborts if md_write_start()
aborted.
md_make_request() can detect this abort, decrease the ->active_io
count, and wait for mddev_suspend().

Reported-by: Nix <nix@esperi.org.uk>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/faulty.c    |  5 +++--
 drivers/md/linear.c    |  7 ++++---
 drivers/md/md.c        | 23 +++++++++++++++++------
 drivers/md/md.h        |  4 ++--
 drivers/md/multipath.c |  8 ++++----
 drivers/md/raid0.c     |  7 ++++---
 drivers/md/raid1.c     | 11 +++++++----
 drivers/md/raid10.c    | 10 ++++++----
 drivers/md/raid5.c     | 17 +++++++++--------
 9 files changed, 56 insertions(+), 36 deletions(-)

diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index b0536cfd8e17..06a64d5d8c6c 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -170,7 +170,7 @@ static void add_sector(struct faulty_conf *conf, sector_t start, int mode)
 		conf->nfaults = n+1;
 }
 
-static void faulty_make_request(struct mddev *mddev, struct bio *bio)
+static bool faulty_make_request(struct mddev *mddev, struct bio *bio)
 {
 	struct faulty_conf *conf = mddev->private;
 	int failit = 0;
@@ -182,7 +182,7 @@ static void faulty_make_request(struct mddev *mddev, struct bio *bio)
 			 * just fail immediately
 			 */
 			bio_io_error(bio);
-			return;
+			return true;
 		}
 
 		if (check_sector(conf, bio->bi_iter.bi_sector,
@@ -224,6 +224,7 @@ static void faulty_make_request(struct mddev *mddev, struct bio *bio)
 		bio->bi_bdev = conf->rdev->bdev;
 
 	generic_make_request(bio);
+	return true;
 }
 
 static void faulty_status(struct seq_file *seq, struct mddev *mddev)
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index df6f2c98eca7..5f1eb9189542 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -245,7 +245,7 @@ static void linear_free(struct mddev *mddev, void *priv)
 	kfree(conf);
 }
 
-static void linear_make_request(struct mddev *mddev, struct bio *bio)
+static bool linear_make_request(struct mddev *mddev, struct bio *bio)
 {
 	char b[BDEVNAME_SIZE];
 	struct dev_info *tmp_dev;
@@ -254,7 +254,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
 
 	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
 		md_flush_request(mddev, bio);
-		return;
+		return true;
 	}
 
 	tmp_dev = which_dev(mddev, bio_sector);
@@ -292,7 +292,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
 		mddev_check_write_zeroes(mddev, bio);
 		generic_make_request(bio);
 	}
-	return;
+	return true;
 
 out_of_bounds:
 	pr_err("md/linear:%s: make_request: Sector %llu out of bounds on dev %s: %llu sectors, offset %llu\n",
@@ -302,6 +302,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
 	       (unsigned long long)tmp_dev->rdev->sectors,
 	       (unsigned long long)start_sector);
 	bio_io_error(bio);
+	return true;
 }
 
 static void linear_status (struct seq_file *seq, struct mddev *mddev)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 23f4adf3a8cc..da59051545a2 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -277,7 +277,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
 		bio_endio(bio);
 		return BLK_QC_T_NONE;
 	}
-	smp_rmb(); /* Ensure implications of  'active' are visible */
+check_suspended:
 	rcu_read_lock();
 	if (mddev->suspended) {
 		DEFINE_WAIT(__wait);
@@ -302,7 +302,11 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
 	sectors = bio_sectors(bio);
 	/* bio could be mergeable after passing to underlayer */
 	bio->bi_opf &= ~REQ_NOMERGE;
-	mddev->pers->make_request(mddev, bio);
+	if (!mddev->pers->make_request(mddev, bio)) {
+		atomic_dec(&mddev->active_io);
+		wake_up(&mddev->sb_wait);
+		goto check_suspended;
+	}
 
 	cpu = part_stat_lock();
 	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
@@ -327,6 +331,7 @@ void mddev_suspend(struct mddev *mddev)
 	if (mddev->suspended++)
 		return;
 	synchronize_rcu();
+	wake_up(&mddev->sb_wait);
 	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
 	mddev->pers->quiesce(mddev, 1);
 
@@ -7950,12 +7955,14 @@ EXPORT_SYMBOL(md_done_sync);
  * If we need to update some array metadata (e.g. 'active' flag
  * in superblock) before writing, schedule a superblock update
  * and wait for it to complete.
+ * A return value of 'false' means that the write wasn't recorded
+ * and cannot proceed as the array is being suspend.
  */
-void md_write_start(struct mddev *mddev, struct bio *bi)
+bool md_write_start(struct mddev *mddev, struct bio *bi)
 {
 	int did_change = 0;
 	if (bio_data_dir(bi) != WRITE)
-		return;
+		return true;
 
 	BUG_ON(mddev->ro == 1);
 	if (mddev->ro == 2) {
@@ -7987,9 +7994,13 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
 	if (did_change)
 		sysfs_notify_dirent_safe(mddev->sysfs_state);
 	wait_event(mddev->sb_wait,
-		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
+		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) && !mddev->suspended);
+	if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
+		percpu_ref_put(&mddev->writes_pending);
+		return false;
+	}
+	return true;
 }
-EXPORT_SYMBOL(md_write_start);
 
 /* md_write_inc can only be called when md_write_start() has
  * already been called at least once of the current request.
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 0fa1de42c42b..63d342d560b8 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -510,7 +510,7 @@ struct md_personality
 	int level;
 	struct list_head list;
 	struct module *owner;
-	void (*make_request)(struct mddev *mddev, struct bio *bio);
+	bool (*make_request)(struct mddev *mddev, struct bio *bio);
 	int (*run)(struct mddev *mddev);
 	void (*free)(struct mddev *mddev, void *priv);
 	void (*status)(struct seq_file *seq, struct mddev *mddev);
@@ -649,7 +649,7 @@ extern void md_wakeup_thread(struct md_thread *thread);
 extern void md_check_recovery(struct mddev *mddev);
 extern void md_reap_sync_thread(struct mddev *mddev);
 extern int mddev_init_writes_pending(struct mddev *mddev);
-extern void md_write_start(struct mddev *mddev, struct bio *bi);
+extern bool md_write_start(struct mddev *mddev, struct bio *bi);
 extern void md_write_inc(struct mddev *mddev, struct bio *bi);
 extern void md_write_end(struct mddev *mddev);
 extern void md_done_sync(struct mddev *mddev, int blocks, int ok);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index e95d521d93e9..c8d985ba008d 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -106,7 +106,7 @@ static void multipath_end_request(struct bio *bio)
 	rdev_dec_pending(rdev, conf->mddev);
 }
 
-static void multipath_make_request(struct mddev *mddev, struct bio * bio)
+static bool multipath_make_request(struct mddev *mddev, struct bio * bio)
 {
 	struct mpconf *conf = mddev->private;
 	struct multipath_bh * mp_bh;
@@ -114,7 +114,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
 
 	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
 		md_flush_request(mddev, bio);
-		return;
+		return true;
 	}
 
 	mp_bh = mempool_alloc(conf->pool, GFP_NOIO);
@@ -126,7 +126,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
 	if (mp_bh->path < 0) {
 		bio_io_error(bio);
 		mempool_free(mp_bh, conf->pool);
-		return;
+		return true;
 	}
 	multipath = conf->multipaths + mp_bh->path;
 
@@ -141,7 +141,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
 	mddev_check_writesame(mddev, &mp_bh->bio);
 	mddev_check_write_zeroes(mddev, &mp_bh->bio);
 	generic_make_request(&mp_bh->bio);
-	return;
+	return true;
 }
 
 static void multipath_status(struct seq_file *seq, struct mddev *mddev)
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index d6c0bc76e837..94d9ae9b0fd0 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -548,7 +548,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
 	bio_endio(bio);
 }
 
-static void raid0_make_request(struct mddev *mddev, struct bio *bio)
+static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
 {
 	struct strip_zone *zone;
 	struct md_rdev *tmp_dev;
@@ -559,12 +559,12 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 
 	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
 		md_flush_request(mddev, bio);
-		return;
+		return true;
 	}
 
 	if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) {
 		raid0_handle_discard(mddev, bio);
-		return;
+		return true;
 	}
 
 	bio_sector = bio->bi_iter.bi_sector;
@@ -599,6 +599,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 	mddev_check_writesame(mddev, bio);
 	mddev_check_write_zeroes(mddev, bio);
 	generic_make_request(bio);
+	return true;
 }
 
 static void raid0_status(struct seq_file *seq, struct mddev *mddev)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index e1a7e3d4c5e4..c71739b87ab7 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1321,7 +1321,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 	 * Continue immediately if no resync is active currently.
 	 */
 
-	md_write_start(mddev, bio); /* wait on superblock update early */
 
 	if ((bio_end_sector(bio) > mddev->suspend_lo &&
 	    bio->bi_iter.bi_sector < mddev->suspend_hi) ||
@@ -1550,13 +1549,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 	wake_up(&conf->wait_barrier);
 }
 
-static void raid1_make_request(struct mddev *mddev, struct bio *bio)
+static bool raid1_make_request(struct mddev *mddev, struct bio *bio)
 {
 	sector_t sectors;
 
 	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
 		md_flush_request(mddev, bio);
-		return;
+		return true;
 	}
 
 	/*
@@ -1571,8 +1570,12 @@ static void raid1_make_request(struct mddev *mddev, struct bio *bio)
 
 	if (bio_data_dir(bio) == READ)
 		raid1_read_request(mddev, bio, sectors, NULL);
-	else
+	else {
+		if (!md_write_start(mddev,bio))
+			return false;
 		raid1_write_request(mddev, bio, sectors);
+	}
+	return true;
 }
 
 static void raid1_status(struct seq_file *seq, struct mddev *mddev)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 797ed60abd5e..52acffa7a06a 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1303,8 +1303,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 	sector_t sectors;
 	int max_sectors;
 
-	md_write_start(mddev, bio);
-
 	/*
 	 * Register the new request and wait if the reconstruction
 	 * thread has put up a bar for new requests.
@@ -1525,7 +1523,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
 		raid10_write_request(mddev, bio, r10_bio);
 }
 
-static void raid10_make_request(struct mddev *mddev, struct bio *bio)
+static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
 {
 	struct r10conf *conf = mddev->private;
 	sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
@@ -1534,9 +1532,12 @@ static void raid10_make_request(struct mddev *mddev, struct bio *bio)
 
 	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
 		md_flush_request(mddev, bio);
-		return;
+		return true;
 	}
 
+	if (!md_write_start(mddev, bio))
+		return false;
+
 	/*
 	 * If this request crosses a chunk boundary, we need to split
 	 * it.
@@ -1553,6 +1554,7 @@ static void raid10_make_request(struct mddev *mddev, struct bio *bio)
 
 	/* In case raid10d snuck in to freeze_array */
 	wake_up(&conf->wait_barrier);
+	return true;
 }
 
 static void raid10_status(struct seq_file *seq, struct mddev *mddev)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 861fc9a8d1be..ad1a644a17bc 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5469,7 +5469,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 	last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
 
 	bi->bi_next = NULL;
-	md_write_start(mddev, bi);
 
 	stripe_sectors = conf->chunk_sectors *
 		(conf->raid_disks - conf->max_degraded);
@@ -5539,11 +5538,10 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 		release_stripe_plug(mddev, sh);
 	}
 
-	md_write_end(mddev);
 	bio_endio(bi);
 }
 
-static void raid5_make_request(struct mddev *mddev, struct bio * bi)
+static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 {
 	struct r5conf *conf = mddev->private;
 	int dd_idx;
@@ -5559,10 +5557,10 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 		int ret = r5l_handle_flush_request(conf->log, bi);
 
 		if (ret == 0)
-			return;
+			return true;
 		if (ret == -ENODEV) {
 			md_flush_request(mddev, bi);
-			return;
+			return true;
 		}
 		/* ret == -EAGAIN, fallback */
 		/*
@@ -5572,6 +5570,8 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 		do_flush = bi->bi_opf & REQ_PREFLUSH;
 	}
 
+	if (!md_write_start(mddev, bi))
+		return false;
 	/*
 	 * If array is degraded, better not do chunk aligned read because
 	 * later we might have to read it again in order to reconstruct
@@ -5581,18 +5581,18 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 	    mddev->reshape_position == MaxSector) {
 		bi = chunk_aligned_read(mddev, bi);
 		if (!bi)
-			return;
+			return true;
 	}
 
 	if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) {
 		make_discard_request(mddev, bi);
-		return;
+		md_write_end(mddev);
+		return true;
 	}
 
 	logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
 	last_sector = bio_end_sector(bi);
 	bi->bi_next = NULL;
-	md_write_start(mddev, bi);
 
 	prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
 	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
@@ -5730,6 +5730,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 	if (rw == WRITE)
 		md_write_end(mddev);
 	bio_endio(bi);
+	return true;
 }
 
 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
-- 
2.12.2


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

  reply	other threads:[~2017-06-05  6:49 UTC|newest]

Thread overview: 29+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-05-22  9:13 4.11.2: reshape raid5 -> raid6 atop bcache deadlocks at start on md_attr_store / raid5_make_request Nix
2017-05-22 11:35 ` NeilBrown
2017-05-22 15:30   ` Nix
2017-05-22 19:07     ` Wols Lists
2017-05-22 20:43       ` Nix
2017-05-23  1:20         ` NeilBrown
2017-05-23 10:10           ` Nix
2017-05-23  1:39       ` NeilBrown
2017-05-23 14:47         ` Wols Lists
2017-05-24  1:50           ` NeilBrown
2017-05-23  1:07     ` NeilBrown
2017-05-22 21:38   ` Nix
2017-05-23 14:16     ` Wols Lists
2017-05-23 15:00       ` Nix
2017-05-24  1:24     ` NeilBrown
2017-05-24 13:28       ` Nix
2017-05-25  1:31         ` NeilBrown
2017-05-25 12:14           ` Nix
2017-05-24 19:42       ` Nix
2017-05-24 22:57       ` Shaohua Li
2017-05-25  1:30         ` NeilBrown
2017-05-25  1:46           ` Shaohua Li
2017-05-26  3:23             ` NeilBrown
2017-05-26 16:40               ` Shaohua Li
2017-05-28 23:17         ` NeilBrown
2017-05-30 17:41           ` Shaohua Li
2017-06-05  6:49             ` NeilBrown [this message]
2017-06-06  0:01               ` [PATCH] md: fix deadlock between mddev_suspend() and md_write_start() Shaohua Li
2017-06-07  1:45                 ` NeilBrown

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=87wp8rc4jg.fsf@notabene.neil.brown.name \
    --to=neilb@suse.com \
    --cc=linux-raid@vger.kernel.org \
    --cc=nix@esperi.org.uk \
    --cc=shli@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.