On Tue, Oct 17 2017, Shaohua Li wrote: > On Tue, Oct 17, 2017 at 01:46:43PM +1100, Neil Brown wrote: >> responding to ->suspend_lo and ->suspend_hi is similar >> to responding to ->suspended. It is best to wait in >> the common core code without incrementing ->active_io. >> This allows mddev_suspend()/mddev_resume() to work while >> requests are waiting for suspend_lo/hi to change. >> This is will be important after a subsequent patch >> which uses mddev_suspend() to synchronize updating for >> suspend_lo/hi. >> >> So move the code for testing suspend_lo/hi out of raid1.c >> and raid5.c, and place it in md.c >> >> Signed-off-by: NeilBrown >> --- >> drivers/md/md.c | 29 +++++++++++++++++++++++------ >> drivers/md/raid1.c | 12 ++++-------- >> drivers/md/raid5.c | 22 ---------------------- >> 3 files changed, 27 insertions(+), 36 deletions(-) >> >> diff --git a/drivers/md/md.c b/drivers/md/md.c >> index 308456842d3e..f8d0e41120cb 100644 >> --- a/drivers/md/md.c >> +++ b/drivers/md/md.c >> @@ -266,16 +266,31 @@ static DEFINE_SPINLOCK(all_mddevs_lock); >> * call has finished, the bio has been linked into some internal structure >> * and so is visible to ->quiesce(), so we don't need the refcount any more. >> */ >> +static bool is_suspended(struct mddev *mddev, struct bio *bio) >> +{ >> + if (mddev->suspended) >> + return true; >> + if (bio_data_dir(bio) != WRITE) >> + return false; >> + if (mddev->suspend_lo >= mddev->suspend_hi) >> + return false; >> + if (bio->bi_iter.bi_sector >= mddev->suspend_hi) >> + return false; >> + if (bio_end_sector(bio) < mddev->suspend_lo) >> + return false; >> + return true; >> +} >> + >> void md_handle_request(struct mddev *mddev, struct bio *bio) >> { >> check_suspended: >> rcu_read_lock(); >> - if (mddev->suspended) { >> + if (is_suspended(mddev, bio)) { >> DEFINE_WAIT(__wait); >> for (;;) { >> prepare_to_wait(&mddev->sb_wait, &__wait, >> TASK_UNINTERRUPTIBLE); >> - if (!mddev->suspended) >> + if (!is_suspended(mddev, bio)) >> break; >> rcu_read_unlock(); >> schedule(); >> @@ -4843,10 +4858,11 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) >> goto unlock; >> old = mddev->suspend_lo; >> mddev->suspend_lo = new; >> - if (new >= old) >> + if (new >= old) { >> /* Shrinking suspended region */ >> + wake_up(&mddev->sb_wait); >> mddev->pers->quiesce(mddev, 2); > > Do we still need the quiesce(mddev, 2)? sounds not to me. No, I don't think we do - thanks for noticing that;. > >> - else { >> + } else { >> /* Expanding suspended region - need to wait */ >> mddev->pers->quiesce(mddev, 1); >> mddev->pers->quiesce(mddev, 0); >> @@ -4886,10 +4902,11 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) >> goto unlock; >> old = mddev->suspend_hi; >> mddev->suspend_hi = new; >> - if (new <= old) >> + if (new <= old) { >> /* Shrinking suspended region */ >> + wake_up(&mddev->sb_wait); >> mddev->pers->quiesce(mddev, 2); >> - else { >> + } else { >> /* Expanding suspended region - need to wait */ >> mddev->pers->quiesce(mddev, 1); >> mddev->pers->quiesce(mddev, 0); >> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c >> index f3f3e40dc9d8..277a145b3ff5 100644 >> --- a/drivers/md/raid1.c >> +++ b/drivers/md/raid1.c >> @@ -1310,11 +1310,9 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, >> */ >> >> >> - if ((bio_end_sector(bio) > mddev->suspend_lo && >> - bio->bi_iter.bi_sector < mddev->suspend_hi) || >> - (mddev_is_clustered(mddev) && >> + if (mddev_is_clustered(mddev) && >> md_cluster_ops->area_resyncing(mddev, WRITE, >> - bio->bi_iter.bi_sector, bio_end_sector(bio)))) { >> + bio->bi_iter.bi_sector, bio_end_sector(bio))) { >> >> /* >> * As the suspend_* range is controlled by userspace, we want >> @@ -1325,12 +1323,10 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, >> sigset_t full, old; >> prepare_to_wait(&conf->wait_barrier, >> &w, TASK_INTERRUPTIBLE); >> - if (bio_end_sector(bio) <= mddev->suspend_lo || >> - bio->bi_iter.bi_sector >= mddev->suspend_hi || >> - (mddev_is_clustered(mddev) && >> + if (mddev_is_clustered(mddev) && >> !md_cluster_ops->area_resyncing(mddev, WRITE, >> bio->bi_iter.bi_sector, >> - bio_end_sector(bio)))) >> + bio_end_sector(bio))) >> break; >> sigfillset(&full); >> sigprocmask(SIG_BLOCK, &full, &old); >> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c >> index 928e24a07133..4f40ccd21cbb 100644 >> --- a/drivers/md/raid5.c >> +++ b/drivers/md/raid5.c >> @@ -5682,28 +5682,6 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) >> goto retry; >> } >> >> - if (rw == WRITE && >> - logical_sector >= mddev->suspend_lo && >> - logical_sector < mddev->suspend_hi) { >> - raid5_release_stripe(sh); >> - /* As the suspend_* range is controlled by >> - * userspace, we want an interruptible >> - * wait. >> - */ >> - prepare_to_wait(&conf->wait_for_overlap, >> - &w, TASK_INTERRUPTIBLE); >> - if (logical_sector >= mddev->suspend_lo && >> - logical_sector < mddev->suspend_hi) { >> - sigset_t full, old; >> - sigfillset(&full); >> - sigprocmask(SIG_BLOCK, &full, &old); >> - schedule(); >> - sigprocmask(SIG_SETMASK, &old, NULL); >> - do_prepare = true; >> - } >> - goto retry; >> - } >> - >> if (test_bit(STRIPE_EXPANDING, &sh->state) || >> !add_stripe_bio(sh, bi, dd_idx, rw, previous)) { >> /* Stripe is busy expanding or > > I think we can remove the state == 2 branch in raid5_quiesce(). > Yes. > This leaves only raid1 and md-cluster needs the quiesce(2) calls. In the > future, we probably should abstract that md-cluster logic to separate API and > let raid1 use it for waitting. The quiesce(2) is a strange name for the wait > purpose. Agreed. I'll try to find a nice solution. Thanks, NeilBrown