All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] raid5-cache: update superblock at shutdown/reboot
@ 2016-11-17  1:20 Shaohua Li
  2016-11-17  1:20 ` [PATCH 2/2] raid5-cache: fix lockdep warning Shaohua Li
  2016-11-17  5:18 ` [PATCH 1/2] raid5-cache: update superblock at shutdown/reboot NeilBrown
  0 siblings, 2 replies; 10+ messages in thread
From: Shaohua Li @ 2016-11-17  1:20 UTC (permalink / raw)
  To: linux-raid; +Cc: songliubraving, neilb, Zhengyuan Liu

Currently raid5-cache updates the superblock in .quiesce. But since at
shutdown/reboot, .quiesce is called with the reconfig mutex locked, the
superblock update isn't guaranteed to be done in the reclaim thread (see
8e018c21da3). This will make assembly do unnecessary journal recovery.
It doesn't corrupt data but is annoying.  This adds an extra hook to
guarantee the journal is flushed to the raid disks.  And since this hook
is called before the superblock update, this guarantees we have an
up-to-date superblock at shutdown/reboot.

Signed-off-by: Shaohua Li <shli@fb.com>
Cc: Zhengyuan Liu <liuzhengyuan@kylinos.cn>
Cc: NeilBrown <neilb@suse.com>
---
 drivers/md/md.c          |  3 +++
 drivers/md/md.h          |  2 ++
 drivers/md/raid5-cache.c | 19 ++++++++++++++++++-
 drivers/md/raid5.c       |  3 +++
 drivers/md/raid5.h       |  1 +
 5 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1f1c7f0..155dd42 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5471,6 +5471,9 @@ static void __md_stop_writes(struct mddev *mddev)
 
 	del_timer_sync(&mddev->safemode_timer);
 
+	if (mddev->pers && mddev->pers->stop_writes)
+		mddev->pers->stop_writes(mddev);
+
 	bitmap_flush(mddev);
 
 	if (mddev->ro == 0 &&
diff --git a/drivers/md/md.h b/drivers/md/md.h
index af6b33c..8da6fe9 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -512,6 +512,8 @@ struct md_personality
 	/* congested implements bdi.congested_fn().
 	 * Will not be called while array is 'suspended' */
 	int (*congested)(struct mddev *mddev, int bits);
+	/* stop touching raid disks */
+	void (*stop_writes)(struct mddev *mddev);
 };
 
 struct md_sysfs_entry {
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 2e270e6..641dec8 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -738,11 +738,13 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log,
 
 static void r5l_do_reclaim(struct r5l_log *log)
 {
+	static DEFINE_MUTEX(lock);
 	sector_t reclaim_target = xchg(&log->reclaim_target, 0);
 	sector_t reclaimable;
 	sector_t next_checkpoint;
 	u64 next_cp_seq;
 
+	mutex_lock(&lock);
 	spin_lock_irq(&log->io_list_lock);
 	/*
 	 * move proper io_unit to reclaim list. We should not change the order.
@@ -769,8 +771,10 @@ static void r5l_do_reclaim(struct r5l_log *log)
 	spin_unlock_irq(&log->io_list_lock);
 
 	BUG_ON(reclaimable < 0);
-	if (reclaimable == 0)
+	if (reclaimable == 0) {
+		mutex_unlock(&lock);
 		return;
+	}
 
 	/*
 	 * write_super will flush cache of each raid disk. We must write super
@@ -784,6 +788,8 @@ static void r5l_do_reclaim(struct r5l_log *log)
 	log->last_cp_seq = next_cp_seq;
 	mutex_unlock(&log->io_mutex);
 
+	mutex_unlock(&lock);
+
 	r5l_run_no_space_stripes(log);
 }
 
@@ -836,6 +842,17 @@ void r5l_quiesce(struct r5l_log *log, int state)
 	}
 }
 
+void r5l_stop_writes(struct mddev *mddev)
+{
+	struct r5conf *conf = mddev->private;
+	struct r5l_log *log = conf->log;
+
+	if (!log)
+		return;
+	r5l_wake_reclaim(log, -1L);
+	r5l_do_reclaim(log);
+}
+
 bool r5l_log_disk_error(struct r5conf *conf)
 {
 	struct r5l_log *log;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index df88656..3d27338 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7866,6 +7866,7 @@ static struct md_personality raid6_personality =
 	.quiesce	= raid5_quiesce,
 	.takeover	= raid6_takeover,
 	.congested	= raid5_congested,
+	.stop_writes	= r5l_stop_writes,
 };
 static struct md_personality raid5_personality =
 {
@@ -7889,6 +7890,7 @@ static struct md_personality raid5_personality =
 	.quiesce	= raid5_quiesce,
 	.takeover	= raid5_takeover,
 	.congested	= raid5_congested,
+	.stop_writes	= r5l_stop_writes,
 };
 
 static struct md_personality raid4_personality =
@@ -7913,6 +7915,7 @@ static struct md_personality raid4_personality =
 	.quiesce	= raid5_quiesce,
 	.takeover	= raid4_takeover,
 	.congested	= raid5_congested,
+	.stop_writes	= r5l_stop_writes,
 };
 
 static int __init raid5_init(void)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 57ec49f..0643cc0 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -633,4 +633,5 @@ extern void r5l_stripe_write_finished(struct stripe_head *sh);
 extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
 extern void r5l_quiesce(struct r5l_log *log, int state);
 extern bool r5l_log_disk_error(struct r5conf *conf);
+extern void r5l_stop_writes(struct mddev *mddev);
 #endif
-- 
2.9.3


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 2/2] raid5-cache: fix lockdep warning
  2016-11-17  1:20 [PATCH 1/2] raid5-cache: update superblock at shutdown/reboot Shaohua Li
@ 2016-11-17  1:20 ` Shaohua Li
  2016-11-17  5:18 ` [PATCH 1/2] raid5-cache: update superblock at shutdown/reboot NeilBrown
  1 sibling, 0 replies; 10+ messages in thread
From: Shaohua Li @ 2016-11-17  1:20 UTC (permalink / raw)
  To: linux-raid; +Cc: songliubraving, neilb

lockdep reports warning of the rcu_dereference usage. Using normal rdev
access pattern to avoid the warning.

Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5-cache.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 641dec8..fe62578 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -987,16 +987,28 @@ static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
 			continue;
 
 		/* in case device is broken */
+		rcu_read_lock();
 		rdev = rcu_dereference(conf->disks[disk_index].rdev);
-		if (rdev)
+		if (rdev) {
+			atomic_inc(&rdev->nr_pending);
+			rcu_read_unlock();
 			sync_page_io(rdev, stripe_sect, PAGE_SIZE,
 				     sh->dev[disk_index].page, REQ_OP_WRITE, 0,
 				     false);
+			rdev_dec_pending(rdev, rdev->mddev);
+			rcu_read_lock();
+		}
 		rrdev = rcu_dereference(conf->disks[disk_index].replacement);
-		if (rrdev)
+		if (rrdev) {
+			atomic_inc(&rrdev->nr_pending);
+			rcu_read_unlock();
 			sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
 				     sh->dev[disk_index].page, REQ_OP_WRITE, 0,
 				     false);
+			rdev_dec_pending(rrdev, rrdev->mddev);
+			rcu_read_lock();
+		}
+		rcu_read_unlock();
 	}
 	raid5_release_stripe(sh);
 	return 0;
-- 
2.9.3


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/2] raid5-cache: update superblock at shutdown/reboot
  2016-11-17  1:20 [PATCH 1/2] raid5-cache: update superblock at shutdown/reboot Shaohua Li
  2016-11-17  1:20 ` [PATCH 2/2] raid5-cache: fix lockdep warning Shaohua Li
@ 2016-11-17  5:18 ` NeilBrown
  2016-11-17  9:44   ` Wols Lists
  2016-11-17 18:13   ` Shaohua Li
  1 sibling, 2 replies; 10+ messages in thread
From: NeilBrown @ 2016-11-17  5:18 UTC (permalink / raw)
  To: Shaohua Li, linux-raid; +Cc: songliubraving, Zhengyuan Liu

[-- Attachment #1: Type: text/plain, Size: 5486 bytes --]

On Thu, Nov 17 2016, Shaohua Li wrote:

> Currently raid5-cache update superblock in .quiesce. But since at
> shutdown/reboot, .quiesce is called with reconfig mutex locked,
> superblock isn't guaranteed to be called in reclaim thread (see
> 8e018c21da3). This will make assemble do unnecessary journal recovery.
> It doesn't corrupt data but is annoying.  This adds an extra hook to
> guarantee journal is flushed to raid disks.  And since this hook is
> called before superblock update, this will guarantee we have a uptodate
> superblock in shutdown/reboot

Hi.
I don't quite follow some of the reasoning here.
In particular, the ->stop_writes() that you have implemented
does almost exactly the same thing as r5l_quiesce(1).
So why not simply call ->quiesce(mddev, 1) in __md_stop_writes()??
You probably need to also call ->quiesce(mddev, 0) to keep things
balanced.

Also you have introduced a static mutex (which isn't my favourite sort
of thing) without giving any explanation why in the changelog comment.
So I cannot easily see if that addition is at all justified.

Thanks,
NeilBrown

>
> Signed-off-by: Shaohua Li <shli@fb.com>
> Cc: Zhengyuan Liu <liuzhengyuan@kylinos.cn>
> Cc: NeilBrown <neilb@suse.com>
> ---
>  drivers/md/md.c          |  3 +++
>  drivers/md/md.h          |  2 ++
>  drivers/md/raid5-cache.c | 19 ++++++++++++++++++-
>  drivers/md/raid5.c       |  3 +++
>  drivers/md/raid5.h       |  1 +
>  5 files changed, 27 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 1f1c7f0..155dd42 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -5471,6 +5471,9 @@ static void __md_stop_writes(struct mddev *mddev)
>  
>  	del_timer_sync(&mddev->safemode_timer);
>  
> +	if (mddev->pers && mddev->pers->stop_writes)
> +		mddev->pers->stop_writes(mddev);
> +
>  	bitmap_flush(mddev);
>  
>  	if (mddev->ro == 0 &&
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index af6b33c..8da6fe9 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -512,6 +512,8 @@ struct md_personality
>  	/* congested implements bdi.congested_fn().
>  	 * Will not be called while array is 'suspended' */
>  	int (*congested)(struct mddev *mddev, int bits);
> +	/* stop touching raid disks */
> +	void (*stop_writes)(struct mddev *mddev);
>  };
>  
>  struct md_sysfs_entry {
> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> index 2e270e6..641dec8 100644
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c
> @@ -738,11 +738,13 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log,
>  
>  static void r5l_do_reclaim(struct r5l_log *log)
>  {
> +	static DEFINE_MUTEX(lock);
>  	sector_t reclaim_target = xchg(&log->reclaim_target, 0);
>  	sector_t reclaimable;
>  	sector_t next_checkpoint;
>  	u64 next_cp_seq;
>  
> +	mutex_lock(&lock);
>  	spin_lock_irq(&log->io_list_lock);
>  	/*
>  	 * move proper io_unit to reclaim list. We should not change the order.
> @@ -769,8 +771,10 @@ static void r5l_do_reclaim(struct r5l_log *log)
>  	spin_unlock_irq(&log->io_list_lock);
>  
>  	BUG_ON(reclaimable < 0);
> -	if (reclaimable == 0)
> +	if (reclaimable == 0) {
> +		mutex_unlock(&lock);
>  		return;
> +	}
>  
>  	/*
>  	 * write_super will flush cache of each raid disk. We must write super
> @@ -784,6 +788,8 @@ static void r5l_do_reclaim(struct r5l_log *log)
>  	log->last_cp_seq = next_cp_seq;
>  	mutex_unlock(&log->io_mutex);
>  
> +	mutex_unlock(&lock);
> +
>  	r5l_run_no_space_stripes(log);
>  }
>  
> @@ -836,6 +842,17 @@ void r5l_quiesce(struct r5l_log *log, int state)
>  	}
>  }
>  
> +void r5l_stop_writes(struct mddev *mddev)
> +{
> +	struct r5conf *conf = mddev->private;
> +	struct r5l_log *log = conf->log;
> +
> +	if (!log)
> +		return;
> +	r5l_wake_reclaim(log, -1L);
> +	r5l_do_reclaim(log);
> +}
> +
>  bool r5l_log_disk_error(struct r5conf *conf)
>  {
>  	struct r5l_log *log;
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index df88656..3d27338 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -7866,6 +7866,7 @@ static struct md_personality raid6_personality =
>  	.quiesce	= raid5_quiesce,
>  	.takeover	= raid6_takeover,
>  	.congested	= raid5_congested,
> +	.stop_writes	= r5l_stop_writes,
>  };
>  static struct md_personality raid5_personality =
>  {
> @@ -7889,6 +7890,7 @@ static struct md_personality raid5_personality =
>  	.quiesce	= raid5_quiesce,
>  	.takeover	= raid5_takeover,
>  	.congested	= raid5_congested,
> +	.stop_writes	= r5l_stop_writes,
>  };
>  
>  static struct md_personality raid4_personality =
> @@ -7913,6 +7915,7 @@ static struct md_personality raid4_personality =
>  	.quiesce	= raid5_quiesce,
>  	.takeover	= raid4_takeover,
>  	.congested	= raid5_congested,
> +	.stop_writes	= r5l_stop_writes,
>  };
>  
>  static int __init raid5_init(void)
> diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
> index 57ec49f..0643cc0 100644
> --- a/drivers/md/raid5.h
> +++ b/drivers/md/raid5.h
> @@ -633,4 +633,5 @@ extern void r5l_stripe_write_finished(struct stripe_head *sh);
>  extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
>  extern void r5l_quiesce(struct r5l_log *log, int state);
>  extern bool r5l_log_disk_error(struct r5conf *conf);
> +extern void r5l_stop_writes(struct mddev *mddev);
>  #endif
> -- 
> 2.9.3

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 800 bytes --]

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/2] raid5-cache: update superblock at shutdown/reboot
  2016-11-17  5:18 ` [PATCH 1/2] raid5-cache: update superblock at shutdown/reboot NeilBrown
@ 2016-11-17  9:44   ` Wols Lists
  2016-11-17 18:23     ` Shaohua Li
  2016-11-17 18:13   ` Shaohua Li
  1 sibling, 1 reply; 10+ messages in thread
From: Wols Lists @ 2016-11-17  9:44 UTC (permalink / raw)
  To: NeilBrown, Shaohua Li, linux-raid; +Cc: songliubraving, Zhengyuan Liu

On 17/11/16 05:18, NeilBrown wrote:
> On Thu, Nov 17 2016, Shaohua Li wrote:
> 
>> Currently raid5-cache update superblock in .quiesce. But since at
>> shutdown/reboot, .quiesce is called with reconfig mutex locked,
>> superblock isn't guaranteed to be called in reclaim thread (see
>> 8e018c21da3). This will make assemble do unnecessary journal recovery.
>> It doesn't corrupt data but is annoying.  This adds an extra hook to
>> guarantee journal is flushed to raid disks.  And since this hook is
>> called before superblock update, this will guarantee we have a uptodate
>> superblock in shutdown/reboot
> 
> Hi.
> I don't quite follow some of the reasoning here.
> In particular, the ->stop_writes() that you have implemented
> does almost exactly the same thing as r5l_quiesce(1).
> So why not simply call ->quiesce(mddev, 1) in __md_stop_writes()??
> You probably need to also call ->quiesce(mddev, 0) to keep things
> balanced.
> 
> Also you have introduced a static mutex (which isn't my favourite sort
> of thing) without giving any explanation why in the changelog comment.
> So I cannot easily see if that addition is at all justified.
> 
> Thanks,
> NeilBrown
> 
I need to be careful I don't ruffle any feathers here ...

But this is saying to me this is a nice feature that hasn't been
properly spec'd and thought through. Don't get me wrong, I know that -
in typical linux fashion - people have been adding things, and raid has
"just growed" topsy fashion. So it's incredibly difficult to spec a new
feature when you don't have a spec for the stuff you're building it on.

Anyways, what I'm saying is, it seems to me this caching stuff (it's a
new feature, iirc) would be great for trying to write out a proper spec
of what's meant to be going on. It'll roll over into spec'ing the stuff
it relies on ...

And yes, I *AM* volunteering to do the work - as I said elsewhere, I
want to put a load of kerneldoc into the raid source, and get to
understand it all, but the downside is you'll get a lot of newbie-ish
questions from me trying to get to grips with what's going on. I'm an
experienced C programmer but kernel style is alien to me - you know the
disconnect when you're reading something, you can read the words easily,
but you can't decipher the meaning. That's how I feel reading the kernel
source at the moment.

Are we up for it?

Cheers,
Wol


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/2] raid5-cache: update superblock at shutdown/reboot
  2016-11-17  5:18 ` [PATCH 1/2] raid5-cache: update superblock at shutdown/reboot NeilBrown
  2016-11-17  9:44   ` Wols Lists
@ 2016-11-17 18:13   ` Shaohua Li
  2016-11-18  0:01     ` NeilBrown
  1 sibling, 1 reply; 10+ messages in thread
From: Shaohua Li @ 2016-11-17 18:13 UTC (permalink / raw)
  To: NeilBrown; +Cc: Shaohua Li, linux-raid, songliubraving, Zhengyuan Liu

On Thu, Nov 17, 2016 at 04:18:15PM +1100, Neil Brown wrote:
> On Thu, Nov 17 2016, Shaohua Li wrote:
> 
> > Currently raid5-cache update superblock in .quiesce. But since at
> > shutdown/reboot, .quiesce is called with reconfig mutex locked,
> > superblock isn't guaranteed to be called in reclaim thread (see
> > 8e018c21da3). This will make assemble do unnecessary journal recovery.
> > It doesn't corrupt data but is annoying.  This adds an extra hook to
> > guarantee journal is flushed to raid disks.  And since this hook is
> > called before superblock update, this will guarantee we have a uptodate
> > superblock in shutdown/reboot
> 
> Hi.
> I don't quite follow some of the reasoning here.
> In particular, the ->stop_writes() that you have implemented
> does almost exactly the same thing as r5l_quiesce(1).
> So why not simply call ->quiesce(mddev, 1) in __md_stop_writes()??
> You probably need to also call ->quiesce(mddev, 0) to keep things
> balanced.

reboot (md_notify_reboot) doesn't call .quiesce, maybe we should do though. And
in stop, we hold reconfig_mutex before calling .quiesce. And with commit
8e018c21da3, r5l_write_super_and_discard_space tries to hold the reconfig_mutex
before write super, which it can't hold, so superblock write is skipped. After
.quiesce we don't write superblock. To fix the shutdown case, we can add a
superblock write after .quiesce. But I think it's more generic to add a
->stop_writes since it will work for the reboot case.

> Also you have introduced a static mutex (which isn't my favourite sort
> of thing) without giving any explanation why in the changelog comment.
> So I cannot easily see if that addition is at all justified.

Now it's possible that both the reclaim thread and the thread calling
__md_stop_writes run into r5l_do_reclaim at the same time. The mutex is
trying to avoid races between them. I'll add comments there.

Thanks,
Shaohua

> 
> Thanks,
> NeilBrown
> 
> >
> > Signed-off-by: Shaohua Li <shli@fb.com>
> > Cc: Zhengyuan Liu <liuzhengyuan@kylinos.cn>
> > Cc: NeilBrown <neilb@suse.com>
> > ---
> >  drivers/md/md.c          |  3 +++
> >  drivers/md/md.h          |  2 ++
> >  drivers/md/raid5-cache.c | 19 ++++++++++++++++++-
> >  drivers/md/raid5.c       |  3 +++
> >  drivers/md/raid5.h       |  1 +
> >  5 files changed, 27 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/md/md.c b/drivers/md/md.c
> > index 1f1c7f0..155dd42 100644
> > --- a/drivers/md/md.c
> > +++ b/drivers/md/md.c
> > @@ -5471,6 +5471,9 @@ static void __md_stop_writes(struct mddev *mddev)
> >  
> >  	del_timer_sync(&mddev->safemode_timer);
> >  
> > +	if (mddev->pers && mddev->pers->stop_writes)
> > +		mddev->pers->stop_writes(mddev);
> > +
> >  	bitmap_flush(mddev);
> >  
> >  	if (mddev->ro == 0 &&
> > diff --git a/drivers/md/md.h b/drivers/md/md.h
> > index af6b33c..8da6fe9 100644
> > --- a/drivers/md/md.h
> > +++ b/drivers/md/md.h
> > @@ -512,6 +512,8 @@ struct md_personality
> >  	/* congested implements bdi.congested_fn().
> >  	 * Will not be called while array is 'suspended' */
> >  	int (*congested)(struct mddev *mddev, int bits);
> > +	/* stop touching raid disks */
> > +	void (*stop_writes)(struct mddev *mddev);
> >  };
> >  
> >  struct md_sysfs_entry {
> > diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> > index 2e270e6..641dec8 100644
> > --- a/drivers/md/raid5-cache.c
> > +++ b/drivers/md/raid5-cache.c
> > @@ -738,11 +738,13 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log,
> >  
> >  static void r5l_do_reclaim(struct r5l_log *log)
> >  {
> > +	static DEFINE_MUTEX(lock);
> >  	sector_t reclaim_target = xchg(&log->reclaim_target, 0);
> >  	sector_t reclaimable;
> >  	sector_t next_checkpoint;
> >  	u64 next_cp_seq;
> >  
> > +	mutex_lock(&lock);
> >  	spin_lock_irq(&log->io_list_lock);
> >  	/*
> >  	 * move proper io_unit to reclaim list. We should not change the order.
> > @@ -769,8 +771,10 @@ static void r5l_do_reclaim(struct r5l_log *log)
> >  	spin_unlock_irq(&log->io_list_lock);
> >  
> >  	BUG_ON(reclaimable < 0);
> > -	if (reclaimable == 0)
> > +	if (reclaimable == 0) {
> > +		mutex_unlock(&lock);
> >  		return;
> > +	}
> >  
> >  	/*
> >  	 * write_super will flush cache of each raid disk. We must write super
> > @@ -784,6 +788,8 @@ static void r5l_do_reclaim(struct r5l_log *log)
> >  	log->last_cp_seq = next_cp_seq;
> >  	mutex_unlock(&log->io_mutex);
> >  
> > +	mutex_unlock(&lock);
> > +
> >  	r5l_run_no_space_stripes(log);
> >  }
> >  
> > @@ -836,6 +842,17 @@ void r5l_quiesce(struct r5l_log *log, int state)
> >  	}
> >  }
> >  
> > +void r5l_stop_writes(struct mddev *mddev)
> > +{
> > +	struct r5conf *conf = mddev->private;
> > +	struct r5l_log *log = conf->log;
> > +
> > +	if (!log)
> > +		return;
> > +	r5l_wake_reclaim(log, -1L);
> > +	r5l_do_reclaim(log);
> > +}
> > +
> >  bool r5l_log_disk_error(struct r5conf *conf)
> >  {
> >  	struct r5l_log *log;
> > diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> > index df88656..3d27338 100644
> > --- a/drivers/md/raid5.c
> > +++ b/drivers/md/raid5.c
> > @@ -7866,6 +7866,7 @@ static struct md_personality raid6_personality =
> >  	.quiesce	= raid5_quiesce,
> >  	.takeover	= raid6_takeover,
> >  	.congested	= raid5_congested,
> > +	.stop_writes	= r5l_stop_writes,
> >  };
> >  static struct md_personality raid5_personality =
> >  {
> > @@ -7889,6 +7890,7 @@ static struct md_personality raid5_personality =
> >  	.quiesce	= raid5_quiesce,
> >  	.takeover	= raid5_takeover,
> >  	.congested	= raid5_congested,
> > +	.stop_writes	= r5l_stop_writes,
> >  };
> >  
> >  static struct md_personality raid4_personality =
> > @@ -7913,6 +7915,7 @@ static struct md_personality raid4_personality =
> >  	.quiesce	= raid5_quiesce,
> >  	.takeover	= raid4_takeover,
> >  	.congested	= raid5_congested,
> > +	.stop_writes	= r5l_stop_writes,
> >  };
> >  
> >  static int __init raid5_init(void)
> > diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
> > index 57ec49f..0643cc0 100644
> > --- a/drivers/md/raid5.h
> > +++ b/drivers/md/raid5.h
> > @@ -633,4 +633,5 @@ extern void r5l_stripe_write_finished(struct stripe_head *sh);
> >  extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
> >  extern void r5l_quiesce(struct r5l_log *log, int state);
> >  extern bool r5l_log_disk_error(struct r5conf *conf);
> > +extern void r5l_stop_writes(struct mddev *mddev);
> >  #endif
> > -- 
> > 2.9.3



^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/2] raid5-cache: update superblock at shutdown/reboot
  2016-11-17  9:44   ` Wols Lists
@ 2016-11-17 18:23     ` Shaohua Li
  2016-11-17 19:11       ` Wols Lists
  0 siblings, 1 reply; 10+ messages in thread
From: Shaohua Li @ 2016-11-17 18:23 UTC (permalink / raw)
  To: Wols Lists
  Cc: NeilBrown, Shaohua Li, linux-raid, songliubraving, Zhengyuan Liu

On Thu, Nov 17, 2016 at 09:44:39AM +0000, Wols Lists wrote:
> On 17/11/16 05:18, NeilBrown wrote:
> > On Thu, Nov 17 2016, Shaohua Li wrote:
> > 
> >> Currently raid5-cache update superblock in .quiesce. But since at
> >> shutdown/reboot, .quiesce is called with reconfig mutex locked,
> >> superblock isn't guaranteed to be called in reclaim thread (see
> >> 8e018c21da3). This will make assemble do unnecessary journal recovery.
> >> It doesn't corrupt data but is annoying.  This adds an extra hook to
> >> guarantee journal is flushed to raid disks.  And since this hook is
> >> called before superblock update, this will guarantee we have a uptodate
> >> superblock in shutdown/reboot
> > 
> > Hi.
> > I don't quite follow some of the reasoning here.
> > In particular, the ->stop_writes() that you have implemented
> > does almost exactly the same thing as r5l_quiesce(1).
> > So why not simply call ->quiesce(mddev, 1) in __md_stop_writes()??
> > You probably need to also call ->quiesce(mddev, 0) to keep things
> > balanced.
> > 
> > Also you have introduced a static mutex (which isn't my favourite sort
> > of thing) without giving any explanation why in the changelog comment.
> > So I cannot easily see if that addition is at all justified.
> > 
> > Thanks,
> > NeilBrown
> > 
> I need to be careful I don't ruffle any feathers here ...
> 
> But this is saying to me this is a nice feature that hasn't been
> properly spec'd and thought through. Don't get me wrong, I know that -
> in typical linux fashion - people have been adding things, and raid has
> "just growed" topsy fashion. So it's incredibly difficult to spec a new
> feature when you don't have a spec for the stuff you're building it on.
> 
> Anyways, what I'm saying is, it seems to me this caching stuff (it's a
> new feature, iirc) would be great for trying to write out a proper spec
> of what's meant to be going on. It'll roll over into spec'ing the stuff
> it relies on ...
> 
> And yes, I *AM* volunteering to do the work - as I said elsewhere, I
> want to put a load of kerneldoc into the raid source, and get to
> understand it all, but the downside is you'll get a lot of newbie-ish
> questions from me trying to get to grips with what's going on. I'm an
> experienced C programmer but kernel style is alien to me - you know the
> disconnect when you're reading something, you can read the words easily,
> but you can't decipher the meaning. That's how I feel reading the kernel
> source at the moment.
> 
> Are we up for it?

Yep, that makes sense. the journal (current write-through mode and upcoming
write-back mode) does deserve a description. I'll add something into
Documentation dir in kernel source.

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/2] raid5-cache: update superblock at shutdown/reboot
  2016-11-17 18:23     ` Shaohua Li
@ 2016-11-17 19:11       ` Wols Lists
  0 siblings, 0 replies; 10+ messages in thread
From: Wols Lists @ 2016-11-17 19:11 UTC (permalink / raw)
  To: Shaohua Li
  Cc: NeilBrown, Shaohua Li, linux-raid, songliubraving, Zhengyuan Liu

On 17/11/16 18:23, Shaohua Li wrote:
>> And yes, I *AM* volunteering to do the work - as I said elsewhere, I
>> > want to put a load of kerneldoc into the raid source, and get to
>> > understand it all, but the downside is you'll get a lot of newbie-ish
>> > questions from me trying to get to grips with what's going on. I'm an
>> > experienced C programmer but kernel style is alien to me - you know the
>> > disconnect when you're reading something, you can read the words easily,
>> > but you can't decipher the meaning. That's how I feel reading the kernel
>> > source at the moment.
>> > 
>> > Are we up for it?

> Yep, that makes sense. the journal (current write-through mode and upcoming
> write-back mode) does deserve a description. I'll add something into
> Documentation dir in kernel source.

From what I can make out ... :-)

The new kernel documentation system actually builds a load of stuff into
the Documentation/output directory from the kernel source. In other
words, it'll be far better edited into the source files. And/or put a
.rst file in the md directory.

I need to dig into this, and work out how it all fits together (I'm
having trouble running it on my main system at the moment, gentoo takes
forever to upgrade and the bits I need won't install ...)

https://www.kernel.org/doc/

When you've written it up, post it to the list, and we'll see about
getting it into the new documentation system rather than just dropping
it into the Documentation directory (I think the new system will put it
in Documentation/output/drivers/md, which is a lot more sensible than just
jumbled in Documentation along with everything else).

Cheers,
Wol

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/2] raid5-cache: update superblock at shutdown/reboot
  2016-11-17 18:13   ` Shaohua Li
@ 2016-11-18  0:01     ` NeilBrown
  2016-11-18  1:41       ` Shaohua Li
  0 siblings, 1 reply; 10+ messages in thread
From: NeilBrown @ 2016-11-18  0:01 UTC (permalink / raw)
  To: Shaohua Li; +Cc: Shaohua Li, linux-raid, songliubraving, Zhengyuan Liu

[-- Attachment #1: Type: text/plain, Size: 2804 bytes --]

On Fri, Nov 18 2016, Shaohua Li wrote:

> On Thu, Nov 17, 2016 at 04:18:15PM +1100, Neil Brown wrote:
>> On Thu, Nov 17 2016, Shaohua Li wrote:
>> 
>> > Currently raid5-cache update superblock in .quiesce. But since at
>> > shutdown/reboot, .quiesce is called with reconfig mutex locked,
>> > superblock isn't guaranteed to be called in reclaim thread (see
>> > 8e018c21da3). This will make assemble do unnecessary journal recovery.
>> > It doesn't corrupt data but is annoying.  This adds an extra hook to
>> > guarantee journal is flushed to raid disks.  And since this hook is
>> > called before superblock update, this will guarantee we have a uptodate
>> > superblock in shutdown/reboot
>> 
>> Hi.
>> I don't quite follow some of the reasoning here.
>> In particular, the ->stop_writes() that you have implemented
>> does almost exactly the same thing as r5l_quiesce(1).
>> So why not simply call ->quiesce(mddev, 1) in __md_stop_writes()??
>> You probably need to also call ->quiesce(mddev, 0) to keep things
>> balanced.
>
> reboot (md_notify_reboot) doesn't call .quiesce, maybe we should do though. And
> in stop, we hold reconfig_mutex before calling .quiesce. And with commit
> 8e018c21da3, r5l_write_super_and_discard_space tries to hold the reconfig_mutex
> before write super, which it can't hold, so superblock write is skipped. After
> .quiesce we don't write superblock. To fix the shutdown case, we can add a
> superblock write after .quiesce. But I think it's more generic to add a
> ->stop_writes since it will work for the reboot case.

I hadn't quite processed that this was about md_notify_reboot().
I would be very wary of optimizing this code.  It should certainly avoid
data loss, but anything more doesn't belong here.
During a clean shutdown the array should be stopped properly.
md_notify_reboot() is only meant for minimizing damage caused by a
hasty "reboot -f -n".

A "clean" shutdown currently includes systemd/mdadm.shutdown (in the
mdadm package) running "mdadm --wait-clean --scan".
"mdadm --wait-clean" changes the "safe_mode_delay" so that the array
will become "clean" more quickly.
Possibly we should add something to that to trigger a flush of the
journal, and to wait for the flush to complete.

>
>> Also you have introduced a static mutex (which isn't my favourite sort
>> of thing) without giving any explanation why in the changelog comment.
>> So I cannot easily see if that addition is at all justified.
>
> Now it's possible both the reclaim thread and the thread calling
> __md_stop_writes run into r5l_do_reclaim. The mutex is trying to avoid races.
> I'll add comments there.

It seems to me there is already quite a bit of locking in there... I'll
wait to read the comments though.

NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 800 bytes --]

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/2] raid5-cache: update superblock at shutdown/reboot
  2016-11-18  0:01     ` NeilBrown
@ 2016-11-18  1:41       ` Shaohua Li
  2016-11-18  3:49         ` NeilBrown
  0 siblings, 1 reply; 10+ messages in thread
From: Shaohua Li @ 2016-11-18  1:41 UTC (permalink / raw)
  To: NeilBrown; +Cc: Shaohua Li, linux-raid, songliubraving, Zhengyuan Liu

On Fri, Nov 18, 2016 at 11:01:07AM +1100, Neil Brown wrote:
> On Fri, Nov 18 2016, Shaohua Li wrote:
> 
> > On Thu, Nov 17, 2016 at 04:18:15PM +1100, Neil Brown wrote:
> >> On Thu, Nov 17 2016, Shaohua Li wrote:
> >> 
> >> > Currently raid5-cache update superblock in .quiesce. But since at
> >> > shutdown/reboot, .quiesce is called with reconfig mutex locked,
> >> > superblock write isn't guaranteed to happen in the reclaim thread (see
> >> > 8e018c21da3). This will make assemble do unnecessary journal recovery.
> >> > It doesn't corrupt data but is annoying.  This adds an extra hook to
> >> > guarantee journal is flushed to raid disks.  And since this hook is
> >> > called before superblock update, this will guarantee we have an up-to-date
> >> > superblock in shutdown/reboot
> >> 
> >> Hi.
> >> I don't quite follow some of the reasoning here.
> >> In particular, the ->stop_writes() that you have implemented
> >> does almost exactly the same thing as r5l_quiesce(1).
> >> So why not simply call ->quiesce(mddev, 1) in __md_stop_writes()??
> >> You probably need to also call ->quiesce(mddev, 0) to keep things
> >> balanced.
> >
> > reboot (md_notify_reboot) doesn't call .quiesce, maybe we should do though. And
> > in stop, we hold reconfig_mutex before calling .quiesce. And with commit
> > 8e018c21da3, r5l_write_super_and_discard_space tries to hold the reconfig_mutex
> > before write super, which it can't hold, so superblock write is skipped. After
> > .quiesce we don't write superblock. To fix the shutdown case, we can add a
> > superblock write after .quiesce. But I think it's more generic to add a
> > ->stop_writes since it will work for the reboot case.
> 
> I hadn't quite processed that this was about md_notify_reboot().
> I would be very wary of optimizing this code.  It should certainly avoid
> data loss, but anything more doesn't belong here.
> During a clean shutdown the array should be stopped properly.
> md_notify_reboot() is only meant for minimizing damage caused by a
> hasty "reboot -f -n".

yep, this isn't the priority. So do you still suggest we ignore the reboot case
and add the journal flush after .quiesce() is called in stop?

> A "clean" shutdown currently includes systemd/mdadm.shutdown (in the
> mdadm package) running "mdadm --wait-clean --scan".
> "mdadm --wait-clean" changes the "safe_mode_delay" so that the array
> will become "clean" more quickly.
> Possibly we should add something to that to trigger a flush of the
> journal, and to wait for the flush to complete.

I'm not sure how we can do this. But adding a different state in 'array_state'
indicating journal isn't clean makes sense.

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/2] raid5-cache: update superblock at shutdown/reboot
  2016-11-18  1:41       ` Shaohua Li
@ 2016-11-18  3:49         ` NeilBrown
  0 siblings, 0 replies; 10+ messages in thread
From: NeilBrown @ 2016-11-18  3:49 UTC (permalink / raw)
  To: Shaohua Li; +Cc: Shaohua Li, linux-raid, songliubraving, Zhengyuan Liu

[-- Attachment #1: Type: text/plain, Size: 4336 bytes --]

On Fri, Nov 18 2016, Shaohua Li wrote:

> On Fri, Nov 18, 2016 at 11:01:07AM +1100, Neil Brown wrote:
>> On Fri, Nov 18 2016, Shaohua Li wrote:
>> 
>> > On Thu, Nov 17, 2016 at 04:18:15PM +1100, Neil Brown wrote:
>> >> On Thu, Nov 17 2016, Shaohua Li wrote:
>> >> 
>> >> > Currently raid5-cache update superblock in .quiesce. But since at
>> >> > shutdown/reboot, .quiesce is called with reconfig mutex locked,
>> >> > superblock write isn't guaranteed to happen in the reclaim thread (see
>> >> > 8e018c21da3). This will make assemble do unnecessary journal recovery.
>> >> > It doesn't corrupt data but is annoying.  This adds an extra hook to
>> >> > guarantee journal is flushed to raid disks.  And since this hook is
>> >> > called before superblock update, this will guarantee we have an up-to-date
>> >> > superblock in shutdown/reboot
>> >> 
>> >> Hi.
>> >> I don't quite follow some of the reasoning here.
>> >> In particular, the ->stop_writes() that you have implemented
>> >> does almost exactly the same thing as r5l_quiesce(1).
>> >> So why not simply call ->quiesce(mddev, 1) in __md_stop_writes()??
>> >> You probably need to also call ->quiesce(mddev, 0) to keep things
>> >> balanced.
>> >
>> > reboot (md_notify_reboot) doesn't call .quiesce, maybe we should do though. And
>> > in stop, we hold reconfig_mutex before calling .quiesce. And with commit
>> > 8e018c21da3, r5l_write_super_and_discard_space tries to hold the reconfig_mutex
>> > before write super, which it can't hold, so superblock write is skipped. After
>> > .quiesce we don't write superblock. To fix the shutdown case, we can add a
>> > superblock write after .quiesce. But I think it's more generic to add a
>> > ->stop_writes since it will work for the reboot case.
>> 
>> I hadn't quite processed that this was about md_notify_reboot().
>> I would be very wary of optimizing this code.  It should certainly avoid
>> data loss, but anything more doesn't belong here.
>> During a clean shutdown the array should be stopped properly.
>> md_notify_reboot() is only meant for minimizing damage caused by a
>> hasty "reboot -f -n".
>
> yep, this isn't the priority. So do you still suggest we ignore the reboot case
> and add the journal flush after .quiesce() is called in stop?

Not sure... is there a problem we are trying to solve?

mddev_detach() calls ->quiesce() so things get cleaned up when the array
is stopped. md_set_readonly() only calls __md_stop_writes() though.
I would probably support adding a ->quiesce() call to __md_stop_writes,
except that r5l_quiesce() registers a new reclaim_thread() so when
__md_stop_writes() is followed by mddev_detach(), the thread would be
killed, then recreated, then killed, then recreated, then finally killed
in r5l_exit_log().
It would be nice if we could either
 1/ not kill the thread, just freeze it, or
 2/ have it start up some other way.
    e.g. get raid5d() to start the thread if it isn't running and
    conf->quiesce is 0, and array isn't clean.
    

>
>> A "clean" shutdown currently includes systemd/mdadm.shutdown (in the
>> mdadm package) running "mdadm --wait-clean --scan".
>> "mdadm --wait-clean" changes the "safe_mode_delay" so that the array
>> will become "clean" more quickly.
>> Possibly we should add something to that to trigger a flush of the
>> journal, and to wait for the flush to complete.
>
> I'm not sure how we can do this. But adding a different state in 'array_state'
> indicating journal isn't clean makes sense.

Possible, though as mdmon uses array_state, we would need to be careful.
Of course mdmon never looks at an array with a journal, so that wouldn't
actually be a problem.

Ignoring the new caching code for the moment, whenever the array is
"clean", the journal is irrelevant.  Possibly we could arrange that if
the array is found to be "clean" on startup, the journal is ignored?
For that to work with the caching code, we would need to hold off
marking the array as 'clean' until the journal is flushed.  I think that
is probably a good idea.
When safe_mode_delay is set to 0 (or 1) we should probably set the cache
flush time down from 30 seconds to (nearly) zero too.  That way, the
current WaitClean() code would continue to work.

NeilBrown

>
> Thanks,
> Shaohua

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 800 bytes --]

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2016-11-18  3:49 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-11-17  1:20 [PATCH 1/2] raid5-cache: update superblock at shutdown/reboot Shaohua Li
2016-11-17  1:20 ` [PATCH 2/2] raid5-cache: fix lockdep warning Shaohua Li
2016-11-17  5:18 ` [PATCH 1/2] raid5-cache: update superblock at shutdown/reboot NeilBrown
2016-11-17  9:44   ` Wols Lists
2016-11-17 18:23     ` Shaohua Li
2016-11-17 19:11       ` Wols Lists
2016-11-17 18:13   ` Shaohua Li
2016-11-18  0:01     ` NeilBrown
2016-11-18  1:41       ` Shaohua Li
2016-11-18  3:49         ` NeilBrown

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.