All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/6] md/r5cache: simplify handling of sh->log_start in recovery
@ 2016-12-14 23:38 Song Liu
  2016-12-14 23:38 ` [PATCH 2/6] md/r5cache: assign conf->log before r5l_load_log() Song Liu
                   ` (4 more replies)
  0 siblings, 5 replies; 6+ messages in thread
From: Song Liu @ 2016-12-14 23:38 UTC (permalink / raw)
  To: linux-raid
  Cc: neilb, shli, kernel-team, dan.j.williams, hch, liuzhengyuan,
	liuyun01, Song Liu

We only need to update sh->log_start at the end of recovery,
which is r5c_recovery_rewrite_data_only_stripes(), so it is not
necessary to set it before that. In this patch, log_start is
removed from r5c_recovery_alloc_stripe().

After updating all sh->log_start, rewrite_data_only_stripes()
also updates log->next_checkpoints to the last sh->log_start.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5-cache.c | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index d7bfb6f..0f5e6a2 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -1682,8 +1682,7 @@ r5l_recovery_replay_one_stripe(struct r5conf *conf,
 
 static struct stripe_head *
 r5c_recovery_alloc_stripe(struct r5conf *conf,
-			  sector_t stripe_sect,
-			  sector_t log_start)
+			  sector_t stripe_sect)
 {
 	struct stripe_head *sh;
 
@@ -1692,7 +1691,6 @@ r5c_recovery_alloc_stripe(struct r5conf *conf,
 		return NULL;  /* no more stripe available */
 
 	r5l_recovery_reset_stripe(sh);
-	sh->log_start = log_start;
 
 	return sh;
 }
@@ -1862,7 +1860,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
 						stripe_sect);
 
 		if (!sh) {
-			sh = r5c_recovery_alloc_stripe(conf, stripe_sect, ctx->pos);
+			sh = r5c_recovery_alloc_stripe(conf, stripe_sect);
 			/*
 			 * cannot get stripe from raid5_get_active_stripe
 			 * try replay some stripes
@@ -1871,7 +1869,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
 				r5c_recovery_replay_stripes(
 					cached_stripe_list, ctx);
 				sh = r5c_recovery_alloc_stripe(
-					conf, stripe_sect, ctx->pos);
+					conf, stripe_sect);
 			}
 			if (!sh) {
 				pr_debug("md/raid:%s: Increasing stripe cache size to %d to recovery data on journal.\n",
@@ -1879,8 +1877,8 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
 					conf->min_nr_stripes * 2);
 				raid5_set_cache_size(mddev,
 						     conf->min_nr_stripes * 2);
-				sh = r5c_recovery_alloc_stripe(
-					conf, stripe_sect, ctx->pos);
+				sh = r5c_recovery_alloc_stripe(conf,
+							       stripe_sect);
 			}
 			if (!sh) {
 				pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n",
@@ -1894,7 +1892,6 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
 			if (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
 			    test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) {
 				r5l_recovery_replay_one_stripe(conf, sh, ctx);
-				sh->log_start = ctx->pos;
 				list_move_tail(&sh->lru, cached_stripe_list);
 			}
 			r5l_recovery_load_data(log, sh, ctx, payload,
@@ -1933,8 +1930,6 @@ static void r5c_recovery_load_one_stripe(struct r5l_log *log,
 			set_bit(R5_UPTODATE, &dev->flags);
 		}
 	}
-	list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
-	atomic_inc(&log->stripe_in_journal_count);
 }
 
 /*
@@ -2070,6 +2065,7 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
 	struct stripe_head *sh, *next;
 	struct mddev *mddev = log->rdev->mddev;
 	struct page *page;
+	sector_t next_checkpoint = MaxSector;
 
 	page = alloc_page(GFP_KERNEL);
 	if (!page) {
@@ -2078,6 +2074,8 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
 		return -ENOMEM;
 	}
 
+	WARN_ON(list_empty(&ctx->cached_list));
+
 	list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
 		struct r5l_meta_block *mb;
 		int i;
@@ -2123,12 +2121,15 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
 		sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
 			     REQ_OP_WRITE, REQ_FUA, false);
 		sh->log_start = ctx->pos;
+		list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
+		atomic_inc(&log->stripe_in_journal_count);
 		ctx->pos = write_pos;
 		ctx->seq += 1;
-
+		next_checkpoint = sh->log_start;
 		list_del_init(&sh->lru);
 		raid5_release_stripe(sh);
 	}
+	log->next_checkpoint = next_checkpoint;
 	__free_page(page);
 	return 0;
 }
@@ -2139,7 +2140,6 @@ static int r5l_recovery_log(struct r5l_log *log)
 	struct r5l_recovery_ctx ctx;
 	int ret;
 	sector_t pos;
-	struct stripe_head *sh;
 
 	ctx.pos = log->last_checkpoint;
 	ctx.seq = log->last_cp_seq;
@@ -2164,9 +2164,6 @@ static int r5l_recovery_log(struct r5l_log *log)
 		log->next_checkpoint = ctx.pos;
 		r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++);
 		ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
-	} else {
-		sh = list_last_entry(&ctx.cached_list, struct stripe_head, lru);
-		log->next_checkpoint = sh->log_start;
 	}
 
 	if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0))
-- 
2.9.3


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 2/6] md/r5cache: assign conf->log before r5l_load_log()
  2016-12-14 23:38 [PATCH 1/6] md/r5cache: simplify handling of sh->log_start in recovery Song Liu
@ 2016-12-14 23:38 ` Song Liu
  2016-12-14 23:38 ` [PATCH 3/6] md/r5cache: flush data only stripes in r5l_recovery_log() Song Liu
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: Song Liu @ 2016-12-14 23:38 UTC (permalink / raw)
  To: linux-raid
  Cc: neilb, shli, kernel-team, dan.j.williams, hch, liuzhengyuan,
	liuyun01, Song Liu

r5l_load_log() calls functions that requires a proper conf->log,
for example, r5c_is_writeback(). Therefore, we should set
conf->log before calling r5l_load_log(). If r5l_load_log() fails,
conf->log is set back to NULL.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5-cache.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 0f5e6a2..b4f2b82 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -2636,14 +2636,16 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	spin_lock_init(&log->stripe_in_journal_lock);
 	atomic_set(&log->stripe_in_journal_count, 0);
 
+	rcu_assign_pointer(conf->log, log);
+
 	if (r5l_load_log(log))
 		goto error;
 
-	rcu_assign_pointer(conf->log, log);
 	set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
 	return 0;
 
 error:
+	rcu_assign_pointer(conf->log, NULL);
 	md_unregister_thread(&log->reclaim_thread);
 reclaim_thread:
 	mempool_destroy(log->meta_pool);
-- 
2.9.3


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 3/6] md/r5cache: flush data only stripes in r5l_recovery_log()
  2016-12-14 23:38 [PATCH 1/6] md/r5cache: simplify handling of sh->log_start in recovery Song Liu
  2016-12-14 23:38 ` [PATCH 2/6] md/r5cache: assign conf->log before r5l_load_log() Song Liu
@ 2016-12-14 23:38 ` Song Liu
  2016-12-14 23:38 ` [PATCH 4/6] md/r5cache: read data into orig_page for prexor of cached data Song Liu
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: Song Liu @ 2016-12-14 23:38 UTC (permalink / raw)
  To: linux-raid
  Cc: neilb, shli, kernel-team, dan.j.williams, hch, liuzhengyuan,
	liuyun01, Song Liu

When there is data only stripes in the journal, we flush them out in
r5l_recovery_log(). Ths logic is implemented in a new function:
r5c_recovery_flush_data_only_stripes();

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5-cache.c | 58 +++++++++++++++++++++++++++++++++++-------------
 drivers/md/raid5.c       |  9 +++++++-
 drivers/md/raid5.h       |  4 ++++
 3 files changed, 54 insertions(+), 17 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index b4f2b82..cded2b1 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -2062,7 +2062,7 @@ static int
 r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
 				       struct r5l_recovery_ctx *ctx)
 {
-	struct stripe_head *sh, *next;
+	struct stripe_head *sh;
 	struct mddev *mddev = log->rdev->mddev;
 	struct page *page;
 	sector_t next_checkpoint = MaxSector;
@@ -2076,7 +2076,7 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
 
 	WARN_ON(list_empty(&ctx->cached_list));
 
-	list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
+	list_for_each_entry(sh, &ctx->cached_list, lru) {
 		struct r5l_meta_block *mb;
 		int i;
 		int offset;
@@ -2126,14 +2126,41 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
 		ctx->pos = write_pos;
 		ctx->seq += 1;
 		next_checkpoint = sh->log_start;
-		list_del_init(&sh->lru);
-		raid5_release_stripe(sh);
 	}
 	log->next_checkpoint = next_checkpoint;
 	__free_page(page);
 	return 0;
 }
 
+static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
+						 struct r5l_recovery_ctx *ctx)
+{
+	struct mddev *mddev = log->rdev->mddev;
+	struct r5conf *conf = mddev->private;
+	struct stripe_head *sh, *next;
+
+	if (ctx->data_only_stripes == 0)
+		return;
+
+	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK;
+	set_bit(R5C_PRE_INIT_FLUSH, &conf->cache_state);
+
+	list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
+		r5c_make_stripe_write_out(sh);
+		list_del_init(&sh->lru);
+		raid5_release_stripe(sh);
+	}
+
+	md_wakeup_thread(conf->mddev->thread);
+	wait_event(conf->wait_for_r5c_pre_init_flush,
+		   atomic_read(&conf->active_stripes) == 0 &&
+		   atomic_read(&conf->r5c_cached_full_stripes) == 0 &&
+		   atomic_read(&conf->r5c_cached_partial_stripes) == 0);
+
+	clear_bit(R5C_PRE_INIT_FLUSH, &conf->cache_state);
+	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
+}
+
 static int r5l_recovery_log(struct r5l_log *log)
 {
 	struct mddev *mddev = log->rdev->mddev;
@@ -2160,32 +2187,31 @@ static int r5l_recovery_log(struct r5l_log *log)
 	pos = ctx.pos;
 	ctx.seq += 10000;
 
-	if (ctx.data_only_stripes == 0) {
-		log->next_checkpoint = ctx.pos;
-		r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++);
-		ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
-	}
 
 	if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0))
 		pr_debug("md/raid:%s: starting from clean shutdown\n",
 			 mdname(mddev));
-	else {
+	else
 		pr_debug("md/raid:%s: recoverying %d data-only stripes and %d data-parity stripes\n",
 			 mdname(mddev), ctx.data_only_stripes,
 			 ctx.data_parity_stripes);
 
-		if (ctx.data_only_stripes > 0)
-			if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) {
-				pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
-				       mdname(mddev));
-				return -EIO;
-			}
+	if (ctx.data_only_stripes == 0) {
+		log->next_checkpoint = ctx.pos;
+		r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++);
+		ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
+	} else if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) {
+		pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
+		       mdname(mddev));
+		return -EIO;
 	}
 
 	log->log_start = ctx.pos;
 	log->seq = ctx.seq;
 	log->last_checkpoint = pos;
 	r5l_write_super(log, pos);
+
+	r5c_recovery_flush_data_only_stripes(log, &ctx);
 	return 0;
 }
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 06d7279..b98dfb2 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -232,7 +232,9 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
 	 * When quiesce in r5c write back, set STRIPE_HANDLE for stripes with
 	 * data in journal, so they are not released to cached lists
 	 */
-	if (conf->quiesce && r5c_is_writeback(conf->log) &&
+	if ((conf->quiesce ||
+	     test_bit(R5C_PRE_INIT_FLUSH, &conf->cache_state)) &&
+	    r5c_is_writeback(conf->log) &&
 	    !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) {
 		if (test_bit(STRIPE_R5C_CACHING, &sh->state))
 			r5c_make_stripe_write_out(sh);
@@ -264,6 +266,10 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
 			    < IO_THRESHOLD)
 				md_wakeup_thread(conf->mddev->thread);
 		atomic_dec(&conf->active_stripes);
+		if (test_bit(R5C_PRE_INIT_FLUSH, &conf->cache_state) &&
+		    atomic_read(&conf->active_stripes) == 0)
+			wake_up(&sh->raid_conf->wait_for_r5c_pre_init_flush);
+
 		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
 			if (!r5c_is_writeback(conf->log))
 				list_add_tail(&sh->lru, temp_inactive_list);
@@ -6633,6 +6639,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	init_waitqueue_head(&conf->wait_for_quiescent);
 	init_waitqueue_head(&conf->wait_for_stripe);
 	init_waitqueue_head(&conf->wait_for_overlap);
+	init_waitqueue_head(&conf->wait_for_r5c_pre_init_flush);
 	INIT_LIST_HEAD(&conf->handle_list);
 	INIT_LIST_HEAD(&conf->hold_list);
 	INIT_LIST_HEAD(&conf->delayed_list);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index ed8e136..b39fe46 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -564,6 +564,9 @@ enum r5_cache_state {
 	R5C_EXTRA_PAGE_IN_USE,	/* a stripe is using disk_info.extra_page
 				 * for prexor
 				 */
+	R5C_PRE_INIT_FLUSH,	/* flushing data only stripes recovered from
+				 * the journal
+				 */
 };
 
 struct r5conf {
@@ -679,6 +682,7 @@ struct r5conf {
 	int			group_cnt;
 	int			worker_cnt_per_group;
 	struct r5l_log		*log;
+	wait_queue_head_t	wait_for_r5c_pre_init_flush;
 };
 
 
-- 
2.9.3


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 4/6] md/r5cache: read data into orig_page for prexor of cached data
  2016-12-14 23:38 [PATCH 1/6] md/r5cache: simplify handling of sh->log_start in recovery Song Liu
  2016-12-14 23:38 ` [PATCH 2/6] md/r5cache: assign conf->log before r5l_load_log() Song Liu
  2016-12-14 23:38 ` [PATCH 3/6] md/r5cache: flush data only stripes in r5l_recovery_log() Song Liu
@ 2016-12-14 23:38 ` Song Liu
  2016-12-14 23:38 ` [PATCH 5/6] md/raid5: move comment of fetch_block to right location Song Liu
  2016-12-14 23:38 ` [PATCH 6/6] md/r5cache: shift complex rmw from read path to write path Song Liu
  4 siblings, 0 replies; 6+ messages in thread
From: Song Liu @ 2016-12-14 23:38 UTC (permalink / raw)
  To: linux-raid
  Cc: neilb, shli, kernel-team, dan.j.williams, hch, liuzhengyuan,
	liuyun01, Song Liu

With write back cache, we use orig_page to do prexor. This patch
makes sure we read data into orig_page for it.

Flag R5_OrigPageUPTDODATE is added to show whether orig_page
has the latest data from raid disk.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5-cache.c |  2 ++
 drivers/md/raid5.c       | 37 ++++++++++++++++++++++++++++---------
 drivers/md/raid5.h       |  5 +++++
 3 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index cded2b1..e65de05 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -2377,6 +2377,8 @@ void r5c_release_extra_page(struct stripe_head *sh)
 			struct page *p = sh->dev[i].orig_page;
 
 			sh->dev[i].orig_page = sh->dev[i].page;
+			clear_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
+
 			if (!using_disk_info_extra_page)
 				put_page(p);
 		}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b98dfb2..760af2e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1019,7 +1019,13 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 
 			if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
 				WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
-			sh->dev[i].vec.bv_page = sh->dev[i].page;
+
+			if (!op_is_write(op) &&
+			    test_bit(R5_InJournal, &sh->dev[i].flags) &&
+			    sh->dev[i].page != sh->dev[i].orig_page)
+				sh->dev[i].vec.bv_page = sh->dev[i].orig_page;
+			else
+				sh->dev[i].vec.bv_page = sh->dev[i].page;
 			bi->bi_vcnt = 1;
 			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
 			bi->bi_io_vec[0].bv_offset = 0;
@@ -2384,6 +2390,10 @@ static void raid5_end_read_request(struct bio * bi)
 		} else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
 			clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
 
+		if (test_bit(R5_InJournal, &sh->dev[i].flags) &&
+		    sh->dev[i].page != sh->dev[i].orig_page)
+			set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
+
 		if (atomic_read(&rdev->read_errors))
 			atomic_set(&rdev->read_errors, 0);
 	} else {
@@ -3598,6 +3608,21 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 		break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
 }
 
+/*
+ * For RMW in write back cache, we need extra page in prexor to store the
+ * old data. This page is stored in dev->orig_page.
+ *
+ * This function checks whether we have data for prexor. The exact logic
+ * is:
+ *       R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE)
+ */
+static inline bool uptodate_for_rmw(struct r5dev *dev)
+{
+	return (test_bit(R5_UPTODATE, &dev->flags)) &&
+		(!test_bit(R5_InJournal, &dev->flags) ||
+		 test_bit(R5_OrigPageUPTDODATE, &dev->flags));
+}
+
 static int handle_stripe_dirtying(struct r5conf *conf,
 				  struct stripe_head *sh,
 				  struct stripe_head_state *s,
@@ -3629,9 +3654,7 @@ static int handle_stripe_dirtying(struct r5conf *conf,
 		if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx ||
 		     test_bit(R5_InJournal, &dev->flags)) &&
 		    !test_bit(R5_LOCKED, &dev->flags) &&
-		    !((test_bit(R5_UPTODATE, &dev->flags) &&
-		       (!test_bit(R5_InJournal, &dev->flags) ||
-			dev->page != dev->orig_page)) ||
+		    !(uptodate_for_rmw(dev) ||
 		      test_bit(R5_Wantcompute, &dev->flags))) {
 			if (test_bit(R5_Insync, &dev->flags))
 				rmw++;
@@ -3643,7 +3666,6 @@ static int handle_stripe_dirtying(struct r5conf *conf,
 		    i != sh->pd_idx && i != sh->qd_idx &&
 		    !test_bit(R5_LOCKED, &dev->flags) &&
 		    !(test_bit(R5_UPTODATE, &dev->flags) ||
-		      test_bit(R5_InJournal, &dev->flags) ||
 		      test_bit(R5_Wantcompute, &dev->flags))) {
 			if (test_bit(R5_Insync, &dev->flags))
 				rcw++;
@@ -3697,9 +3719,7 @@ static int handle_stripe_dirtying(struct r5conf *conf,
 			     i == sh->pd_idx || i == sh->qd_idx ||
 			     test_bit(R5_InJournal, &dev->flags)) &&
 			    !test_bit(R5_LOCKED, &dev->flags) &&
-			    !((test_bit(R5_UPTODATE, &dev->flags) &&
-			       (!test_bit(R5_InJournal, &dev->flags) ||
-				dev->page != dev->orig_page)) ||
+			    !(uptodate_for_rmw(dev) ||
 			      test_bit(R5_Wantcompute, &dev->flags)) &&
 			    test_bit(R5_Insync, &dev->flags)) {
 				if (test_bit(STRIPE_PREREAD_ACTIVE,
@@ -3726,7 +3746,6 @@ static int handle_stripe_dirtying(struct r5conf *conf,
 			    i != sh->pd_idx && i != sh->qd_idx &&
 			    !test_bit(R5_LOCKED, &dev->flags) &&
 			    !(test_bit(R5_UPTODATE, &dev->flags) ||
-			      test_bit(R5_InJournal, &dev->flags) ||
 			      test_bit(R5_Wantcompute, &dev->flags))) {
 				rcw++;
 				if (test_bit(R5_Insync, &dev->flags) &&
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index b39fe46..6cc8d4c 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -322,6 +322,11 @@ enum r5dev_flags {
 			 * data and parity being written are in the journal
 			 * device
 			 */
+	R5_OrigPageUPTDODATE,	/* with write back cache, we read old data into
+				 * dev->orig_page for prexor. When this flag is
+				 * set, orig_page contains latest data in the
+				 * raid disk.
+				 */
 };
 
 /*
-- 
2.9.3


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 5/6] md/raid5: move comment of fetch_block to right location
  2016-12-14 23:38 [PATCH 1/6] md/r5cache: simplify handling of sh->log_start in recovery Song Liu
                   ` (2 preceding siblings ...)
  2016-12-14 23:38 ` [PATCH 4/6] md/r5cache: read data into orig_page for prexor of cached data Song Liu
@ 2016-12-14 23:38 ` Song Liu
  2016-12-14 23:38 ` [PATCH 6/6] md/r5cache: shift complex rmw from read path to write path Song Liu
  4 siblings, 0 replies; 6+ messages in thread
From: Song Liu @ 2016-12-14 23:38 UTC (permalink / raw)
  To: linux-raid
  Cc: neilb, shli, kernel-team, dan.j.williams, hch, liuzhengyuan,
	liuyun01, Song Liu

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 760af2e..0f25d84 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3309,13 +3309,6 @@ static int want_replace(struct stripe_head *sh, int disk_idx)
 	return rv;
 }
 
-/* fetch_block - checks the given member device to see if its data needs
- * to be read or computed to satisfy a request.
- *
- * Returns 1 when no more member devices need to be checked, otherwise returns
- * 0 to tell the loop in handle_stripe_fill to continue
- */
-
 static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
 			   int disk_idx, int disks)
 {
@@ -3406,6 +3399,12 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
 	return 0;
 }
 
+/* fetch_block - checks the given member device to see if its data needs
+ * to be read or computed to satisfy a request.
+ *
+ * Returns 1 when no more member devices need to be checked, otherwise returns
+ * 0 to tell the loop in handle_stripe_fill to continue
+ */
 static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
 		       int disk_idx, int disks)
 {
-- 
2.9.3


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 6/6] md/r5cache: shift complex rmw from read path to write path
  2016-12-14 23:38 [PATCH 1/6] md/r5cache: simplify handling of sh->log_start in recovery Song Liu
                   ` (3 preceding siblings ...)
  2016-12-14 23:38 ` [PATCH 5/6] md/raid5: move comment of fetch_block to right location Song Liu
@ 2016-12-14 23:38 ` Song Liu
  4 siblings, 0 replies; 6+ messages in thread
From: Song Liu @ 2016-12-14 23:38 UTC (permalink / raw)
  To: linux-raid
  Cc: neilb, shli, kernel-team, dan.j.williams, hch, liuzhengyuan,
	liuyun01, Song Liu

Write back cache requires a complex RMW mechanism, where old data is
read into dev->orig_page for prexor, and then xor is done with
dev->page. This logic is already implemented in the write path.

However, current read path is not awared of this requirement. When
the array is optimal, the RMW is not required, as the data are
read from raid disks. However, when the target stripe is degraded,
complex RMW is required to generate right data.

To keep read path as clean as possible, we handle read path by
flushing degraded, in-journal stripes before processing reads to
missing dev.

Specifically, when there is read requests to a degraded stripe
with data in journal, handle_stripe_fill() calls
r5c_make_stripe_write_out() and exits. Then handle_stripe_dirtying()
will do the complex RMW and flush the stripe to RAID disks. After
that, read requests are handled.

There is one more corner case when there is non-overwrite bio for
the missing (or out of sync) dev. handle_stripe_dirtying() will not
be able to process the non-overwrite bios without constructing the
data in handle_stripe_fill(). This is fixed by delaying non-overwrite
bios in handle_stripe_dirtying(). So handle_stripe_fill() works on
these bios after the stripe is flushed to raid disks.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5.c | 48 ++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 44 insertions(+), 4 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 0f25d84..6d5391f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2894,6 +2894,30 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
 	return r_sector;
 }
 
+/*
+ * There are cases where we want handle_stripe_dirtying() and
+ * schedule_reconstruction() to delay towrite to some dev of a stripe.
+ *
+ * This function checks whether we want to delay the towrite. Specifically,
+ * we delay the towrite when:
+ *   1. degraded stripe has a non-overwrite to the missing dev, AND this
+ *      stripe has data in journal (for other devices).
+ *
+ *      In this case, when reading data for the non-overwrite dev, it is
+ *      necessary to handle complex rmw of write back cache (prexor with
+ *      orig_page, and xor with page). To keep read path simple, we would
+ *      like to flush data in journal to RAID disks first, so complex rmw
+ *      is handled in the write patch (handle_stripe_dirtying).
+ *
+ *   2. to be added
+ */
+static inline bool delay_towrite(struct r5dev *dev,
+				   struct stripe_head_state *s)
+{
+	return dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) &&
+		!test_bit(R5_Insync, &dev->flags) && s->injournal;
+}
+
 static void
 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 			 int rcw, int expand)
@@ -2914,7 +2938,7 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
 
-			if (dev->towrite) {
+			if (dev->towrite && !delay_towrite(dev, s)) {
 				set_bit(R5_LOCKED, &dev->flags);
 				set_bit(R5_Wantdrain, &dev->flags);
 				if (!expand)
@@ -3491,10 +3515,25 @@ static void handle_stripe_fill(struct stripe_head *sh,
 	 * midst of changing due to a write
 	 */
 	if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
-	    !sh->reconstruct_state)
+	    !sh->reconstruct_state) {
+
+		/* for degraded stripe with data in journal, do not handle
+		 * read requests yet, instead, flush the stripe to raid
+		 * disks first, this avoids handling complex rmw of write
+		 * back cache (prexor with orig_page, and then xor with
+		 * page) in the read path
+		 */
+		if (s->injournal && s->failed) {
+			if (test_bit(STRIPE_R5C_CACHING, &sh->state))
+				r5c_make_stripe_write_out(sh);
+			goto out;
+		}
+
 		for (i = disks; i--; )
 			if (fetch_block(sh, s, i, disks))
 				break;
+	}
+out:
 	set_bit(STRIPE_HANDLE, &sh->state);
 }
 
@@ -3650,7 +3689,8 @@ static int handle_stripe_dirtying(struct r5conf *conf,
 	} else for (i = disks; i--; ) {
 		/* would I have to read this buffer for read_modify_write */
 		struct r5dev *dev = &sh->dev[i];
-		if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx ||
+		if (((dev->towrite && !delay_towrite(dev, s)) ||
+		     i == sh->pd_idx || i == sh->qd_idx ||
 		     test_bit(R5_InJournal, &dev->flags)) &&
 		    !test_bit(R5_LOCKED, &dev->flags) &&
 		    !(uptodate_for_rmw(dev) ||
@@ -3714,7 +3754,7 @@ static int handle_stripe_dirtying(struct r5conf *conf,
 
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
-			if ((dev->towrite ||
+			if (((dev->towrite && !delay_towrite(dev, s)) ||
 			     i == sh->pd_idx || i == sh->qd_idx ||
 			     test_bit(R5_InJournal, &dev->flags)) &&
 			    !test_bit(R5_LOCKED, &dev->flags) &&
-- 
2.9.3


^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2016-12-14 23:38 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-12-14 23:38 [PATCH 1/6] md/r5cache: simplify handling of sh->log_start in recovery Song Liu
2016-12-14 23:38 ` [PATCH 2/6] md/r5cache: assign conf->log before r5l_load_log() Song Liu
2016-12-14 23:38 ` [PATCH 3/6] md/r5cache: flush data only stripes in r5l_recovery_log() Song Liu
2016-12-14 23:38 ` [PATCH 4/6] md/r5cache: read data into orig_page for prexor of cached data Song Liu
2016-12-14 23:38 ` [PATCH 5/6] md/raid5: move comment of fetch_block to right location Song Liu
2016-12-14 23:38 ` [PATCH 6/6] md/r5cache: shift complex rmw from read path to write path Song Liu

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.