[PATCH v3 4/9] raid5: calculate partial parity for a stripe

From: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
To: linux-raid@vger.kernel.org
Cc: shli@fb.com, neilb@suse.com, jes.sorensen@gmail.com,
	Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Subject: [PATCH v3 4/9] raid5: calculate partial parity for a stripe
Date: Mon, 30 Jan 2017 19:59:48 +0100	[thread overview]
Message-ID: <20170130185953.30428-5-artur.paszkiewicz@intel.com> (raw)
In-Reply-To: <20170130185953.30428-1-artur.paszkiewicz@intel.com>

Attach a page for holding the partial parity data to stripe_head.
Allocate it only if mddev has the MD_HAS_PPL flag set.

Partial parity is the xor of not modified data chunks of a stripe and is
calculated as follows:

- reconstruct-write case:
  xor data from all not updated disks in a stripe

- read-modify-write case:
  xor old data and parity from all updated disks in a stripe

Implement it using the async_tx API and integrate into raid_run_ops().
It must be called when we still have access to old data, so do it when
STRIPE_OP_BIODRAIN is set, but before ops_run_prexor5(). The result is
stored into sh->ppl_page.

Partial parity is not meaningful for full stripe write and is not stored
in the log or used for recovery, so don't attempt to calculate it when
stripe has STRIPE_FULL_WRITE.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 drivers/md/raid5.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/raid5.h |  2 ++
 2 files changed, 100 insertions(+)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index d1cba941951e..e1e238da32ba 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -466,6 +466,11 @@ static void shrink_buffers(struct stripe_head *sh)
 		sh->dev[i].page = NULL;
 		put_page(p);
 	}
+
+	if (sh->ppl_page) {
+		put_page(sh->ppl_page);
+		sh->ppl_page = NULL;
+	}
 }
 
 static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
@@ -482,6 +487,13 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
 		sh->dev[i].page = page;
 		sh->dev[i].orig_page = page;
 	}
+
+	if (test_bit(MD_HAS_PPL, &sh->raid_conf->mddev->flags)) {
+		sh->ppl_page = alloc_page(gfp);
+		if (!sh->ppl_page)
+			return 1;
+	}
+
 	return 0;
 }
 
@@ -1977,6 +1989,55 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
 			   &sh->ops.zero_sum_result, percpu->spare_page, &submit);
 }
 
+static struct dma_async_tx_descriptor *
+ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
+		       struct dma_async_tx_descriptor *tx)
+{
+	int disks = sh->disks;
+	struct page **xor_srcs = flex_array_get(percpu->scribble, 0);
+	int count = 0, pd_idx = sh->pd_idx, i;
+	struct async_submit_ctl submit;
+
+	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
+
+	/*
+	 * Partial parity is the XOR of stripe data chunks that are not changed
+	 * during the write request. Depending on available data
+	 * (read-modify-write vs. reconstruct-write case) we calculate it
+	 * differently.
+	 */
+	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
+		/* rmw: xor old data and parity from updated disks */
+		for (i = disks; i--;) {
+			struct r5dev *dev = &sh->dev[i];
+			if (test_bit(R5_Wantdrain, &dev->flags) || i == pd_idx)
+				xor_srcs[count++] = dev->page;
+		}
+	} else if (sh->reconstruct_state == reconstruct_state_drain_run) {
+		/* rcw: xor data from all not updated disks */
+		for (i = disks; i--;) {
+			struct r5dev *dev = &sh->dev[i];
+			if (test_bit(R5_UPTODATE, &dev->flags))
+				xor_srcs[count++] = dev->page;
+		}
+	} else {
+		return tx;
+	}
+
+	init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, tx, NULL, sh,
+			  flex_array_get(percpu->scribble, 0)
+			  + sizeof(struct page *) * (sh->disks + 2));
+
+	if (count == 1)
+		tx = async_memcpy(sh->ppl_page, xor_srcs[0], 0, 0, PAGE_SIZE,
+				  &submit);
+	else
+		tx = async_xor(sh->ppl_page, xor_srcs, 0, count, PAGE_SIZE,
+			       &submit);
+
+	return tx;
+}
+
 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 {
 	int overlap_clear = 0, i, disks = sh->disks;
@@ -2007,6 +2068,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 			async_tx_ack(tx);
 	}
 
+	if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
+		tx = ops_run_partial_parity(sh, percpu, tx);
+
 	if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
 		if (level < 6)
 			tx = ops_run_prexor5(sh, percpu, tx);
@@ -3058,6 +3122,12 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 		s->locked++;
 	}
 
+	if (level == 5 && test_bit(MD_HAS_PPL, &conf->mddev->flags) &&
+	    test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&
+	    !test_bit(STRIPE_FULL_WRITE, &sh->state) &&
+	    test_bit(R5_Insync, &sh->dev[pd_idx].flags))
+		set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request);
+
 	pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
 		__func__, (unsigned long long)sh->sector,
 		s->locked, s->ops_request);
@@ -3105,6 +3175,34 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 	if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
 		goto overlap;
 
+	if (forwrite && test_bit(MD_HAS_PPL, &conf->mddev->flags)) {
+		/*
+		 * With PPL only writes to consecutive data chunks within a
+		 * stripe are allowed. Not really an overlap, but
+		 * wait_for_overlap can be used to handle this.
+		 */
+		sector_t sector;
+		sector_t first = 0;
+		sector_t last = 0;
+		int count = 0;
+		int i;
+
+		for (i = 0; i < sh->disks; i++) {
+			if (i != sh->pd_idx &&
+			    (i == dd_idx || sh->dev[i].towrite)) {
+				sector = sh->dev[i].sector;
+				if (count == 0 || sector < first)
+					first = sector;
+				if (sector > last)
+					last = sector;
+				count++;
+			}
+		}
+
+		if (first + conf->chunk_sectors * (count - 1) != last)
+			goto overlap;
+	}
+
 	if (!forwrite || previous)
 		clear_bit(STRIPE_BATCH_READY, &sh->state);
 
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 0f64a58873de..88f1e52d9daf 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -228,6 +228,7 @@ struct stripe_head {
 	struct list_head	log_list;
 	sector_t		log_start; /* first meta block on the journal */
 	struct list_head	r5c; /* for r5c_cache->stripe_in_journal */
+	struct page		*ppl_page; /* partial parity of this stripe */
 	/**
 	 * struct stripe_operations
 	 * @target - STRIPE_OP_COMPUTE_BLK target
@@ -400,6 +401,7 @@ enum {
 	STRIPE_OP_BIODRAIN,
 	STRIPE_OP_RECONSTRUCT,
 	STRIPE_OP_CHECK,
+	STRIPE_OP_PARTIAL_PARITY,
 };
 
 /*
-- 
2.11.0