From: Stephen Rothwell <sfr@canb.auug.org.au>
To: Shaohua Li <shli@kernel.org>, Jens Axboe <axboe@kernel.dk>
Cc: linux-next@vger.kernel.org, linux-kernel@vger.kernel.org,
	Christoph Hellwig <hch@lst.de>, Song Liu <songliubraving@fb.com>
Subject: linux-next: manual merge of the md tree with the block tree
Date: Tue, 22 Nov 2016 15:50:05 +1100
Message-ID: <20161122155005.1e35f02b@canb.auug.org.au>

Hi Shaohua,

Today's linux-next merge of the md tree got a conflict in:

  drivers/md/raid5-cache.c

between commit:

  70fd76140a6c ("block,fs: use REQ_* flags directly")

from the block tree and commits:

  b4c625c67362 ("md/r5cache: r5cache recovery: part 1")
  3bddb7f8f264 ("md/r5cache: handle FLUSH and FUA")

from the md tree.

I fixed it up (I think - see below) and can carry the fix as necessary.
This is now fixed as far as linux-next is concerned, but any non-trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.
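
The conflict itself is mostly mechanical: commit 70fd76140a6c removes the
old composite WRITE_FLUSH/WRITE_FUA flags, while the new md code still sets
them via bio_set_op_attrs().  The resolution below just writes the request
flags directly, roughly along these lines (a sketch of the pattern only,
using a generic "bio" variable rather than the actual fields in the diff):

	/* md tree, before the block tree change: */
	bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
	bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FUA);

	/* with the block tree change applied: */
	bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
	bio->bi_opf = REQ_OP_WRITE | REQ_FUA;

The sync_page_io() callers change the same way, passing REQ_FUA instead of
WRITE_FUA as the op_flags argument.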

-- 
Cheers,
Stephen Rothwell

diff --cc drivers/md/raid5-cache.c
index 2bca090cd64e,8cb79fc0eed9..000000000000
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@@ -231,6 -544,64 +544,64 @@@ static void r5l_log_endio(struct bio *b
  
  	if (log->need_cache_flush)
  		md_wakeup_thread(log->rdev->mddev->thread);
+ 
+ 	if (io->has_null_flush) {
+ 		struct bio *bi;
+ 
+ 		WARN_ON(bio_list_empty(&io->flush_barriers));
+ 		while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) {
+ 			bio_endio(bi);
+ 			atomic_dec(&io->pending_stripe);
+ 		}
+ 		if (atomic_read(&io->pending_stripe) == 0)
+ 			__r5l_stripe_write_finished(io);
+ 	}
+ }
+ 
+ static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
+ {
+ 	unsigned long flags;
+ 
+ 	spin_lock_irqsave(&log->io_list_lock, flags);
+ 	__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
+ 	spin_unlock_irqrestore(&log->io_list_lock, flags);
+ 
+ 	if (io->has_flush)
 -		bio_set_op_attrs(io->current_bio, REQ_OP_WRITE, WRITE_FLUSH);
++		io->current_bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+ 	if (io->has_fua)
 -		bio_set_op_attrs(io->current_bio, REQ_OP_WRITE, WRITE_FUA);
++		io->current_bio->bi_opf = REQ_OP_WRITE | REQ_FUA;
+ 	submit_bio(io->current_bio);
+ 
+ 	if (!io->split_bio)
+ 		return;
+ 
+ 	if (io->has_flush)
 -		bio_set_op_attrs(io->split_bio, REQ_OP_WRITE, WRITE_FLUSH);
++		io->split_bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+ 	if (io->has_fua)
 -		bio_set_op_attrs(io->split_bio, REQ_OP_WRITE, WRITE_FUA);
++		io->split_bio->bi_opf = REQ_OP_WRITE | REQ_FUA;
+ 	submit_bio(io->split_bio);
+ }
+ 
+ /* deferred io_unit will be dispatched here */
+ static void r5l_submit_io_async(struct work_struct *work)
+ {
+ 	struct r5l_log *log = container_of(work, struct r5l_log,
+ 					   deferred_io_work);
+ 	struct r5l_io_unit *io = NULL;
+ 	unsigned long flags;
+ 
+ 	spin_lock_irqsave(&log->io_list_lock, flags);
+ 	if (!list_empty(&log->running_ios)) {
+ 		io = list_first_entry(&log->running_ios, struct r5l_io_unit,
+ 				      log_sibling);
+ 		if (!io->io_deferred)
+ 			io = NULL;
+ 		else
+ 			io->io_deferred = 0;
+ 	}
+ 	spin_unlock_irqrestore(&log->io_list_lock, flags);
+ 	if (io)
+ 		r5l_do_submit_io(log, io);
  }
  
  static void r5l_submit_current_io(struct r5l_log *log)
@@@ -892,82 -1517,138 +1517,138 @@@ static int r5l_recovery_read_meta_block
  	return 0;
  }
  
- static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
- 					 struct r5l_recovery_ctx *ctx,
- 					 sector_t stripe_sect,
- 					 int *offset, sector_t *log_offset)
+ static void
+ r5l_recovery_create_empty_meta_block(struct r5l_log *log,
+ 				     struct page *page,
+ 				     sector_t pos, u64 seq)
  {
- 	struct r5conf *conf = log->rdev->mddev->private;
- 	struct stripe_head *sh;
- 	struct r5l_payload_data_parity *payload;
- 	int disk_index;
+ 	struct r5l_meta_block *mb;
+ 	u32 crc;
  
- 	sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
- 	while (1) {
- 		payload = page_address(ctx->meta_page) + *offset;
+ 	mb = page_address(page);
+ 	clear_page(mb);
+ 	mb->magic = cpu_to_le32(R5LOG_MAGIC);
+ 	mb->version = R5LOG_VERSION;
+ 	mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
+ 	mb->seq = cpu_to_le64(seq);
+ 	mb->position = cpu_to_le64(pos);
+ 	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
+ 	mb->checksum = cpu_to_le32(crc);
+ }
  
- 		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
- 			raid5_compute_sector(conf,
- 					     le64_to_cpu(payload->location), 0,
- 					     &disk_index, sh);
+ static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
+ 					  u64 seq)
+ {
+ 	struct page *page;
  
- 			sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
- 				     sh->dev[disk_index].page, REQ_OP_READ, 0,
- 				     false);
- 			sh->dev[disk_index].log_checksum =
- 				le32_to_cpu(payload->checksum[0]);
- 			set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
- 			ctx->meta_total_blocks += BLOCK_SECTORS;
- 		} else {
- 			disk_index = sh->pd_idx;
- 			sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
- 				     sh->dev[disk_index].page, REQ_OP_READ, 0,
- 				     false);
- 			sh->dev[disk_index].log_checksum =
- 				le32_to_cpu(payload->checksum[0]);
- 			set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
- 
- 			if (sh->qd_idx >= 0) {
- 				disk_index = sh->qd_idx;
- 				sync_page_io(log->rdev,
- 					     r5l_ring_add(log, *log_offset, BLOCK_SECTORS),
- 					     PAGE_SIZE, sh->dev[disk_index].page,
- 					     REQ_OP_READ, 0, false);
- 				sh->dev[disk_index].log_checksum =
- 					le32_to_cpu(payload->checksum[1]);
- 				set_bit(R5_Wantwrite,
- 					&sh->dev[disk_index].flags);
- 			}
- 			ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
- 		}
+ 	page = alloc_page(GFP_KERNEL);
+ 	if (!page)
+ 		return -ENOMEM;
+ 	r5l_recovery_create_empty_meta_block(log, page, pos, seq);
+ 	if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
 -			  WRITE_FUA, false)) {
++			  REQ_FUA, false)) {
+ 		__free_page(page);
+ 		return -EIO;
+ 	}
+ 	__free_page(page);
+ 	return 0;
+ }
  
- 		*log_offset = r5l_ring_add(log, *log_offset,
- 					   le32_to_cpu(payload->size));
- 		*offset += sizeof(struct r5l_payload_data_parity) +
- 			sizeof(__le32) *
- 			(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
- 		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
- 			break;
+ /*
+  * r5l_recovery_load_data and r5l_recovery_load_parity use the R5_Wantwrite
+  * flag to mark valid (potentially not flushed) data in the journal.
+  *
+  * We already verified checksum in r5l_recovery_verify_data_checksum_for_mb,
+  * so there should not be any mismatch here.
+  */
+ static void r5l_recovery_load_data(struct r5l_log *log,
+ 				   struct stripe_head *sh,
+ 				   struct r5l_recovery_ctx *ctx,
+ 				   struct r5l_payload_data_parity *payload,
+ 				   sector_t log_offset)
+ {
+ 	struct mddev *mddev = log->rdev->mddev;
+ 	struct r5conf *conf = mddev->private;
+ 	int dd_idx;
+ 
+ 	raid5_compute_sector(conf,
+ 			     le64_to_cpu(payload->location), 0,
+ 			     &dd_idx, sh);
+ 	sync_page_io(log->rdev, log_offset, PAGE_SIZE,
+ 		     sh->dev[dd_idx].page, REQ_OP_READ, 0, false);
+ 	sh->dev[dd_idx].log_checksum =
+ 		le32_to_cpu(payload->checksum[0]);
+ 	ctx->meta_total_blocks += BLOCK_SECTORS;
+ 
+ 	set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags);
+ 	set_bit(STRIPE_R5C_CACHING, &sh->state);
+ }
+ 
+ static void r5l_recovery_load_parity(struct r5l_log *log,
+ 				     struct stripe_head *sh,
+ 				     struct r5l_recovery_ctx *ctx,
+ 				     struct r5l_payload_data_parity *payload,
+ 				     sector_t log_offset)
+ {
+ 	struct mddev *mddev = log->rdev->mddev;
+ 	struct r5conf *conf = mddev->private;
+ 
+ 	ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
+ 	sync_page_io(log->rdev, log_offset, PAGE_SIZE,
+ 		     sh->dev[sh->pd_idx].page, REQ_OP_READ, 0, false);
+ 	sh->dev[sh->pd_idx].log_checksum =
+ 		le32_to_cpu(payload->checksum[0]);
+ 	set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags);
+ 
+ 	if (sh->qd_idx >= 0) {
+ 		sync_page_io(log->rdev,
+ 			     r5l_ring_add(log, log_offset, BLOCK_SECTORS),
+ 			     PAGE_SIZE, sh->dev[sh->qd_idx].page,
+ 			     REQ_OP_READ, 0, false);
+ 		sh->dev[sh->qd_idx].log_checksum =
+ 			le32_to_cpu(payload->checksum[1]);
+ 		set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags);
  	}
+ 	clear_bit(STRIPE_R5C_CACHING, &sh->state);
+ }
  
- 	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
- 		void *addr;
- 		u32 checksum;
+ static void r5l_recovery_reset_stripe(struct stripe_head *sh)
+ {
+ 	int i;
  
+ 	sh->state = 0;
+ 	sh->log_start = MaxSector;
+ 	for (i = sh->disks; i--; )
+ 		sh->dev[i].flags = 0;
+ }
+ 
+ static void
+ r5l_recovery_replay_one_stripe(struct r5conf *conf,
+ 			       struct stripe_head *sh,
+ 			       struct r5l_recovery_ctx *ctx)
+ {
+ 	struct md_rdev *rdev, *rrdev;
+ 	int disk_index;
+ 	int data_count = 0;
+ 
+ 	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
  		if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
  			continue;
- 		addr = kmap_atomic(sh->dev[disk_index].page);
- 		checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
- 		kunmap_atomic(addr);
- 		if (checksum != sh->dev[disk_index].log_checksum)
- 			goto error;
+ 		if (disk_index == sh->qd_idx || disk_index == sh->pd_idx)
+ 			continue;
+ 		data_count++;
  	}
  
- 	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
- 		struct md_rdev *rdev, *rrdev;
+ 	/*
+ 	 * stripes that only have parity must have been flushed
+ 	 * before the crash that we are now recovering from, so
+ 	 * there is nothing more to recover.
+ 	 */
+ 	if (data_count == 0)
+ 		goto out;
  
- 		if (!test_and_clear_bit(R5_Wantwrite,
- 					&sh->dev[disk_index].flags))
+ 	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
+ 		if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
  			continue;
  
  		/* in case device is broken */
@@@ -1031,31 -1981,158 +1981,158 @@@ static int r5c_recovery_flush_log(struc
  		ctx->seq++;
  		ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
  	}
+ 
+ 	if (ret == -ENOMEM) {
+ 		r5c_recovery_drop_stripes(&ctx->cached_list, ctx);
+ 		return ret;
+ 	}
+ 
+ 	/* replay data-parity stripes */
+ 	r5c_recovery_replay_stripes(&ctx->cached_list, ctx);
+ 
+ 	/* load data-only stripes to stripe cache */
+ 	list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
+ 		WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
+ 		r5c_recovery_load_one_stripe(log, sh);
+ 		list_del_init(&sh->lru);
+ 		raid5_release_stripe(sh);
+ 		ctx->data_only_stripes++;
+ 	}
+ 
+ 	return 0;
  }
  
- static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
- 					  u64 seq)
+ /*
+  * We did a recovery. Now ctx.pos points to an invalid meta block, where
+  * the new log will start. But we can't let the superblock point to the
+  * last valid meta block. The log might look like:
+  * | meta 1| meta 2| meta 3|
+  * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If the
+  * superblock points to meta 1 and we write a new valid meta 2n, then if
+  * a crash happens again, the next recovery will start from meta 1. Since
+  * meta 2n is valid now, recovery will think meta 3 is valid too, which
+  * is wrong.
+  * The solution is to create a new meta block in meta 2's place with its
+  * seq == meta 1's seq + 10 and let the superblock point to it. That
+  * recovery will then not treat meta 3 as a valid meta block, because its
+  * seq doesn't match.
+  */
+ 
+ /*
+  * Before recovery, the log looks like the following
+  *
+  *   ---------------------------------------------
+  *   |           valid log        | invalid log  |
+  *   ---------------------------------------------
+  *   ^
+  *   |- log->last_checkpoint
+  *   |- log->last_cp_seq
+  *
+  * Now we scan through the log until we see invalid entry
+  *
+  *   ---------------------------------------------
+  *   |           valid log        | invalid log  |
+  *   ---------------------------------------------
+  *   ^                            ^
+  *   |- log->last_checkpoint      |- ctx->pos
+  *   |- log->last_cp_seq          |- ctx->seq
+  *
+  * From this point, we need to increase seq number by 10 to avoid
+  * confusing next recovery.
+  *
+  *   ---------------------------------------------
+  *   |           valid log        | invalid log  |
+  *   ---------------------------------------------
+  *   ^                              ^
+  *   |- log->last_checkpoint        |- ctx->pos+1
+  *   |- log->last_cp_seq            |- ctx->seq+11
+  *
+  * However, it is not safe to start the state machine yet, because the data
+  * in the data only stripes is not yet secured in the RAID. To save these
+  * data only stripes, we rewrite them to the journal starting from seq+11.
+  *
+  *   -----------------------------------------------------------------
+  *   |           valid log        | data only stripes | invalid log  |
+  *   -----------------------------------------------------------------
+  *   ^                                                ^
+  *   |- log->last_checkpoint                          |- ctx->pos+n
+  *   |- log->last_cp_seq                              |- ctx->seq+10+n
+  *
+  * If a failure happens again during this process, the recovery can safely
+  * start again from log->last_checkpoint.
+  *
+  * Once data only stripes are rewritten to journal, we move log_tail
+  *
+  *   -----------------------------------------------------------------
+  *   |     old log        |    data only stripes    | invalid log  |
+  *   -----------------------------------------------------------------
+  *                        ^                         ^
+  *                        |- log->last_checkpoint   |- ctx->pos+n
+  *                        |- log->last_cp_seq       |- ctx->seq+10+n
+  *
+  * Then we can safely start the state machine. If a failure happens from this
+  * point on, the recovery will start from the new log->last_checkpoint.
+  */
+ static int
+ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
+ 				       struct r5l_recovery_ctx *ctx)
  {
+ 	struct stripe_head *sh;
+ 	struct mddev *mddev = log->rdev->mddev;
  	struct page *page;
- 	struct r5l_meta_block *mb;
- 	u32 crc;
  
- 	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
- 	if (!page)
+ 	page = alloc_page(GFP_KERNEL);
+ 	if (!page) {
+ 		pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n",
+ 		       mdname(mddev));
  		return -ENOMEM;
- 	mb = page_address(page);
- 	mb->magic = cpu_to_le32(R5LOG_MAGIC);
- 	mb->version = R5LOG_VERSION;
- 	mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
- 	mb->seq = cpu_to_le64(seq);
- 	mb->position = cpu_to_le64(pos);
- 	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
- 	mb->checksum = cpu_to_le32(crc);
+ 	}
  
- 	if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
- 			  REQ_FUA, false)) {
- 		__free_page(page);
- 		return -EIO;
+ 	ctx->seq += 10;
+ 	list_for_each_entry(sh, &ctx->cached_list, lru) {
+ 		struct r5l_meta_block *mb;
+ 		int i;
+ 		int offset;
+ 		sector_t write_pos;
+ 
+ 		WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
+ 		r5l_recovery_create_empty_meta_block(log, page,
+ 						     ctx->pos, ctx->seq);
+ 		mb = page_address(page);
+ 		offset = le32_to_cpu(mb->meta_size);
+ 		write_pos = ctx->pos + BLOCK_SECTORS;
+ 
+ 		for (i = sh->disks; i--; ) {
+ 			struct r5dev *dev = &sh->dev[i];
+ 			struct r5l_payload_data_parity *payload;
+ 			void *addr;
+ 
+ 			if (test_bit(R5_InJournal, &dev->flags)) {
+ 				payload = (void *)mb + offset;
+ 				payload->header.type = cpu_to_le16(
+ 					R5LOG_PAYLOAD_DATA);
+ 				payload->size = BLOCK_SECTORS;
+ 				payload->location = cpu_to_le64(
+ 					raid5_compute_blocknr(sh, i, 0));
+ 				addr = kmap_atomic(dev->page);
+ 				payload->checksum[0] = cpu_to_le32(
+ 					crc32c_le(log->uuid_checksum, addr,
+ 						  PAGE_SIZE));
+ 				kunmap_atomic(addr);
+ 				sync_page_io(log->rdev, write_pos, PAGE_SIZE,
+ 					     dev->page, REQ_OP_WRITE, 0, false);
+ 				write_pos = r5l_ring_add(log, write_pos,
+ 							 BLOCK_SECTORS);
+ 				offset += sizeof(__le32) +
+ 					sizeof(struct r5l_payload_data_parity);
+ 
+ 			}
+ 		}
+ 		mb->meta_size = cpu_to_le32(offset);
+ 		mb->checksum = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
+ 		sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
 -			     REQ_OP_WRITE, WRITE_FUA, false);
++			     REQ_OP_WRITE, REQ_FUA, false);
+ 		sh->log_start = ctx->pos;
+ 		ctx->pos = write_pos;
+ 		ctx->seq += 1;
  	}
  	__free_page(page);
  	return 0;
