All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/9]raid5: fix write hole
@ 2015-07-30  0:38 Shaohua Li
  2015-07-30  0:38 ` [PATCH 1/9] MD: add a new disk role to present cache device Shaohua Li
                   ` (8 more replies)
  0 siblings, 9 replies; 30+ messages in thread
From: Shaohua Li @ 2015-07-30  0:38 UTC (permalink / raw)
  To: linux-raid; +Cc: Kernel-team, songliubraving, hch, dan.j.williams, neilb

Neil,

These are the patches to fix write hole issue. It did everything you requested.
- one meta data type format
- no super block
- format suitable for future extention of caching.
- just fix write hole issue, no caching support

The recovery doesn't use stripe cache yet, need further tweak of the stripe
cache code. I hope it's easy enough. Please let me know how you think.

Thanks,
Shaohua

Shaohua Li (8):
  md: override md superblock recovery_offset for cache device
  raid5: add basic stripe log
  raid5: log reclaim support
  raid5: log recovery
  raid5: disable batch with log enabled
  raid5: don't allow resize/reshape with cache(log) support
  raid5: enable log for raid array with cache disk
  raid5: skip resync if cache(log) is enabled

Song Liu (1):
  MD: add a new disk role to present cache device

 drivers/md/Makefile            |    2 +-
 drivers/md/md.c                |   28 +-
 drivers/md/md.h                |    4 +
 drivers/md/raid5-cache.c       | 1242 ++++++++++++++++++++++++++++++++++++++++
 drivers/md/raid5.c             |   51 +-
 drivers/md/raid5.h             |   19 +
 include/uapi/linux/raid/md_p.h |   51 ++
 7 files changed, 1388 insertions(+), 9 deletions(-)
 create mode 100644 drivers/md/raid5-cache.c

-- 
1.8.5.6


^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH 1/9] MD: add a new disk role to present cache device
  2015-07-30  0:38 [PATCH 0/9]raid5: fix write hole Shaohua Li
@ 2015-07-30  0:38 ` Shaohua Li
  2015-08-04 14:28   ` Christoph Hellwig
  2015-08-05  1:05   ` NeilBrown
  2015-07-30  0:38 ` [PATCH 2/9] md: override md superblock recovery_offset for " Shaohua Li
                   ` (7 subsequent siblings)
  8 siblings, 2 replies; 30+ messages in thread
From: Shaohua Li @ 2015-07-30  0:38 UTC (permalink / raw)
  To: linux-raid; +Cc: Kernel-team, songliubraving, hch, dan.j.williams, neilb

From: Song Liu <songliubraving@fb.com>

Next patches will use a disk as raid5/6 caching. We need a new disk role
to present the cache device and add MD_FEATURE_WRITE_CACHE to
feature_map for backward compability.

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md.c                | 24 ++++++++++++++++++++++--
 drivers/md/md.h                |  4 ++++
 include/uapi/linux/raid/md_p.h |  3 +++
 3 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index d429c30..fd84f16 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1656,6 +1656,16 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
 		case 0xfffe: /* faulty */
 			set_bit(Faulty, &rdev->flags);
 			break;
+		case 0xfffd: /* cache device */
+			if (!(sb->feature_map & MD_FEATURE_WRITE_CACHE)) {
+				/* cache device without cache feature */
+				printk(KERN_WARNING
+				  "md: cache device provided without write "
+				  "cache feature, ignoring the device\n");
+				return -EINVAL;
+			}
+			set_bit(WriteCache, &rdev->flags);
+			break;
 		default:
 			rdev->saved_raid_disk = role;
 			if ((le32_to_cpu(sb->feature_map) &
@@ -1811,7 +1821,10 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
 			sb->dev_roles[i] = cpu_to_le16(0xfffe);
 		else if (test_bit(In_sync, &rdev2->flags))
 			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
-		else if (rdev2->raid_disk >= 0)
+		else if (test_bit(WriteCache, &rdev2->flags)) {
+			sb->dev_roles[i] = cpu_to_le16(0xfffd);
+			sb->feature_map |= cpu_to_le32(MD_FEATURE_WRITE_CACHE);
+		} else if (rdev2->raid_disk >= 0)
 			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
 		else
 			sb->dev_roles[i] = cpu_to_le16(0xffff);
@@ -5803,7 +5816,8 @@ static int get_disk_info(struct mddev *mddev, void __user * arg)
 		else if (test_bit(In_sync, &rdev->flags)) {
 			info.state |= (1<<MD_DISK_ACTIVE);
 			info.state |= (1<<MD_DISK_SYNC);
-		}
+		} else if (test_bit(WriteCache, &rdev->flags))
+			info.state |= (1<<MD_DISK_WRITECACHE);
 		if (test_bit(WriteMostly, &rdev->flags))
 			info.state |= (1<<MD_DISK_WRITEMOSTLY);
 	} else {
@@ -5918,6 +5932,8 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
 		else
 			clear_bit(WriteMostly, &rdev->flags);
 
+		if (info->state & (1<<MD_DISK_WRITECACHE))
+			set_bit(WriteCache, &rdev->flags);
 		/*
 		 * check whether the device shows up in other nodes
 		 */
@@ -7286,6 +7302,10 @@ static int md_seq_show(struct seq_file *seq, void *v)
 				seq_printf(seq, "(F)");
 				continue;
 			}
+			if (test_bit(WriteCache, &rdev->flags)) {
+				seq_printf(seq, "(C)");
+				continue;
+			}
 			if (rdev->raid_disk < 0)
 				seq_printf(seq, "(S)"); /* spare */
 			if (test_bit(Replacement, &rdev->flags))
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 7da6e9c..a9f27db 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -176,6 +176,10 @@ enum flag_bits {
 				 * This device is seen locally but not
 				 * by the whole cluster
 				 */
+	WriteCache,		/* This device is used as write cache.
+				 * Usually, this device should be faster
+				 * than other devices in the array
+				 */
 };
 
 #define BB_LEN_MASK	(0x00000000000001FFULL)
diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
index 2ae6131..8c8e12c 100644
--- a/include/uapi/linux/raid/md_p.h
+++ b/include/uapi/linux/raid/md_p.h
@@ -89,6 +89,7 @@
 				   * read requests will only be sent here in
 				   * dire need
 				   */
+#define MD_DISK_WRITECACHE      18 /* disk is used as the write cache in RAID-5/6 */
 
 typedef struct mdp_device_descriptor_s {
 	__u32 number;		/* 0 Device number in the entire set	      */
@@ -302,6 +303,7 @@ struct mdp_superblock_1 {
 #define	MD_FEATURE_RECOVERY_BITMAP	128 /* recovery that is happening
 					     * is guided by bitmap.
 					     */
+#define	MD_FEATURE_WRITE_CACHE		256 /* support write cache */
 #define	MD_FEATURE_ALL			(MD_FEATURE_BITMAP_OFFSET	\
 					|MD_FEATURE_RECOVERY_OFFSET	\
 					|MD_FEATURE_RESHAPE_ACTIVE	\
@@ -310,6 +312,7 @@ struct mdp_superblock_1 {
 					|MD_FEATURE_RESHAPE_BACKWARDS	\
 					|MD_FEATURE_NEW_OFFSET		\
 					|MD_FEATURE_RECOVERY_BITMAP	\
+					|MD_FEATURE_WRITE_CACHE		\
 					)
 
 #endif
-- 
1.8.5.6


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 2/9] md: override md superblock recovery_offset for cache device
  2015-07-30  0:38 [PATCH 0/9]raid5: fix write hole Shaohua Li
  2015-07-30  0:38 ` [PATCH 1/9] MD: add a new disk role to present cache device Shaohua Li
@ 2015-07-30  0:38 ` Shaohua Li
  2015-08-04 14:30   ` Christoph Hellwig
  2015-08-05  1:08   ` NeilBrown
  2015-07-30  0:38 ` [PATCH 3/9] raid5: add basic stripe log Shaohua Li
                   ` (6 subsequent siblings)
  8 siblings, 2 replies; 30+ messages in thread
From: Shaohua Li @ 2015-07-30  0:38 UTC (permalink / raw)
  To: linux-raid; +Cc: Kernel-team, songliubraving, hch, dan.j.williams, neilb

Cache device stores data in a log structure. We need record the log
start. Here we override md superblock recovery_offset for this purpose.
This field of a cache device is meaningless otherwise.

Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index fd84f16..9861f34 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1664,6 +1664,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
 				  "cache feature, ignoring the device\n");
 				return -EINVAL;
 			}
+			rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
 			set_bit(WriteCache, &rdev->flags);
 			break;
 		default:
@@ -1830,6 +1831,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
 			sb->dev_roles[i] = cpu_to_le16(0xffff);
 	}
 
+	if (test_bit(WriteCache, &rdev->flags))
+		sb->recovery_offset = cpu_to_le64(rdev->recovery_offset);
+
 	sb->sb_csum = calc_sb_1_csum(sb);
 }
 
-- 
1.8.5.6


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 3/9] raid5: add basic stripe log
  2015-07-30  0:38 [PATCH 0/9]raid5: fix write hole Shaohua Li
  2015-07-30  0:38 ` [PATCH 1/9] MD: add a new disk role to present cache device Shaohua Li
  2015-07-30  0:38 ` [PATCH 2/9] md: override md superblock recovery_offset for " Shaohua Li
@ 2015-07-30  0:38 ` Shaohua Li
  2015-08-05  3:07   ` NeilBrown
  2015-07-30  0:38 ` [PATCH 4/9] raid5: log reclaim support Shaohua Li
                   ` (5 subsequent siblings)
  8 siblings, 1 reply; 30+ messages in thread
From: Shaohua Li @ 2015-07-30  0:38 UTC (permalink / raw)
  To: linux-raid; +Cc: Kernel-team, songliubraving, hch, dan.j.williams, neilb

This introduces a simple log for raid5. Data/parity writting to raid
array first writes to the log, then write to raid array disks. If crash
happens, we can recovery data from the log. This can speed up raid
resync and fix write hole issue.

The log structure is pretty simple. Data/meta data is stored in block
unit, which is 4k generally. It has only one type of meta data block.
The meta data block can track 3 types of data, stripe data, stripe
parity and flush block. MD superblock will point to the last valid meta
data block. Each meta data block has checksum/seq number, so recovery
can scan the log correctly. We store a checksum of stripe data/parity to
the metadata block, so meta data and stripe data/parity can be written
to log disk together. otherwise, meta data write must wait till stripe
data/parity is finished.

For stripe data, meta data block will record stripe data sector and
size. Currently the size is always 4k. This meta data record can be made
simpler if we just fix write hole (eg, we can record data of a stripe's
different disks together), but this format can be extended to support
caching in the future, which must record data address/size.

For stripe parity, meta data block will record stripe sector. It's size
should be 4k (for raid5) or 8k (for raid6). We always store p parity
first. This format should work for caching too.

flush block indicates a stripe is in raid array disks. Fixing write hole
doesn't need this type of meta data, it's for caching extention.

We should be careful about deadlock. Appending stripe data/parity to log
is done in raid5d. The append need log space, which can trigger log
reclaim, which might wait for raid5d to run (to flush data from log to
raid array disks). Here we always do the log write in a separate thread.

Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/Makefile            |   2 +-
 drivers/md/raid5-cache.c       | 677 +++++++++++++++++++++++++++++++++++++++++
 drivers/md/raid5.c             |   8 +-
 drivers/md/raid5.h             |  10 +
 include/uapi/linux/raid/md_p.h |  48 +++
 5 files changed, 741 insertions(+), 4 deletions(-)
 create mode 100644 drivers/md/raid5-cache.c

diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 462f443..f34979c 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -17,7 +17,7 @@ dm-cache-smq-y   += dm-cache-policy-smq.o
 dm-cache-cleaner-y += dm-cache-policy-cleaner.o
 dm-era-y	+= dm-era-target.o
 md-mod-y	+= md.o bitmap.o
-raid456-y	+= raid5.o
+raid456-y	+= raid5.o raid5-cache.o
 
 # Note: link order is important.  All raid personalities
 # and must come before md.o, as they each initialise 
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
new file mode 100644
index 0000000..5c9bab6
--- /dev/null
+++ b/drivers/md/raid5-cache.c
@@ -0,0 +1,677 @@
+/*
+ * Copyright (C) 2015 Shaohua Li <shli@fb.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#include <linux/kernel.h>
+#include <linux/wait.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/raid/md_p.h>
+#include <linux/crc32.h>
+#include <linux/random.h>
+#include "md.h"
+#include "raid5.h"
+
+typedef u64 r5blk_t; /* log blocks, 512B - 4k */
+
+struct r5l_log {
+	struct mddev *mddev;
+	struct md_rdev *rdev;
+
+	u32 uuid_checksum;
+
+	unsigned int block_size; /* bytes */
+	unsigned int block_sector_shift;
+	unsigned int page_block_shift; /* page to block */
+
+	r5blk_t total_blocks;
+	r5blk_t first_block;
+	r5blk_t last_block;
+
+	r5blk_t last_checkpoint; /* log tail */
+	u64 last_cp_seq; /* log tail sequence */
+
+	u64 seq; /* current sequence */
+	r5blk_t log_start; /* current log position */
+
+	r5blk_t reserved_blocks;
+	wait_queue_head_t space_waitq;
+
+	struct mutex io_mutex;
+	struct r5l_io_unit *current_io;
+
+	spinlock_t io_list_lock;
+	struct list_head running_ios; /* running io_unit list */
+
+	struct kmem_cache *io_kc;
+
+	struct md_thread *log_thread;
+	struct list_head log_stripes;
+	spinlock_t log_stripes_lock;
+};
+
+/*
+ * an IO range starts from a meta data block and end at the next meta data
+ * block. The io unit's the meta data block tracks data/parity followed it. io
+ * unit is written to log disk with normal write, as we always flush log disk
+ * first and then start move data to raid disks, there is no requirement to
+ * write io unit with FLUSH/FUA
+ * */
+struct r5l_io_unit {
+	struct r5l_log *log;
+
+	struct page *meta_page; /* store meta block */
+	int meta_offset; /* current offset in meta_page */
+
+	struct bio_list bios;
+	struct bio *current_bio; /* current_bio accepting pages */
+	atomic_t pending_io; /* pending bios not writting to log */
+
+	atomic_t pending_stripe; /* how many stripes not flushed to raid */
+	u64 seq;
+	r5blk_t log_start; /* where the io_unit starts */
+	r5blk_t log_end; /* where the io_unit ends */
+	struct list_head log_sibling; /* log->running_ios */
+	struct list_head stripe_list; /* stripes added to the io_unit */
+	int state;
+	wait_queue_head_t wait_state;
+};
+
+/* r5l_io_unit state */
+enum {
+	IO_UNIT_RUNNING = 0, /* accepting new IO */
+	IO_UNIT_IO_START = 1, /* io_unit is writting to log */
+	IO_UNIT_IO_END = 2, /* io_unit is in log */
+	IO_UNIT_STRIPE_START = 3, /* stripes of io_unit are running */
+	IO_UNIT_STRIPE_END = 4, /* stripes data are in raid disks */
+};
+
+#define PAGE_SECTOR_SHIFT (PAGE_SHIFT - 9)
+
+static inline struct block_device *r5l_bdev(struct r5l_log *log)
+{
+	return log->rdev->bdev;
+}
+
+static inline sector_t r5l_block_to_sector(struct r5l_log *log, r5blk_t block)
+{
+	return block << log->block_sector_shift;
+}
+
+static inline r5blk_t r5l_sector_to_block(struct r5l_log *log, sector_t s)
+{
+	return s >> log->block_sector_shift;
+}
+
+static inline int r5l_page_blocks(struct r5l_log *log, int pages)
+{
+	return pages << log->page_block_shift;
+}
+
+static u32 r5l_calculate_checksum(struct r5l_log *log, u32 crc,
+	void *buf, size_t size)
+{
+	return crc32_le(crc, buf, size);
+}
+
+static r5blk_t r5l_ring_add(struct r5l_log *log, r5blk_t block, int inc)
+{
+	block += inc;
+	if (block >= log->last_block)
+		block = block - log->total_blocks;
+	return block;
+}
+
+static void r5l_wake_reclaim(struct r5l_log *log, r5blk_t space);
+static r5blk_t r5l_free_space(struct r5l_log *log)
+{
+	r5blk_t used_size;
+
+	if (log->log_start >= log->last_checkpoint)
+		used_size = log->log_start - log->last_checkpoint;
+	else
+		used_size = log->log_start + log->total_blocks -
+			log->last_checkpoint;
+
+	if (log->total_blocks > used_size + log->reserved_blocks)
+		return log->total_blocks - used_size - log->reserved_blocks;
+	return 0;
+}
+
+/* Make sure we have enough free space in log device */
+static void r5l_get_reserve(struct r5l_log *log, unsigned int size)
+{
+	r5blk_t free;
+
+	BUG_ON(!mutex_is_locked(&log->io_mutex));
+	free = r5l_free_space(log);
+	if (free >= size) {
+		log->reserved_blocks += size;
+		return;
+	}
+
+	log->reserved_blocks += size;
+	mutex_unlock(&log->io_mutex);
+
+	r5l_wake_reclaim(log, size);
+
+	mutex_lock(&log->io_mutex);
+	wait_event_cmd(log->space_waitq, r5l_free_space(log) > 0,
+		mutex_unlock(&log->io_mutex), mutex_lock(&log->io_mutex));
+}
+
+static void r5l_put_reserve(struct r5l_log *log, unsigned int reserved_blocks)
+{
+	BUG_ON(!mutex_is_locked(&log->io_mutex));
+
+	log->reserved_blocks -= reserved_blocks;
+	if (r5l_free_space(log) > 0)
+		wake_up(&log->space_waitq);
+}
+
+static struct r5l_io_unit *r5l_alloc_io_unit(struct r5l_log *log)
+{
+	struct r5l_io_unit *io;
+	gfp_t gfp = GFP_NOIO | __GFP_NOFAIL;
+
+	io = kmem_cache_zalloc(log->io_kc, gfp);
+	if (!io)
+		return NULL;
+	io->log = log;
+	io->meta_page = alloc_page(gfp | __GFP_ZERO);
+	if (!io->meta_page) {
+		kmem_cache_free(log->io_kc, io);
+		return NULL;
+	}
+
+	bio_list_init(&io->bios);
+	atomic_set(&io->pending_io, 1);
+	INIT_LIST_HEAD(&io->log_sibling);
+	INIT_LIST_HEAD(&io->stripe_list);
+	io->state = IO_UNIT_RUNNING;
+	init_waitqueue_head(&io->wait_state);
+	return io;
+}
+
+static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io)
+{
+	__free_page(io->meta_page);
+	kmem_cache_free(log->io_kc, io);
+}
+
+static void r5l_wait_io_unit_state(struct r5l_io_unit *io, int state)
+{
+	wait_event(io->wait_state, io->state >= state);
+}
+
+static void r5l_set_io_unit_state(struct r5l_io_unit *io, int state)
+{
+	if (io->state >= state)
+		return;
+	io->state = state;
+	wake_up(&io->wait_state);
+}
+
+static void r5l_submit_bio(struct r5l_log *log, int rw, struct bio *bio)
+{
+	/* all IO must start from rdev->data_offset */
+	bio->bi_iter.bi_sector += log->rdev->data_offset;
+	submit_bio(rw, bio);
+}
+
+static void r5l_io_unit_ioend(struct r5l_io_unit *io, int error)
+{
+	struct r5l_log *log = io->log;
+	if (!atomic_dec_and_test(&io->pending_io))
+		return;
+
+	r5l_set_io_unit_state(io, IO_UNIT_IO_END);
+	md_wakeup_thread(log->mddev->thread);
+}
+
+static void r5l_log_endio(struct bio *bio, int error)
+{
+	struct r5l_io_unit *io = bio->bi_private;
+
+	bio_put(bio);
+	r5l_io_unit_ioend(io, error);
+}
+
+static void r5l_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
+{
+	struct bio *bio;
+	while ((bio = bio_list_pop(&io->bios)))
+		r5l_submit_bio(log, WRITE, bio);
+
+	r5l_io_unit_ioend(io, 0);
+}
+
+static void r5l_submit_current_io(struct r5l_log *log)
+{
+	struct r5l_io_unit *io = log->current_io;
+	struct r5l_meta_block *block;
+	u32 crc;
+
+	if (!io)
+		return;
+
+	block = page_address(io->meta_page);
+	block->meta_size = cpu_to_le32(io->meta_offset);
+	crc = r5l_calculate_checksum(log, log->uuid_checksum,
+		block, log->block_size);
+	block->checksum = cpu_to_le32(crc);
+
+	log->current_io = NULL;
+	r5l_set_io_unit_state(io, IO_UNIT_IO_START);
+
+	r5l_submit_io(log, io);
+}
+
+static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
+{
+	struct r5l_io_unit *io;
+	struct r5l_meta_block *block;
+	struct bio *bio;
+
+	io = r5l_alloc_io_unit(log);
+
+	block = page_address(io->meta_page);
+	block->magic = cpu_to_le32(R5LOG_MAGIC);
+	block->version = R5LOG_VERSION;
+	block->block_size = cpu_to_le16(log->block_size);
+	block->seq = cpu_to_le64(log->seq);
+	block->position = cpu_to_le64(log->log_start);
+
+	io->log_start = log->log_start;
+	io->meta_offset = sizeof(struct r5l_meta_block);
+	io->seq = log->seq;
+
+	bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL,
+		bio_get_nr_vecs(r5l_bdev(log)));
+	io->current_bio = bio;
+	bio->bi_rw = WRITE;
+	bio->bi_bdev = r5l_bdev(log);
+	bio->bi_iter.bi_sector = r5l_block_to_sector(log, log->log_start);
+	bio_add_page(bio, io->meta_page, log->block_size, 0);
+	bio->bi_end_io = r5l_log_endio;
+	bio->bi_private = io;
+
+	bio_list_add(&io->bios, bio);
+	atomic_inc(&io->pending_io);
+
+	log->seq++;
+	log->log_start = r5l_ring_add(log, log->log_start, 1);
+	io->log_end = log->log_start;
+	/* current bio hit disk end */
+	if (log->log_start == log->first_block)
+		io->current_bio = NULL;
+
+	spin_lock(&log->io_list_lock);
+	list_add_tail(&io->log_sibling, &log->running_ios);
+	spin_unlock(&log->io_list_lock);
+
+	return io;
+}
+
+static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
+{
+	struct r5l_io_unit *io;
+
+	io = log->current_io;
+	if (io && io->meta_offset + payload_size > log->block_size)
+		r5l_submit_current_io(log);
+	io = log->current_io;
+	if (io)
+		return 0;
+
+	log->current_io = r5l_new_meta(log);
+	return 0;
+}
+
+static void r5l_log_pages(struct r5l_log *log, u16 type, sector_t location,
+	struct page *page1, u32 checksum1,
+	struct page *page2, u32 checksum2)
+{
+	struct r5l_io_unit *io = log->current_io;
+	struct r5l_payload_data_parity *payload;
+
+	payload = page_address(io->meta_page) + io->meta_offset;
+	payload->header.type = cpu_to_le16(type);
+	payload->header.flags = cpu_to_le16(0);
+	payload->blocks = cpu_to_le32(r5l_page_blocks(log, 1 + !!page2));
+	payload->location = cpu_to_le64(location);
+	payload->checksum[0] = cpu_to_le32(checksum1);
+	if (page2)
+		payload->checksum[1] = cpu_to_le32(checksum2);
+
+alloc_bio:
+	if (!io->current_bio) {
+		struct bio *bio;
+		bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL,
+			bio_get_nr_vecs(r5l_bdev(log)));
+		bio->bi_rw = WRITE;
+		bio->bi_bdev = r5l_bdev(log);
+		bio->bi_iter.bi_sector = r5l_block_to_sector(log, log->log_start);
+		bio->bi_end_io = r5l_log_endio;
+		bio->bi_private = io;
+		bio_list_add(&io->bios, bio);
+		atomic_inc(&io->pending_io);
+		io->current_bio = bio;
+	}
+	if (page1) {
+		if (!bio_add_page(io->current_bio, page1, PAGE_SIZE, 0)) {
+			io->current_bio = NULL;
+			goto alloc_bio;
+		}
+		log->log_start = r5l_ring_add(log, log->log_start,
+			r5l_page_blocks(log, 1));
+		/* current bio hit disk end */
+		if (log->log_start == log->first_block)
+			io->current_bio = NULL;
+	}
+
+	page1 = NULL;
+	if (page2) {
+		if (io->current_bio == NULL)
+			goto alloc_bio;
+		if (!bio_add_page(io->current_bio, page2, PAGE_SIZE, 0)) {
+			io->current_bio = NULL;
+			goto alloc_bio;
+		}
+		log->log_start = r5l_ring_add(log, log->log_start,
+			r5l_page_blocks(log, 1));
+		/* current bio hit disk end */
+		if (log->log_start == log->first_block)
+			io->current_bio = NULL;
+	}
+
+	io->meta_offset += sizeof(struct r5l_payload_data_parity) +
+		sizeof(__le32) * (1 + !!page2);
+	io->log_end = log->log_start;
+}
+
+static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh)
+{
+	int i;
+	int meta_size;
+	int write_disks = 0;
+	int data_pages, parity_pages;
+	struct r5l_io_unit *io;
+	int reserve;
+
+	for (i = 0; i < sh->disks; i++) {
+		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
+			continue;
+		write_disks++;
+	}
+	parity_pages = 1 + !!(sh->qd_idx >= 0);
+	data_pages = write_disks - parity_pages;
+
+	meta_size = (sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) *
+		data_pages + sizeof(struct r5l_payload_data_parity) +
+		sizeof(__le32) * parity_pages;
+
+	/* meta + data */
+	reserve = 1 + r5l_page_blocks(log, write_disks);
+	r5l_get_reserve(log, reserve);
+
+	r5l_get_meta(log, meta_size);
+	io = log->current_io;
+
+	for (i = 0; i < sh->disks; i++) {
+		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
+			continue;
+		if (i == sh->pd_idx || i == sh->qd_idx)
+			continue;
+		r5l_log_pages(log, R5LOG_PAYLOAD_DATA,
+			compute_blocknr(sh, i, 0),
+			sh->dev[i].page, sh->dev[i].log_checksum,
+			NULL, 0);
+	}
+	r5l_log_pages(log, R5LOG_PAYLOAD_PARITY,
+		sh->sector, sh->dev[sh->pd_idx].page,
+		sh->dev[sh->pd_idx].log_checksum,
+		sh->qd_idx >= 0 ? sh->dev[sh->qd_idx].page : NULL,
+		sh->qd_idx >= 0 ? sh->dev[sh->qd_idx].log_checksum : 0);
+
+	list_add_tail(&sh->log_list, &io->stripe_list);
+	atomic_inc(&io->pending_stripe);
+	sh->log_io = io;
+
+	r5l_put_reserve(log, reserve);
+}
+
+static void r5l_log_thread(struct md_thread *thread)
+{
+	struct mddev *mddev = thread->mddev;
+	struct r5conf *conf = mddev->private;
+	struct r5l_log *log = conf->log;
+	struct stripe_head *sh;
+	LIST_HEAD(list);
+	struct blk_plug plug;
+
+	if (!log)
+		return;
+
+	spin_lock(&log->log_stripes_lock);
+	list_splice_init(&log->log_stripes, &list);
+	spin_unlock(&log->log_stripes_lock);
+
+	if (list_empty(&list))
+		return;
+	mutex_lock(&log->io_mutex);
+	blk_start_plug(&plug);
+	while (!list_empty(&list)) {
+		sh = list_first_entry(&list, struct stripe_head, log_list);
+		list_del_init(&sh->log_list);
+		r5l_log_stripe(log, sh);
+	}
+	r5l_submit_current_io(log);
+	blk_finish_plug(&plug);
+	mutex_unlock(&log->io_mutex);
+}
+
+/*
+ * running in raid5d, where reclaim could wait for raid5d too (when it flushes
+ * data from log to raid disks), so we shouldn't wait for reclaim here
+ * */
+int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
+{
+	int write_disks = 0;
+	int data_pages, parity_pages;
+	int meta_size;
+	int i;
+
+	if (!log)
+		return -EAGAIN;
+	/* Don't support stripe batch */
+	if (sh->log_io ||!test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
+	    test_bit(STRIPE_SYNCING, &sh->state))
+		return -EAGAIN;
+
+	for (i = 0; i < sh->disks; i++) {
+		void *addr;
+		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
+			continue;
+		write_disks++;
+		addr = kmap_atomic(sh->dev[i].page);
+		sh->dev[i].log_checksum = r5l_calculate_checksum(log,
+			log->uuid_checksum, addr, PAGE_SIZE);
+		kunmap_atomic(addr);
+	}
+	parity_pages = 1 + !!(sh->qd_idx >= 0);
+	data_pages = write_disks - parity_pages;
+
+	meta_size = (sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) *
+		data_pages + sizeof(struct r5l_payload_data_parity) +
+		sizeof(__le32) * parity_pages;
+	/* Doesn't work with very big raid array */
+	if (meta_size + sizeof(struct r5l_meta_block) >
+			log->block_size)
+		return -EINVAL;
+
+	atomic_inc(&sh->count);
+
+	spin_lock(&log->log_stripes_lock);
+	list_add_tail(&sh->log_list, &log->log_stripes);
+	spin_unlock(&log->log_stripes_lock);
+	return 0;
+}
+
+void r5l_write_stripe_run(struct r5l_log *log)
+{
+	if (!log)
+		return;
+	md_wakeup_thread(log->log_thread);
+}
+
+static void r5l_wake_reclaim(struct r5l_log *log, r5blk_t space)
+{
+}
+
+static int r5l_recovery_log(struct r5l_log *log)
+{
+	/* fake recovery */
+	log->seq = log->last_cp_seq + 1;
+	log->log_start = r5l_ring_add(log, log->last_checkpoint, 1);
+	return 0;
+}
+
+static void r5l_write_super(struct r5l_log *log, sector_t cp)
+{
+	log->rdev->recovery_offset = cp;
+	md_update_sb(log->mddev, 1);
+}
+
+static int r5l_load_log(struct r5l_log *log)
+{
+	struct md_rdev *rdev = log->rdev;
+	struct page *page;
+	struct r5l_meta_block *mb;
+	sector_t cp = log->rdev->recovery_offset;
+	u32 stored_crc, expected_crc;
+	bool create_super = false;
+	int ret;
+
+	/* Make sure it's valid */
+	if (cp >= rdev->sectors)
+		cp = 0;
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	if (!sync_page_io(rdev, cp, PAGE_SIZE, page, READ, false)) {
+		ret = -EIO;
+		goto ioerr;
+	}
+	mb = page_address(page);
+
+	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
+		mb->version != R5LOG_VERSION ||
+		le16_to_cpu(mb->block_size) > PAGE_SIZE) {
+		create_super = true;
+		goto create;
+	}
+	stored_crc = le32_to_cpu(mb->checksum);
+	mb->checksum = 0;
+	expected_crc = r5l_calculate_checksum(log, log->uuid_checksum,
+		mb, le16_to_cpu(mb->block_size));
+	if (stored_crc != expected_crc) {
+		create_super = true;
+		goto create;
+	}
+	if (le64_to_cpu(mb->position) * (le16_to_cpu(mb->block_size) >> 9) !=
+		cp) {
+		create_super = true;
+		goto create;
+	}
+create:
+	if (create_super) {
+		log->block_size = PAGE_SIZE;
+		log->last_cp_seq = prandom_u32();
+		cp = (cp >> PAGE_SECTOR_SHIFT) << PAGE_SECTOR_SHIFT;
+		/* Make sure super points to correct address */
+		r5l_write_super(log, cp);
+	} else {
+		log->block_size = le16_to_cpu(mb->block_size);
+		log->last_cp_seq = le64_to_cpu(mb->seq);
+	}
+	log->block_sector_shift = ilog2(log->block_size >> 9);
+	log->page_block_shift = PAGE_SHIFT - ilog2(log->block_size);
+
+	log->first_block = 0;
+	log->total_blocks = r5l_sector_to_block(log, rdev->sectors);
+	log->last_block = log->first_block + log->total_blocks;
+	log->last_checkpoint = r5l_sector_to_block(log, cp);
+
+	__free_page(page);
+
+	return r5l_recovery_log(log);
+ioerr:
+	__free_page(page);
+	return ret;
+}
+
+int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
+{
+	struct r5l_log *log;
+
+	log = kzalloc(sizeof(*log), GFP_KERNEL);
+	if (!log)
+		return -ENOMEM;
+	log->mddev = rdev->mddev;
+	log->rdev = rdev;
+
+	log->uuid_checksum = r5l_calculate_checksum(log, ~0, rdev->mddev->uuid,
+		sizeof(rdev->mddev->uuid));
+
+	init_waitqueue_head(&log->space_waitq);
+	mutex_init(&log->io_mutex);
+
+	spin_lock_init(&log->io_list_lock);
+	INIT_LIST_HEAD(&log->running_ios);
+
+	log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
+	if (!log->io_kc)
+		goto io_kc;
+
+	INIT_LIST_HEAD(&log->log_stripes);
+	spin_lock_init(&log->log_stripes_lock);
+	log->log_thread = md_register_thread(r5l_log_thread,
+		log->mddev, "log");
+	if (!log->log_thread)
+		goto log_thread;
+
+	if (r5l_load_log(log))
+		goto error;
+
+	conf->log = log;
+	return 0;
+error:
+	md_unregister_thread(&log->log_thread);
+log_thread:
+	kmem_cache_destroy(log->io_kc);
+io_kc:
+	kfree(log);
+	return -EINVAL;
+}
+
+void r5l_exit_log(struct r5l_log *log)
+{
+	md_unregister_thread(&log->log_thread);
+
+	kmem_cache_destroy(log->io_kc);
+	kfree(log);
+}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 59e44e9..9608a44 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -899,6 +899,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 
 	might_sleep();
 
+	if (!r5l_write_stripe(conf->log, sh))
+		return;
 	for (i = disks; i--; ) {
 		int rw;
 		int replace_only = 0;
@@ -2478,8 +2480,6 @@ static void raid5_end_write_request(struct bio *bi, int error)
 		release_stripe(sh->batch_head);
 }
 
-static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
-
 static void raid5_build_block(struct stripe_head *sh, int i, int previous)
 {
 	struct r5dev *dev = &sh->dev[i];
@@ -2729,7 +2729,7 @@ static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
 	return new_sector;
 }
 
-static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
+sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
 {
 	struct r5conf *conf = sh->raid_conf;
 	int raid_disks = sh->disks;
@@ -3498,6 +3498,7 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 			WARN_ON(test_bit(R5_SkipCopy, &dev->flags));
 			WARN_ON(dev->page != dev->orig_page);
 		}
+
 	if (!discard_pending &&
 	    test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
 		clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
@@ -5746,6 +5747,7 @@ static int handle_active_stripes(struct r5conf *conf, int group,
 
 	for (i = 0; i < batch_size; i++)
 		handle_stripe(batch[i]);
+	r5l_write_stripe_run(conf->log);
 
 	cond_resched();
 
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 02c3bf8..a8daf39 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -223,6 +223,9 @@ struct stripe_head {
 	struct stripe_head	*batch_head; /* protected by stripe lock */
 	spinlock_t		batch_lock; /* only header's lock is useful */
 	struct list_head	batch_list; /* protected by head's batch lock*/
+
+	struct r5l_io_unit	*log_io;
+	struct list_head	log_list;
 	/**
 	 * struct stripe_operations
 	 * @target - STRIPE_OP_COMPUTE_BLK target
@@ -244,6 +247,7 @@ struct stripe_head {
 		struct bio	*toread, *read, *towrite, *written;
 		sector_t	sector;			/* sector of this page */
 		unsigned long	flags;
+		u32		log_checksum;
 	} dev[1]; /* allocated with extra space depending of RAID geometry */
 };
 
@@ -539,6 +543,7 @@ struct r5conf {
 	struct r5worker_group	*worker_groups;
 	int			group_cnt;
 	int			worker_cnt_per_group;
+	struct r5l_log		*log;
 };
 
 
@@ -605,4 +610,9 @@ static inline int algorithm_is_DDF(int layout)
 
 extern void md_raid5_kick_device(struct r5conf *conf);
 extern int raid5_set_cache_size(struct mddev *mddev, int size);
+extern sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
+extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
+extern void r5l_exit_log(struct r5l_log *log);
+extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
+extern void r5l_write_stripe_run(struct r5l_log *log);
 #endif
diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
index 8c8e12c..418b1ba 100644
--- a/include/uapi/linux/raid/md_p.h
+++ b/include/uapi/linux/raid/md_p.h
@@ -315,4 +315,52 @@ struct mdp_superblock_1 {
 					|MD_FEATURE_WRITE_CACHE		\
 					)
 
+struct r5l_payload_header {
+	__le16 type;
+	__le16 flags;
+} __attribute__ ((__packed__));
+
+enum {
+	R5LOG_PAYLOAD_DATA = 0,
+	R5LOG_PAYLOAD_PARITY = 1,
+	R5LOG_PAYLOAD_FLUSH = 2,
+};
+
+struct r5l_payload_data_parity {
+	struct r5l_payload_header header;
+	__le32 blocks; /* block. data/parity size. each 4k has a checksum */
+	__le64 location; /* sector. For data, it's raid sector. For
+				parity, it's stripe sector */
+	__le32 checksum[];
+} __attribute__ ((__packed__));
+
+enum {
+	R5LOG_PAYLOAD_FLAG_DISCARD = 1,
+};
+
+struct r5l_payload_flush {
+	struct r5l_payload_header header;
+	__le32 size; /* flush_stripes size, bytes */
+	__le64 flush_stripes[];
+} __attribute__ ((__packed__));
+
+enum {
+	R5LOG_PAYLOAD_FLAG_FLUSH_STRIPE = 1, /* data represents whole stripe */
+};
+
+struct r5l_meta_block {
+	__le32 magic;
+	__le32 checksum;
+	__u8 version;
+	__u8 __zero_pading;
+	__le16 block_size; /* 512B - 4k */
+	__le32 meta_size; /* whole size of the block */
+
+	__le64 seq;
+	__le64 position; /* block, start from rdev->data_offset, current position */
+	struct r5l_payload_header payloads[];
+} __attribute__ ((__packed__));
+
+#define R5LOG_VERSION 0x1
+#define R5LOG_MAGIC 0x6433c509
 #endif
-- 
1.8.5.6


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 4/9] raid5: log reclaim support
  2015-07-30  0:38 [PATCH 0/9]raid5: fix write hole Shaohua Li
                   ` (2 preceding siblings ...)
  2015-07-30  0:38 ` [PATCH 3/9] raid5: add basic stripe log Shaohua Li
@ 2015-07-30  0:38 ` Shaohua Li
  2015-08-05  3:43   ` NeilBrown
  2015-08-05  3:52   ` NeilBrown
  2015-07-30  0:38 ` [PATCH 5/9] raid5: log recovery Shaohua Li
                   ` (4 subsequent siblings)
  8 siblings, 2 replies; 30+ messages in thread
From: Shaohua Li @ 2015-07-30  0:38 UTC (permalink / raw)
  To: linux-raid; +Cc: Kernel-team, songliubraving, hch, dan.j.williams, neilb

This is the reclaim support for raid5 log. A stripe write will have
following steps:

1. reconstruct the stripe, read data/calculate parity. ops_run_io
prepares to write data/parity to raid disks
2. hijack ops_run_io. stripe data/parity is appending to log disk
3. flush log disk cache
4. ops_run_io run again and do normal operation. stripe data/parity is
written in raid array disks. raid core can return io to upper layer.
5. flush cache of all raid array disks
6. update super block
7. log disk space used by the stripe can be reused

In practice, several stripes consist of an io_unit and we will batch
several io_unit in different steps, but the whole process doesn't
change.

It's possible io return just after data/parity hit log disk, but then
read IO will need read from log disk. For simplicity, IO return happens
at step 4, where read IO can directly read from raid disks.

Currently reclaim run every minute or out of space. Reclaim is just to
free log disk spaces, it doesn't impact data consistency.

Recovery make sure raid disks and log disk have the same data of a
stripe. If crash happens before 4, recovery might/might not recovery
stripe's data/parity depending on if data/parity and its checksum
matches. In either case, this doesn't change the syntax of an IO write.
After step 3, stripe is guaranteed recoverable, because stripe's
data/parity is persistent in log disk. In some cases, log disk content
and raid disks content of a stripe are the same, but recovery will still
copy log disk content to raid disks, this doesn't impact data
consistency. space reuse happens after superblock update and cache
flush.

There is one situation we want to avoid. A broken meta in the middle of
a log causes recovery can't find meta at the head of log. If operations
require meta at the head persistent in log, we must make sure meta
before it persistent in log too. The case is stripe data/parity is in
log and we start write stripe to raid disks (before step 4). stripe
data/parity must be persistent in log before we do the write to raid
disks. The solution is we restrictly maintain io_unit list order. In
this case, we only write stripes of an io_unit to raid disks till the
io_unit is the first one whose data/parity is in log.

The io_unit list order is important for other cases too. For example,
some io_unit are reclaimable and others not. They can be mixed in the
list, we shouldn't reuse space of an unreclaimable io_unit.

Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5-cache.c | 261 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/raid5.c       |   8 +-
 drivers/md/raid5.h       |   3 +
 3 files changed, 271 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 5c9bab6..a418e45 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -25,6 +25,7 @@
 #include "raid5.h"
 
 typedef u64 r5blk_t; /* log blocks, 512B - 4k */
+#define RECLAIM_TIMEOUT (60 * HZ) /* reclaim run every 60s */
 
 struct r5l_log {
 	struct mddev *mddev;
@@ -54,9 +55,14 @@ struct r5l_log {
 
 	spinlock_t io_list_lock;
 	struct list_head running_ios; /* running io_unit list */
+	struct list_head io_end_ios; /* io end io_unit list */
+	struct list_head stripe_end_ios; /* stripe end io_unit list */
 
 	struct kmem_cache *io_kc;
 
+	struct md_thread *reclaim_thread;
+	r5blk_t reclaim_target; /* 0 means reclaiming possible io_unit */
+
 	struct md_thread *log_thread;
 	struct list_head log_stripes;
 	spinlock_t log_stripes_lock;
@@ -537,8 +543,246 @@ void r5l_write_stripe_run(struct r5l_log *log)
 	md_wakeup_thread(log->log_thread);
 }
 
+void r5l_stripe_write_finished(struct stripe_head *sh)
+{
+	struct r5l_io_unit *io;
+
+	/* Don't support stripe batch */
+	io = sh->log_io;
+	if (!io)
+		return;
+	sh->log_io = NULL;
+
+	if (!atomic_dec_and_test(&io->pending_stripe))
+		return;
+	r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
+}
+
+static void r5l_compress_stripe_end_list(struct r5l_log *log)
+{
+	struct r5l_io_unit *first, *last, *io;
+
+	if (list_empty(&log->stripe_end_ios))
+		return;
+	first = list_first_entry(&log->stripe_end_ios,
+		struct r5l_io_unit, log_sibling);
+	last = list_last_entry(&log->stripe_end_ios,
+		struct r5l_io_unit, log_sibling);
+	/* Keep 2 io_unit in the list, superblock points to the last one */
+	if (first == last)
+		return;
+	list_del(&first->log_sibling);
+	list_del(&last->log_sibling);
+	while (!list_empty(&log->stripe_end_ios)) {
+		io = list_first_entry(&log->stripe_end_ios,
+			struct r5l_io_unit, log_sibling);
+		list_del(&io->log_sibling);
+		first->log_end = io->log_end;
+		r5l_free_io_unit(log, io);
+	}
+	list_add_tail(&first->log_sibling, &log->stripe_end_ios);
+	list_add_tail(&last->log_sibling, &log->stripe_end_ios);
+}
+
+static void r5l_move_io_unit_list(struct list_head *from, struct list_head *to,
+	int state)
+{
+	struct r5l_io_unit *io;
+
+	while (!list_empty(from)) {
+		io = list_first_entry(from, struct r5l_io_unit, log_sibling);
+		/* don't change list order */
+		if (io->state >= state)
+			list_move_tail(&io->log_sibling, to);
+		else
+			break;
+	}
+}
+
+/*
+ * Starting dispatch IO to raid.
+ * io_unit(meta) consists of a log. There is one situation we want to avoid. A
+ * broken meta in the middle of a log causes recovery can't find meta at the
+ * head of log. If operations require meta at the head persistent in log, we
+ * must make sure meta before it persistent in log too. A case is:
+ *
+ * stripe data/parity is in log, we start write stripe to raid disks. stripe
+ * data/parity must be persistent in log before we do the write to raid disks.
+ *
+ * The solution is we restrictly maintain io_unit list order. In this case, we
+ * only write stripes of an io_unit to raid disks till the io_unit is the first
+ * one whose data/parity is in log.
+ * */
+void r5l_flush_stripe_to_raid(struct r5l_log *log)
+{
+	struct r5l_io_unit *io;
+	struct stripe_head *sh;
+	bool run_stripe;
+
+	if (!log)
+		return;
+	/* find io_unit with io end but stripes are not running */
+	spin_lock(&log->io_list_lock);
+	r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios,
+			IO_UNIT_IO_END);
+	r5l_move_io_unit_list(&log->io_end_ios, &log->stripe_end_ios,
+			IO_UNIT_STRIPE_END);
+	r5l_compress_stripe_end_list(log);
+	run_stripe = !list_empty(&log->io_end_ios);
+	spin_unlock(&log->io_list_lock);
+
+	if (!run_stripe)
+		return;
+
+	blkdev_issue_flush(r5l_bdev(log), GFP_NOIO, NULL);
+
+	spin_lock(&log->io_list_lock);
+	list_for_each_entry(io, &log->io_end_ios, log_sibling) {
+		if (io->state >= IO_UNIT_STRIPE_START)
+			continue;
+		r5l_set_io_unit_state(io, IO_UNIT_STRIPE_START);
+
+		while (!list_empty(&io->stripe_list)) {
+			sh = list_first_entry(&io->stripe_list,
+				struct stripe_head, log_list);
+			list_del_init(&sh->log_list);
+			set_bit(STRIPE_HANDLE, &sh->state);
+			release_stripe(sh);
+		}
+	}
+	spin_unlock(&log->io_list_lock);
+}
+
+static void r5l_disks_flush_end(struct bio *bio, int err)
+{
+	struct completion *io_complete = bio->bi_private;
+
+	complete(io_complete);
+	bio_put(bio);
+}
+
+static void r5l_flush_all_disks(struct r5l_log *log)
+{
+	struct mddev *mddev = log->mddev;
+	struct bio *bi;
+	DECLARE_COMPLETION_ONSTACK(io_complete);
+
+	bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
+	bi->bi_end_io = r5l_disks_flush_end;
+	bi->bi_private = &io_complete;
+
+	/* If bio hasn't payload, this function will just flush all disks */
+	md_flush_request(mddev, bi);
+
+	wait_for_completion_io(&io_complete);
+}
+
+static void r5l_kick_io_unit(struct r5l_log *log, struct r5l_io_unit *io)
+{
+	/* the log thread will log the io unit */
+	r5l_wait_io_unit_state(io, IO_UNIT_IO_END);
+	if (io->state < IO_UNIT_STRIPE_START)
+		r5l_flush_stripe_to_raid(log);
+	r5l_wait_io_unit_state(io, IO_UNIT_STRIPE_END);
+}
+
+static void r5l_write_super(struct r5l_log *log, sector_t cp);
+static void r5l_do_reclaim(struct r5l_log *log)
+{
+	struct r5l_io_unit *io, *last;
+	LIST_HEAD(list);
+	r5blk_t free = 0;
+	r5blk_t reclaim_target = xchg(&log->reclaim_target, 0);
+
+	spin_lock(&log->io_list_lock);
+	/*
+	 * move proper io_unit to reclaim list. We should not change the order.
+	 * reclaimable/unreclaimable io_unit can be mixed in the list, we
+	 * shouldn't reuse space of an unreclaimable io_unit
+	 * */
+	while (1) {
+		r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios,
+			IO_UNIT_IO_END);
+		r5l_move_io_unit_list(&log->io_end_ios, &log->stripe_end_ios,
+				IO_UNIT_STRIPE_END);
+		while (!list_empty(&log->stripe_end_ios)) {
+			io = list_first_entry(&log->stripe_end_ios,
+				struct r5l_io_unit, log_sibling);
+			list_move_tail(&io->log_sibling, &list);
+			free += (io->log_end - io->log_start +
+				log->total_blocks) % log->total_blocks;
+		}
+
+		if (free >= reclaim_target || (list_empty(&log->running_ios) &&
+		    list_empty(&log->io_end_ios) &&
+		    list_empty(&log->stripe_end_ios)))
+			break;
+
+		if (!list_empty(&log->io_end_ios)) {
+			io = list_first_entry(&log->io_end_ios,
+				struct r5l_io_unit, log_sibling);
+			spin_unlock(&log->io_list_lock);
+			/* nobody else can delete the io, we are safe */
+			r5l_kick_io_unit(log, io);
+			spin_lock(&log->io_list_lock);
+			continue;
+		}
+
+		if (!list_empty(&log->running_ios)) {
+			io = list_first_entry(&log->running_ios,
+				struct r5l_io_unit, log_sibling);
+			spin_unlock(&log->io_list_lock);
+			/* nobody else can delete the io, we are safe */
+			r5l_kick_io_unit(log, io);
+			spin_lock(&log->io_list_lock);
+			continue;
+		}
+	}
+	spin_unlock(&log->io_list_lock);
+
+	if (list_empty(&list))
+		return;
+
+	r5l_flush_all_disks(log);
+
+	/* super always point to last valid meta */
+	last = list_last_entry(&list, struct r5l_io_unit, log_sibling);
+	r5l_write_super(log, r5l_block_to_sector(log, last->log_start));
+
+	mutex_lock(&log->io_mutex);
+	log->last_checkpoint = last->log_start;
+	log->last_cp_seq = last->seq;
+	mutex_unlock(&log->io_mutex);
+	wake_up(&log->space_waitq);
+
+	while (!list_empty(&list)) {
+		io = list_first_entry(&list, struct r5l_io_unit, log_sibling);
+		list_del(&io->log_sibling);
+		r5l_free_io_unit(log, io);
+	}
+}
+
+static void r5l_reclaim_thread(struct md_thread *thread)
+{
+	struct mddev *mddev = thread->mddev;
+	struct r5conf *conf = mddev->private;
+	struct r5l_log *log = conf->log;
+
+	if (!log)
+		return;
+	r5l_do_reclaim(log);
+}
+
 static void r5l_wake_reclaim(struct r5l_log *log, r5blk_t space)
 {
+	r5blk_t target;
+
+	do {
+		target = log->reclaim_target;
+		if (space < target)
+			return;
+	} while (cmpxchg(&log->reclaim_target, target, space) != target);
+	md_wakeup_thread(log->reclaim_thread);
 }
 
 static int r5l_recovery_log(struct r5l_log *log)
@@ -642,11 +886,19 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 
 	spin_lock_init(&log->io_list_lock);
 	INIT_LIST_HEAD(&log->running_ios);
+	INIT_LIST_HEAD(&log->io_end_ios);
+	INIT_LIST_HEAD(&log->stripe_end_ios);
 
 	log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
 	if (!log->io_kc)
 		goto io_kc;
 
+	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
+		log->mddev, "reclaim");
+	if (!log->reclaim_thread)
+		goto reclaim_thread;
+	log->reclaim_thread->timeout = RECLAIM_TIMEOUT;
+
 	INIT_LIST_HEAD(&log->log_stripes);
 	spin_lock_init(&log->log_stripes_lock);
 	log->log_thread = md_register_thread(r5l_log_thread,
@@ -662,6 +914,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 error:
 	md_unregister_thread(&log->log_thread);
 log_thread:
+	md_unregister_thread(&log->reclaim_thread);
+reclaim_thread:
 	kmem_cache_destroy(log->io_kc);
 io_kc:
 	kfree(log);
@@ -670,6 +924,13 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 
 void r5l_exit_log(struct r5l_log *log)
 {
+	/*
+	 * at this point all stripes are finished, so io_unit is at least in
+	 * STRIPE_END state
+	 * */
+	r5l_wake_reclaim(log, -1L);
+	md_unregister_thread(&log->reclaim_thread);
+	r5l_do_reclaim(log);
 	md_unregister_thread(&log->log_thread);
 
 	kmem_cache_destroy(log->io_kc);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9608a44..d1ddd31 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -417,7 +417,7 @@ static int release_stripe_list(struct r5conf *conf,
 	return count;
 }
 
-static void release_stripe(struct stripe_head *sh)
+void release_stripe(struct stripe_head *sh)
 {
 	struct r5conf *conf = sh->raid_conf;
 	unsigned long flags;
@@ -3101,6 +3101,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 		if (bi)
 			bitmap_end = 1;
 
+		r5l_stripe_write_finished(sh);
+
 		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
 			wake_up(&conf->wait_for_overlap);
 
@@ -3499,6 +3501,8 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 			WARN_ON(dev->page != dev->orig_page);
 		}
 
+	r5l_stripe_write_finished(sh);
+
 	if (!discard_pending &&
 	    test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
 		clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
@@ -5867,6 +5871,8 @@ static void raid5d(struct md_thread *thread)
 		set_bit(R5_DID_ALLOC, &conf->cache_state);
 	}
 
+	r5l_flush_stripe_to_raid(conf->log);
+
 	async_tx_issue_pending_all();
 	blk_finish_plug(&plug);
 
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index a8daf39..23cc9c3 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -611,8 +611,11 @@ static inline int algorithm_is_DDF(int layout)
 extern void md_raid5_kick_device(struct r5conf *conf);
 extern int raid5_set_cache_size(struct mddev *mddev, int size);
 extern sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
+extern void release_stripe(struct stripe_head *sh);
 extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
 extern void r5l_exit_log(struct r5l_log *log);
 extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
 extern void r5l_write_stripe_run(struct r5l_log *log);
+extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
+extern void r5l_stripe_write_finished(struct stripe_head *sh);
 #endif
-- 
1.8.5.6


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 5/9] raid5: log recovery
  2015-07-30  0:38 [PATCH 0/9]raid5: fix write hole Shaohua Li
                   ` (3 preceding siblings ...)
  2015-07-30  0:38 ` [PATCH 4/9] raid5: log reclaim support Shaohua Li
@ 2015-07-30  0:38 ` Shaohua Li
  2015-08-05  4:05   ` NeilBrown
  2015-07-30  0:38 ` [PATCH 6/9] raid5: disable batch with log enabled Shaohua Li
                   ` (3 subsequent siblings)
  8 siblings, 1 reply; 30+ messages in thread
From: Shaohua Li @ 2015-07-30  0:38 UTC (permalink / raw)
  To: linux-raid; +Cc: Kernel-team, songliubraving, hch, dan.j.williams, neilb

This is the log recovery support. The process is quite straightforward.
We scan the log and read all valid meta/data/parity into memory. If a
stripe's data/parity checksum is correct, the stripe will be recoveried.
Otherwise, it's discarded and we don't scan the log further. The reclaim
process guarantees stripe which starts to be flushed raid disks has
completed data/parity and has correct checksum. To recovery a stripe, we
just copy its data/parity to corresponding raid disks.

The trick thing is superblock update after recovery. we can't let
superblock point to last valid meta block. The log might look like:
| meta 1| meta 2| meta 3|
meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If superblock
points to meta 1, we write a new valid meta 2n.  If crash happens again,
new recovery will start from meta 1. Since meta 2n is valid, recovery
will think meta 3 is valid, which is wrong.  The solution is we create a
new meta in meta2 with its seq == meta 1's seq + 2 and let superblock
points to meta2.  recovery will not think meta 3 is a valid meta,
because its seq is wrong

TODO:
-recovery should run the stripe cache state machine in case of disk
breakage.

Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5-cache.c | 310 ++++++++++++++++++++++++++++++++++++++++++++++-
 drivers/md/raid5.c       |   4 +-
 drivers/md/raid5.h       |   6 +
 3 files changed, 315 insertions(+), 5 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index a418e45..17dab66 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -785,11 +785,315 @@ static void r5l_wake_reclaim(struct r5l_log *log, r5blk_t space)
 	md_wakeup_thread(log->reclaim_thread);
 }
 
+struct r5l_recovery_ctx {
+	struct page *meta_page;
+	unsigned int meta_total_blocks;
+	r5blk_t pos;
+	u64 seq;
+};
+
+static inline sector_t r5l_sector_to_stripe_sector(struct r5l_log *log,
+       sector_t sect)
+{
+	struct r5conf *conf = log->mddev->private;
+	int dd;
+	return raid5_compute_sector(conf, sect, 0, &dd, NULL);
+}
+
+static int r5l_read_meta_block(struct r5l_log *log,
+	struct r5l_recovery_ctx *ctx)
+{
+	struct r5conf *conf = log->mddev->private;
+	struct page *page = ctx->meta_page;
+	struct r5l_meta_block *mb;
+	u32 crc, stored_crc;
+	struct r5l_payload_header *header;
+	int next_type = -1;
+	int last_type = -1;
+	sector_t last_stripe_sector = 0;
+	int offset;
+
+	if (!sync_page_io(log->rdev, r5l_block_to_sector(log, ctx->pos),
+	    log->block_size, page, READ, false))
+		return -EIO;
+
+	mb = page_address(page);
+	stored_crc = le32_to_cpu(mb->checksum);
+	mb->checksum = 0;
+
+	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
+	    le64_to_cpu(mb->seq) != ctx->seq ||
+	    mb->version != R5LOG_VERSION ||
+	    le64_to_cpu(mb->position) != ctx->pos)
+		return -EINVAL;
+
+	crc = r5l_calculate_checksum(log, log->uuid_checksum,
+			mb, log->block_size);
+	if (stored_crc != crc)
+		return -EINVAL;
+
+	if (le32_to_cpu(mb->meta_size) > log->block_size)
+		return -EINVAL;
+
+	ctx->meta_total_blocks = 1;
+	offset = sizeof(struct r5l_meta_block);
+	while (offset < le32_to_cpu(mb->meta_size)) {
+		u16 type;
+		header = page_address(page) + offset;
+		type = le16_to_cpu(header->type);
+
+		if (next_type != -1 && type != next_type)
+			return -EINVAL;
+		if (type == R5LOG_PAYLOAD_DATA) {
+			struct r5l_payload_data_parity *payload;
+
+			payload = (struct r5l_payload_data_parity *)header;
+			if (le32_to_cpu(payload->blocks) != r5l_page_blocks(log, 1))
+				return -EINVAL;
+			if (last_type != -1) {
+				if (r5l_sector_to_stripe_sector(log,
+				    le64_to_cpu(payload->location)) !=
+				    last_stripe_sector)
+					return -EINVAL;
+			} else
+				last_stripe_sector =
+					r5l_sector_to_stripe_sector(log,
+						le64_to_cpu(payload->location));
+
+			ctx->meta_total_blocks += r5l_page_blocks(log, 1);
+			next_type = -1;
+			last_type = type;
+			offset += sizeof(struct r5l_payload_data_parity) +
+				sizeof(__le32);
+		} else if (type == R5LOG_PAYLOAD_PARITY) {
+			struct r5l_payload_data_parity *payload;
+
+			payload = (struct r5l_payload_data_parity *)header;
+			if (last_type == -1)
+				return -EINVAL;
+
+			if (le32_to_cpu(payload->blocks) !=
+			    r5l_page_blocks(log, conf->max_degraded))
+				return -EINVAL;
+			if (le64_to_cpu(payload->location) != last_stripe_sector)
+				return -EINVAL;
+
+			ctx->meta_total_blocks += r5l_page_blocks(log,
+				conf->max_degraded);
+			next_type = R5LOG_PAYLOAD_DATA;
+			last_type = -1;
+			offset += sizeof(struct r5l_payload_data_parity) +
+				sizeof(__le32) * conf->max_degraded;
+		} else
+			return -EINVAL;
+	}
+	if (offset > le32_to_cpu(mb->meta_size))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
+	struct r5l_recovery_ctx *ctx, sector_t stripe_sect,
+	int *offset, r5blk_t *log_offset)
+{
+	struct r5conf *conf = log->mddev->private;
+	struct stripe_head *dummy;
+	struct r5l_payload_data_parity *payload;
+	int disk_index;
+
+	dummy = get_active_stripe(conf, stripe_sect, 0, 0, 0);
+	while (1) {
+		payload = page_address(ctx->meta_page) + *offset;
+
+		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
+			raid5_compute_sector(conf,
+				le64_to_cpu(payload->location), 0,
+				&disk_index, dummy);
+
+			sync_page_io(log->rdev, r5l_block_to_sector(log,
+				*log_offset), PAGE_SIZE,
+				dummy->dev[disk_index].page, READ, false);
+			dummy->dev[disk_index].log_checksum =
+				le32_to_cpu(payload->checksum[0]);
+			set_bit(R5_Wantwrite, &dummy->dev[disk_index].flags);
+		} else {
+			disk_index = dummy->pd_idx;
+			sync_page_io(log->rdev, r5l_block_to_sector(log,
+				*log_offset), PAGE_SIZE,
+				dummy->dev[disk_index].page, READ, false);
+			dummy->dev[disk_index].log_checksum =
+				le32_to_cpu(payload->checksum[0]);
+			set_bit(R5_Wantwrite, &dummy->dev[disk_index].flags);
+
+			if (dummy->qd_idx >= 0) {
+				disk_index = dummy->qd_idx;
+				sync_page_io(log->rdev, r5l_block_to_sector(log,
+					r5l_ring_add(log, *log_offset,
+						r5l_page_blocks(log, 1))),
+					PAGE_SIZE,
+					dummy->dev[disk_index].page,
+					READ, false);
+				dummy->dev[disk_index].log_checksum =
+					le32_to_cpu(payload->checksum[1]);
+				set_bit(R5_Wantwrite,
+					&dummy->dev[disk_index].flags);
+			}
+		}
+
+		*log_offset = r5l_ring_add(log, *log_offset,
+			le32_to_cpu(payload->blocks));
+		*offset += sizeof(struct r5l_payload_data_parity) +
+			sizeof(__le32) * (le32_to_cpu(payload->blocks) >>
+			log->page_block_shift);
+		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
+			break;
+	}
+
+	for (disk_index = 0; disk_index < dummy->disks; disk_index++) {
+		void *addr;
+		u32 checksum;
+
+		if (!test_bit(R5_Wantwrite, &dummy->dev[disk_index].flags))
+			continue;
+		addr = kmap_atomic(dummy->dev[disk_index].page);
+		checksum = r5l_calculate_checksum(log,
+			log->uuid_checksum, addr, PAGE_SIZE);
+		kunmap_atomic(addr);
+		if (checksum != dummy->dev[disk_index].log_checksum)
+			goto error;
+	}
+
+	/* FIXME: let raid core to handle the stripe */
+	for (disk_index = 0; disk_index < dummy->disks; disk_index++) {
+		struct md_rdev *rdev, *rrdev;
+		if (!test_and_clear_bit(R5_Wantwrite,
+				&dummy->dev[disk_index].flags))
+			continue;
+
+		rdev = rcu_dereference(conf->disks[disk_index].rdev);
+		sync_page_io(rdev, stripe_sect, PAGE_SIZE,
+			dummy->dev[disk_index].page, WRITE, false);
+		rrdev = rcu_dereference(conf->disks[disk_index].replacement);
+		if (rrdev)
+			sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
+				dummy->dev[disk_index].page, WRITE, false);
+	}
+	release_stripe(dummy);
+	return 0;
+
+error:
+	for (disk_index = 0; disk_index < dummy->disks; disk_index++)
+		dummy->dev[disk_index].flags = 0;
+	release_stripe(dummy);
+	return -EINVAL;
+}
+
+static int r5l_recovery_flush_one_meta(struct r5l_log *log,
+	struct r5l_recovery_ctx *ctx)
+{
+	struct r5l_payload_data_parity *payload;
+	struct r5l_meta_block *mb;
+	int offset;
+	r5blk_t log_offset;
+	sector_t stripe_sector;
+
+	mb = page_address(ctx->meta_page);
+	offset = sizeof(struct r5l_meta_block);
+	log_offset = r5l_ring_add(log, ctx->pos, 1);
+
+	while (offset < le32_to_cpu(mb->meta_size)) {
+		payload = (void *)mb + offset;
+		stripe_sector = r5l_sector_to_stripe_sector(log,
+					le64_to_cpu(payload->location));
+		if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
+		    &offset, &log_offset))
+			return -EINVAL;
+	}
+	return 0;
+}
+
+/* copy data/parity from log to raid disks */
+static void r5l_recovery_flush_log(struct r5l_log *log,
+	struct r5l_recovery_ctx *ctx)
+{
+	while (1) {
+		if (r5l_read_meta_block(log, ctx))
+			return;
+		if (r5l_recovery_flush_one_meta(log, ctx))
+			return;
+		ctx->seq++;
+		ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
+	}
+}
+
+static int r5l_log_write_empty_meta_block(struct r5l_log *log, r5blk_t pos,
+	u64 seq)
+{
+	struct page *page;
+	struct r5l_meta_block *mb;
+	u32 crc;
+
+	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+	if (!page)
+		return -ENOMEM;
+	mb = page_address(page);
+	mb->magic = cpu_to_le32(R5LOG_MAGIC);
+	mb->version = R5LOG_VERSION;
+	mb->block_size = cpu_to_le16(log->block_size);
+	mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
+	mb->seq = cpu_to_le64(seq);
+	mb->position = cpu_to_le64(pos);
+	crc = r5l_calculate_checksum(log, log->uuid_checksum, mb,
+			log->block_size);
+	mb->checksum = cpu_to_le32(crc);
+
+	if (!sync_page_io(log->rdev, r5l_block_to_sector(log, pos),
+	    log->block_size, page, WRITE_FUA, false)) {
+		__free_page(page);
+		return -EIO;
+	}
+	__free_page(page);
+	return 0;
+}
+
 static int r5l_recovery_log(struct r5l_log *log)
 {
-	/* fake recovery */
-	log->seq = log->last_cp_seq + 1;
-	log->log_start = r5l_ring_add(log, log->last_checkpoint, 1);
+	struct r5l_recovery_ctx ctx;
+
+	ctx.pos = log->last_checkpoint;
+	ctx.seq = log->last_cp_seq;
+	ctx.meta_page = alloc_page(GFP_KERNEL);
+	if (!ctx.meta_page)
+		return -ENOMEM;
+
+	r5l_recovery_flush_log(log, &ctx);
+	__free_page(ctx.meta_page);
+
+	/*
+	 * we did a recovery. Now ctx.pos points to an invalid meta block. New
+	 * log will start here. but we can't let superblock point to last valid
+	 * meta block. The log might looks like:
+	 * | meta 1| meta 2| meta 3|
+	 * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If
+	 * superblock points to meta 1, we write a new valid meta 2n.  if crash
+	 * happens again, new recovery will start from meta 1. Since meta 2n is
+	 * valid now, recovery will think meta 3 is valid, which is wrong.
+	 * The solution is we create a new meta in meta2 with its seq == meta
+	 * 1's seq + 2 and let superblock points to meta2. The same recovery will
+	 * not think meta 3 is a valid meta, because its seq doesn't match
+	 */
+	if (ctx.seq > log->last_cp_seq + 1) {
+		int ret;
+		r5l_flush_all_disks(log);
+
+		ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 1);
+		if (ret)
+			return ret;
+		log->seq = ctx.seq + 2;
+		log->log_start = r5l_ring_add(log, ctx.pos, 1);
+		r5l_write_super(log, r5l_block_to_sector(log, ctx.pos));
+	}
 	return 0;
 }
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index d1ddd31..77af7f0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -662,7 +662,7 @@ static int has_failed(struct r5conf *conf)
 	return 0;
 }
 
-static struct stripe_head *
+struct stripe_head *
 get_active_stripe(struct r5conf *conf, sector_t sector,
 		  int previous, int noblock, int noquiesce)
 {
@@ -2527,7 +2527,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
  * Input: a 'big' sector number,
  * Output: index of the data and parity disk, and the sector # in them.
  */
-static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
+sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
 				     int previous, int *dd_idx,
 				     struct stripe_head *sh)
 {
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 23cc9c3..fd10d29 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -618,4 +618,10 @@ extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
 extern void r5l_write_stripe_run(struct r5l_log *log);
 extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
 extern void r5l_stripe_write_finished(struct stripe_head *sh);
+extern sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
+				     int previous, int *dd_idx,
+				     struct stripe_head *sh);
+extern struct stripe_head *
+get_active_stripe(struct r5conf *conf, sector_t sector,
+		  int previous, int noblock, int noquiesce);
 #endif
-- 
1.8.5.6


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 6/9] raid5: disable batch with log enabled
  2015-07-30  0:38 [PATCH 0/9]raid5: fix write hole Shaohua Li
                   ` (4 preceding siblings ...)
  2015-07-30  0:38 ` [PATCH 5/9] raid5: log recovery Shaohua Li
@ 2015-07-30  0:38 ` Shaohua Li
  2015-07-30  0:38 ` [PATCH 7/9] raid5: don't allow resize/reshape with cache(log) support Shaohua Li
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 30+ messages in thread
From: Shaohua Li @ 2015-07-30  0:38 UTC (permalink / raw)
  To: linux-raid; +Cc: Kernel-team, songliubraving, hch, dan.j.williams, neilb

With log enabled, r5l_write_stripe will add the stripe to log. With
batch, several stripes are linked together. The stripes must be in the
same state. While with log, the log/reclaim unit is stripe, we can't
guarantee the several stripes are in the same state. Disabling batch for
log now.

Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 77af7f0..59f9312 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -759,6 +759,9 @@ static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
 /* Only freshly new full stripe normal write stripe can be added to a batch list */
 static bool stripe_can_batch(struct stripe_head *sh)
 {
+	struct r5conf *conf = sh->raid_conf;
+	if (conf->log)
+		return false;
 	return test_bit(STRIPE_BATCH_READY, &sh->state) &&
 		!test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
 		is_full_stripe_write(sh);
-- 
1.8.5.6


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 7/9] raid5: don't allow resize/reshape with cache(log) support
  2015-07-30  0:38 [PATCH 0/9]raid5: fix write hole Shaohua Li
                   ` (5 preceding siblings ...)
  2015-07-30  0:38 ` [PATCH 6/9] raid5: disable batch with log enabled Shaohua Li
@ 2015-07-30  0:38 ` Shaohua Li
  2015-08-05  4:13   ` NeilBrown
  2015-07-30  0:38 ` [PATCH 8/9] raid5: enable log for raid array with cache disk Shaohua Li
  2015-07-30  0:38 ` [PATCH 9/9] raid5: skip resync if cache(log) is enabled Shaohua Li
  8 siblings, 1 reply; 30+ messages in thread
From: Shaohua Li @ 2015-07-30  0:38 UTC (permalink / raw)
  To: linux-raid; +Cc: Kernel-team, songliubraving, hch, dan.j.williams, neilb

If cache(log) support is enabled, don't allow resize/reshape in current
stage. In the future, we can flush all data from cache(log) to raid
before resize/reshape and then allow resize/reshape.

Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 59f9312..b694d06 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7184,6 +7184,10 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
 	 * worth it.
 	 */
 	sector_t newsize;
+	struct r5conf *conf = mddev->private;
+
+	if (conf->log)
+		return -EINVAL;
 	sectors &= ~((sector_t)mddev->chunk_sectors - 1);
 	newsize = raid5_size(mddev, sectors, mddev->raid_disks);
 	if (mddev->external_size &&
@@ -7235,6 +7239,8 @@ static int check_reshape(struct mddev *mddev)
 {
 	struct r5conf *conf = mddev->private;
 
+	if (conf->log)
+		return -EINVAL;
 	if (mddev->delta_disks == 0 &&
 	    mddev->new_layout == mddev->layout &&
 	    mddev->new_chunk_sectors == mddev->chunk_sectors)
-- 
1.8.5.6


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 8/9] raid5: enable log for raid array with cache disk
  2015-07-30  0:38 [PATCH 0/9]raid5: fix write hole Shaohua Li
                   ` (6 preceding siblings ...)
  2015-07-30  0:38 ` [PATCH 7/9] raid5: don't allow resize/reshape with cache(log) support Shaohua Li
@ 2015-07-30  0:38 ` Shaohua Li
  2015-07-30  0:38 ` [PATCH 9/9] raid5: skip resync if cache(log) is enabled Shaohua Li
  8 siblings, 0 replies; 30+ messages in thread
From: Shaohua Li @ 2015-07-30  0:38 UTC (permalink / raw)
  To: linux-raid; +Cc: Kernel-team, songliubraving, hch, dan.j.williams, neilb

Now log is safe to enable for raid array with cache disk

Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b694d06..26ea100 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6309,8 +6309,11 @@ static void raid5_free_percpu(struct r5conf *conf)
 
 static void free_conf(struct r5conf *conf)
 {
+	if (conf->log)
+		r5l_exit_log(conf->log);
 	if (conf->shrinker.seeks)
 		unregister_shrinker(&conf->shrinker);
+
 	free_thread_groups(conf);
 	shrink_stripes(conf);
 	raid5_free_percpu(conf);
@@ -6954,6 +6957,16 @@ static int run(struct mddev *mddev)
 						mddev->queue);
 	}
 
+	rdev_for_each(rdev, mddev) {
+		if (test_bit(WriteCache, &rdev->flags)) {
+			char b[BDEVNAME_SIZE];
+			printk(KERN_INFO"md/raid:%s: using device %s as cache\n",
+				mdname(mddev), bdevname(rdev->bdev, b));
+			r5l_init_log(conf, rdev);
+			break;
+		}
+	}
+
 	return 0;
 abort:
 	md_unregister_thread(&mddev->thread);
-- 
1.8.5.6


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 9/9] raid5: skip resync if cache(log) is enabled
  2015-07-30  0:38 [PATCH 0/9]raid5: fix write hole Shaohua Li
                   ` (7 preceding siblings ...)
  2015-07-30  0:38 ` [PATCH 8/9] raid5: enable log for raid array with cache disk Shaohua Li
@ 2015-07-30  0:38 ` Shaohua Li
  2015-08-05  4:16   ` NeilBrown
  8 siblings, 1 reply; 30+ messages in thread
From: Shaohua Li @ 2015-07-30  0:38 UTC (permalink / raw)
  To: linux-raid; +Cc: Kernel-team, songliubraving, hch, dan.j.williams, neilb

If cache(log) is enabled, the log structure will guarantee data
consistency, so skip resync for unclean shutdown

Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 26ea100..330550a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6967,6 +6967,15 @@ static int run(struct mddev *mddev)
 		}
 	}
 
+	if (conf->log) {
+		if (mddev->recovery_cp == 0) {
+			printk(KERN_NOTICE
+				"md/raid:%s: skip resync with caching enabled\n",
+				mdname(mddev));
+			mddev->recovery_cp = MaxSector;
+		}
+	}
+
 	return 0;
 abort:
 	md_unregister_thread(&mddev->thread);
-- 
1.8.5.6


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [PATCH 1/9] MD: add a new disk role to present cache device
  2015-07-30  0:38 ` [PATCH 1/9] MD: add a new disk role to present cache device Shaohua Li
@ 2015-08-04 14:28   ` Christoph Hellwig
  2015-08-04 18:17     ` Song Liu
  2015-08-05  1:05   ` NeilBrown
  1 sibling, 1 reply; 30+ messages in thread
From: Christoph Hellwig @ 2015-08-04 14:28 UTC (permalink / raw)
  To: Shaohua Li
  Cc: linux-raid, Kernel-team, songliubraving, hch, dan.j.williams, neilb

>  		case 0xfffe: /* faulty */
>  			set_bit(Faulty, &rdev->flags);
>  			break;
> +		case 0xfffd: /* cache device */

Any chance to get constants for these magic numbers as an additional
prep patch?

Also I don't really think that adding the role without the actual
implementation is that useful.


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 2/9] md: override md superblock recovery_offset for cache device
  2015-07-30  0:38 ` [PATCH 2/9] md: override md superblock recovery_offset for " Shaohua Li
@ 2015-08-04 14:30   ` Christoph Hellwig
  2015-08-05  1:08   ` NeilBrown
  1 sibling, 0 replies; 30+ messages in thread
From: Christoph Hellwig @ 2015-08-04 14:30 UTC (permalink / raw)
  To: Shaohua Li
  Cc: linux-raid, Kernel-team, songliubraving, hch, dan.j.williams, neilb

On Wed, Jul 29, 2015 at 05:38:42PM -0700, Shaohua Li wrote:
> Cache device stores data in a log structure. We need record the log
> start. Here we override md superblock recovery_offset for this purpose.
> This field of a cache device is meaningless otherwise.

Just reusing the field for a different purpose without a different
name or comment is highly confusing.

I'd suggest to create unions in both the on-disk and in-memory
structures and properly document that new usage for cache devices.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* RE: [PATCH 1/9] MD: add a new disk role to present cache device
  2015-08-04 14:28   ` Christoph Hellwig
@ 2015-08-04 18:17     ` Song Liu
  2015-08-05  0:25       ` NeilBrown
  0 siblings, 1 reply; 30+ messages in thread
From: Song Liu @ 2015-08-04 18:17 UTC (permalink / raw)
  To: Christoph Hellwig, Shaohua Li
  Cc: linux-raid, Kernel Team, dan.j.williams, neilb

> -----Original Message-----
> From: Christoph Hellwig [mailto:hch@infradead.org]
> Sent: Tuesday, August 4, 2015 7:28 AM
> To: Shaohua Li
> Cc: linux-raid@vger.kernel.org; Kernel Team; Song Liu; hch@infradead.org;
> dan.j.williams@intel.com; neilb@suse.de
> Subject: Re: [PATCH 1/9] MD: add a new disk role to present cache device
> 
> >  		case 0xfffe: /* faulty */
> >  			set_bit(Faulty, &rdev->flags);
> >  			break;
> > +		case 0xfffd: /* cache device */
> 
> Any chance to get constants for these magic numbers as an additional prep
> patch?
I will add patch for special roles (spare, faulty, cache, etc.). 


> 
> Also I don't really think that adding the role without the actual implementation
> is that useful.

Currently, we are planning to use 0xfffd for both "cache device" and 
"journal device" (fix write hole only). Would you prefer to separate these
two scenarios with two different roles (0xfffd and 0xfffc)?

Thanks,
Song


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 1/9] MD: add a new disk role to present cache device
  2015-08-04 18:17     ` Song Liu
@ 2015-08-05  0:25       ` NeilBrown
  0 siblings, 0 replies; 30+ messages in thread
From: NeilBrown @ 2015-08-05  0:25 UTC (permalink / raw)
  To: Song Liu
  Cc: Christoph Hellwig, Shaohua Li, linux-raid, Kernel Team, dan.j.williams

On Tue, 4 Aug 2015 18:17:25 +0000 Song Liu <songliubraving@fb.com>
wrote:

> > -----Original Message-----
> > From: Christoph Hellwig [mailto:hch@infradead.org]
> > Sent: Tuesday, August 4, 2015 7:28 AM
> > To: Shaohua Li
> > Cc: linux-raid@vger.kernel.org; Kernel Team; Song Liu; hch@infradead.org;
> > dan.j.williams@intel.com; neilb@suse.de
> > Subject: Re: [PATCH 1/9] MD: add a new disk role to present cache device
> > 
> > >  		case 0xfffe: /* faulty */
> > >  			set_bit(Faulty, &rdev->flags);
> > >  			break;
> > > +		case 0xfffd: /* cache device */
> > 
> > Any chance to get constants for these magic numbers as an additional prep
> > patch?
> I will add patch for special roles (spare, faulty, cache, etc.). 
> 
> 
> > 
> > Also I don't really think that adding the role without the actual implementation
> > is that useful.
> 
> Currently, we are planning to use 0xfffd for both "cache device" and 
> "journal device" (fix write hole only). Would you prefer to separate these
> two scenarios with two different roles (0xfffd and 0xfffc)?

No, definitely not.  It is a log device.  whether it is being used just
to close the write hole or more aggressively as a write-ahead cache to
reduce latency doesn't change the nature of the data on the device.
So just one new device type: "log" or "journal" or something describing
what is on the device.

Thanks,
NeilBrown

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 1/9] MD: add a new disk role to present cache device
  2015-07-30  0:38 ` [PATCH 1/9] MD: add a new disk role to present cache device Shaohua Li
  2015-08-04 14:28   ` Christoph Hellwig
@ 2015-08-05  1:05   ` NeilBrown
  1 sibling, 0 replies; 30+ messages in thread
From: NeilBrown @ 2015-08-05  1:05 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, Kernel-team, songliubraving, hch, dan.j.williams

On Wed, 29 Jul 2015 17:38:41 -0700 Shaohua Li <shli@fb.com> wrote:

> From: Song Liu <songliubraving@fb.com>
> 
> Next patches will use a disk as raid5/6 caching. We need a new disk role
> to present the cache device and add MD_FEATURE_WRITE_CACHE to
> feature_map for backward compability.
> 
> Signed-off-by: Song Liu <songliubraving@fb.com>
> Signed-off-by: Shaohua Li <shli@fb.com>
> ---
>  drivers/md/md.c                | 24 ++++++++++++++++++++++--
>  drivers/md/md.h                |  4 ++++
>  include/uapi/linux/raid/md_p.h |  3 +++
>  3 files changed, 29 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index d429c30..fd84f16 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -1656,6 +1656,16 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
>  		case 0xfffe: /* faulty */
>  			set_bit(Faulty, &rdev->flags);
>  			break;
> +		case 0xfffd: /* cache device */
> +			if (!(sb->feature_map & MD_FEATURE_WRITE_CACHE)) {
> +				/* cache device without cache feature */
> +				printk(KERN_WARNING
> +				  "md: cache device provided without write "
> +				  "cache feature, ignoring the device\n");
> +				return -EINVAL;
> +			}
> +			set_bit(WriteCache, &rdev->flags);
> +			break;
>  		default:
>  			rdev->saved_raid_disk = role;
>  			if ((le32_to_cpu(sb->feature_map) &
> @@ -1811,7 +1821,10 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
>  			sb->dev_roles[i] = cpu_to_le16(0xfffe);
>  		else if (test_bit(In_sync, &rdev2->flags))
>  			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
> -		else if (rdev2->raid_disk >= 0)
> +		else if (test_bit(WriteCache, &rdev2->flags)) {
> +			sb->dev_roles[i] = cpu_to_le16(0xfffd);
> +			sb->feature_map |= cpu_to_le32(MD_FEATURE_WRITE_CACHE);
> +		} else if (rdev2->raid_disk >= 0)
>  			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
>  		else
>  			sb->dev_roles[i] = cpu_to_le16(0xffff);
> @@ -5803,7 +5816,8 @@ static int get_disk_info(struct mddev *mddev, void __user * arg)
>  		else if (test_bit(In_sync, &rdev->flags)) {
>  			info.state |= (1<<MD_DISK_ACTIVE);
>  			info.state |= (1<<MD_DISK_SYNC);
> -		}
> +		} else if (test_bit(WriteCache, &rdev->flags))
> +			info.state |= (1<<MD_DISK_WRITECACHE);
>  		if (test_bit(WriteMostly, &rdev->flags))
>  			info.state |= (1<<MD_DISK_WRITEMOSTLY);
>  	} else {
> @@ -5918,6 +5932,8 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
>  		else
>  			clear_bit(WriteMostly, &rdev->flags);
>  
> +		if (info->state & (1<<MD_DISK_WRITECACHE))
> +			set_bit(WriteCache, &rdev->flags);
>  		/*
>  		 * check whether the device shows up in other nodes
>  		 */
> @@ -7286,6 +7302,10 @@ static int md_seq_show(struct seq_file *seq, void *v)
>  				seq_printf(seq, "(F)");
>  				continue;
>  			}
> +			if (test_bit(WriteCache, &rdev->flags)) {
> +				seq_printf(seq, "(C)");
> +				continue;
> +			}
>  			if (rdev->raid_disk < 0)
>  				seq_printf(seq, "(S)"); /* spare */
>  			if (test_bit(Replacement, &rdev->flags))
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index 7da6e9c..a9f27db 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -176,6 +176,10 @@ enum flag_bits {
>  				 * This device is seen locally but not
>  				 * by the whole cluster
>  				 */
> +	WriteCache,		/* This device is used as write cache.
> +				 * Usually, this device should be faster
> +				 * than other devices in the array
> +				 */
>  };
>  
>  #define BB_LEN_MASK	(0x00000000000001FFULL)
> diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
> index 2ae6131..8c8e12c 100644
> --- a/include/uapi/linux/raid/md_p.h
> +++ b/include/uapi/linux/raid/md_p.h
> @@ -89,6 +89,7 @@
>  				   * read requests will only be sent here in
>  				   * dire need
>  				   */
> +#define MD_DISK_WRITECACHE      18 /* disk is used as the write cache in RAID-5/6 */
>  
>  typedef struct mdp_device_descriptor_s {
>  	__u32 number;		/* 0 Device number in the entire set	      */
> @@ -302,6 +303,7 @@ struct mdp_superblock_1 {
>  #define	MD_FEATURE_RECOVERY_BITMAP	128 /* recovery that is happening
>  					     * is guided by bitmap.
>  					     */
> +#define	MD_FEATURE_WRITE_CACHE		256 /* support write cache */
>  #define	MD_FEATURE_ALL			(MD_FEATURE_BITMAP_OFFSET	\
>  					|MD_FEATURE_RECOVERY_OFFSET	\
>  					|MD_FEATURE_RESHAPE_ACTIVE	\
> @@ -310,6 +312,7 @@ struct mdp_superblock_1 {
>  					|MD_FEATURE_RESHAPE_BACKWARDS	\
>  					|MD_FEATURE_NEW_OFFSET		\
>  					|MD_FEATURE_RECOVERY_BITMAP	\
> +					|MD_FEATURE_WRITE_CACHE		\
>  					)
>  
>  #endif

That last line is technically a bit premature.  Once you add
MD_FEATURE_WRITE_CACHE to MD_FEATURE_ALL, super_1_load will start
accepting metadata with that bit set.  We should really leave that
until the code is really ready to deal with it.

Otherwise this patch is OK.

Thanks,
NeilBrown

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 2/9] md: override md superblock recovery_offset for cache device
  2015-07-30  0:38 ` [PATCH 2/9] md: override md superblock recovery_offset for " Shaohua Li
  2015-08-04 14:30   ` Christoph Hellwig
@ 2015-08-05  1:08   ` NeilBrown
  1 sibling, 0 replies; 30+ messages in thread
From: NeilBrown @ 2015-08-05  1:08 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, Kernel-team, songliubraving, hch, dan.j.williams

On Wed, 29 Jul 2015 17:38:42 -0700 Shaohua Li <shli@fb.com> wrote:

> Cache device stores data in a log structure. We need record the log
> start. Here we override md superblock recovery_offset for this purpose.
> This field of a cache device is meaningless otherwise.
> 
> Signed-off-by: Shaohua Li <shli@fb.com>
> ---
>  drivers/md/md.c | 4 ++++
>  1 file changed, 4 insertions(+)
> 
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index fd84f16..9861f34 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -1664,6 +1664,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
>  				  "cache feature, ignoring the device\n");
>  				return -EINVAL;
>  			}
> +			rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
>  			set_bit(WriteCache, &rdev->flags);
>  			break;
>  		default:
> @@ -1830,6 +1831,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
>  			sb->dev_roles[i] = cpu_to_le16(0xffff);
>  	}
>  
> +	if (test_bit(WriteCache, &rdev->flags))
> +		sb->recovery_offset = cpu_to_le64(rdev->recovery_offset);
> +

I would much rather this was a little earlier in the function,
near where recovery_offset is set  for non-InSync devices.
i.e. immediately after:

	if (rdev->raid_disk >= 0 &&
	    !test_bit(In_sync, &rdev->flags)) {
		sb->feature_map |=
			cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
		sb->recovery_offset =
			cpu_to_le64(rdev->recovery_offset);
		if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
			sb->feature_map |=
				cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
	}

Maybe we could always copy recovery_offset from rdev to sb, and then
just set whichever feature flags are relevant - if any.  Or maybe not.

Thanks,
NeilBrown


>  	sb->sb_csum = calc_sb_1_csum(sb);
>  }
>  


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 3/9] raid5: add basic stripe log
  2015-07-30  0:38 ` [PATCH 3/9] raid5: add basic stripe log Shaohua Li
@ 2015-08-05  3:07   ` NeilBrown
  2015-08-05 21:19     ` Shaohua Li
  0 siblings, 1 reply; 30+ messages in thread
From: NeilBrown @ 2015-08-05  3:07 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, Kernel-team, songliubraving, hch, dan.j.williams

On Wed, 29 Jul 2015 17:38:43 -0700 Shaohua Li <shli@fb.com> wrote:

> This introduces a simple log for raid5. Data/parity writting to raid
> array first writes to the log, then write to raid array disks. If crash
> happens, we can recovery data from the log. This can speed up raid
> resync and fix write hole issue.
> 
> The log structure is pretty simple. Data/meta data is stored in block
> unit, which is 4k generally. It has only one type of meta data block.
> The meta data block can track 3 types of data, stripe data, stripe
> parity and flush block. MD superblock will point to the last valid meta
> data block. Each meta data block has checksum/seq number, so recovery
> can scan the log correctly. We store a checksum of stripe data/parity to
> the metadata block, so meta data and stripe data/parity can be written
> to log disk together. otherwise, meta data write must wait till stripe
> data/parity is finished.
> 
> For stripe data, meta data block will record stripe data sector and
> size. Currently the size is always 4k. This meta data record can be made
> simpler if we just fix write hole (eg, we can record data of a stripe's
> different disks together), but this format can be extended to support
> caching in the future, which must record data address/size.
> 
> For stripe parity, meta data block will record stripe sector. It's size
> should be 4k (for raid5) or 8k (for raid6). We always store p parity
> first. This format should work for caching too.

It feels a bit odd have a 8K parity block for RAID6 as it is really two
blocks: a parity block and a Q-syndrome block.  What would you think of
introducing another block type for Q?  So there are Data blocks, Parity
blocks, and Q blocks ???

Not a big issue, but I thought I would mention it.

> 
> flush block indicates a stripe is in raid array disks. Fixing write hole
> doesn't need this type of meta data, it's for caching extention.
> 
> We should be careful about deadlock. Appending stripe data/parity to log
> is done in raid5d. The append need log space, which can trigger log
> reclaim, which might wait for raid5d to run (to flush data from log to
> raid array disks). Here we always do the log write in a separate thread.

I'm not convinced about the need for a separate thread.  As
raid5d/handle_stripe works as a state machine, and as all IO is async,
we should at most need an extra state, not an extra thread.

I think a key issue is that you don't call r5l_get_reserve() until you
are about to submit writes to the log.
If instead you reserved the space in r5l_write_stripe and delay the
stripe if space is not available, there there would be no deadlock.
Then when available space crosses some threshold, re-activate those
delayed stripes.


> 
> Signed-off-by: Shaohua Li <shli@fb.com>
> ---
>  drivers/md/Makefile            |   2 +-
>  drivers/md/raid5-cache.c       | 677 +++++++++++++++++++++++++++++++++++++++++
>  drivers/md/raid5.c             |   8 +-
>  drivers/md/raid5.h             |  10 +
>  include/uapi/linux/raid/md_p.h |  48 +++
>  5 files changed, 741 insertions(+), 4 deletions(-)
>  create mode 100644 drivers/md/raid5-cache.c
> 
> diff --git a/drivers/md/Makefile b/drivers/md/Makefile
> index 462f443..f34979c 100644
> --- a/drivers/md/Makefile
> +++ b/drivers/md/Makefile
> @@ -17,7 +17,7 @@ dm-cache-smq-y   += dm-cache-policy-smq.o
>  dm-cache-cleaner-y += dm-cache-policy-cleaner.o
>  dm-era-y	+= dm-era-target.o
>  md-mod-y	+= md.o bitmap.o
> -raid456-y	+= raid5.o
> +raid456-y	+= raid5.o raid5-cache.o
>  
>  # Note: link order is important.  All raid personalities
>  # and must come before md.o, as they each initialise 
> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> new file mode 100644
> index 0000000..5c9bab6
> --- /dev/null
> +++ b/drivers/md/raid5-cache.c
> @@ -0,0 +1,677 @@
> +/*
> + * Copyright (C) 2015 Shaohua Li <shli@fb.com>
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along with
> + * this program; if not, write to the Free Software Foundation, Inc.,
> + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
> + */
> +#include <linux/kernel.h>
> +#include <linux/wait.h>
> +#include <linux/blkdev.h>
> +#include <linux/slab.h>
> +#include <linux/raid/md_p.h>
> +#include <linux/crc32.h>
> +#include <linux/random.h>
> +#include "md.h"
> +#include "raid5.h"
> +
> +typedef u64 r5blk_t; /* log blocks, 512B - 4k */

I would much rather use sector_t throughout and keep all addresses as
sector addresses.  Much less room for confusion that way.

If we wanted to pack lots of addresses into limited space then using
block addresses might be justified (filesystems do that), but I don't
think it is called for here at all.

> +
> +struct r5l_log {
> +	struct mddev *mddev;
> +	struct md_rdev *rdev;

As the rdev contains a pointer to the mddev, storing the mddev as well
isn't strictly necessary - though it doesn't really hurt.
You don't seem to use it very much....


> +
> +	u32 uuid_checksum;
> +
> +	unsigned int block_size; /* bytes */
> +	unsigned int block_sector_shift;
> +	unsigned int page_block_shift; /* page to block */

These last two probably aren't needed if everything is in sectors.


> +
> +	r5blk_t total_blocks;
> +	r5blk_t first_block;
> +	r5blk_t last_block;
> +
> +	r5blk_t last_checkpoint; /* log tail */
> +	u64 last_cp_seq; /* log tail sequence */
> +
> +	u64 seq; /* current sequence */
> +	r5blk_t log_start; /* current log position */

How is this different from first_block?  Or maybe it is last_block.
It is good that you have some comments here, but a few more wouldn't
hurt.
Also it sometimes helps to say how something is used as well as what
it is.
e.g. is the "current sequence" number the sequence number of the last
metadata block written, or of the next metadata block to be written?


> +
> +	r5blk_t reserved_blocks;
> +	wait_queue_head_t space_waitq;
> +
> +	struct mutex io_mutex;
> +	struct r5l_io_unit *current_io;
> +
> +	spinlock_t io_list_lock;
> +	struct list_head running_ios; /* running io_unit list */

A few more words to make it clear what "running" means would be good.
An io_unit becomes "running" when .... and ceases to be "running"
when .......

> +
> +	struct kmem_cache *io_kc;
> +
> +	struct md_thread *log_thread;
> +	struct list_head log_stripes;
> +	spinlock_t log_stripes_lock;
> +};
> +
> +/*
> + * an IO range starts from a meta data block and end at the next meta data
> + * block. The io unit's the meta data block tracks data/parity followed it. io
> + * unit is written to log disk with normal write, as we always flush log disk
> + * first and then start move data to raid disks, there is no requirement to
> + * write io unit with FLUSH/FUA
> + * */
> +struct r5l_io_unit {
> +	struct r5l_log *log;
> +
> +	struct page *meta_page; /* store meta block */
> +	int meta_offset; /* current offset in meta_page */
> +
> +	struct bio_list bios;
> +	struct bio *current_bio; /* current_bio accepting pages */
> +	atomic_t pending_io; /* pending bios not writting to log */

"now writing" ??  (I often type 'now' and 'not' :-)

Actually it is often "1 more than the number of bios queued but which
have not yet completed".
In this case there is no need for that "one more than" as we first
queue all the bios, then we submit them.
You only need the "one more than" thing if you might keep incrementing
the counter after you start submitting bios.


> +
> +	atomic_t pending_stripe; /* how many stripes not flushed to raid */
> +	u64 seq;
> +	r5blk_t log_start; /* where the io_unit starts */
> +	r5blk_t log_end; /* where the io_unit ends */
> +	struct list_head log_sibling; /* log->running_ios */
> +	struct list_head stripe_list; /* stripes added to the io_unit */
> +	int state;
> +	wait_queue_head_t wait_state;
> +};
> +
> +/* r5l_io_unit state */
> +enum {
> +	IO_UNIT_RUNNING = 0, /* accepting new IO */
> +	IO_UNIT_IO_START = 1, /* io_unit is writting to log */
> +	IO_UNIT_IO_END = 2, /* io_unit is in log */
> +	IO_UNIT_STRIPE_START = 3, /* stripes of io_unit are running */
> +	IO_UNIT_STRIPE_END = 4, /* stripes data are in raid disks */
> +};

0, 1, and 3 tell me what is currently happening to the stripe.
4 tells me that nothing is happening any more.
What does '2' tell me exactly?  What is happening when state is 2?
Is that state needed?


> +
> +#define PAGE_SECTOR_SHIFT (PAGE_SHIFT - 9)
> +
> +static inline struct block_device *r5l_bdev(struct r5l_log *log)
> +{
> +	return log->rdev->bdev;
> +}
> +
> +static inline sector_t r5l_block_to_sector(struct r5l_log *log, r5blk_t block)
> +{
> +	return block << log->block_sector_shift;
> +}
> +
> +static inline r5blk_t r5l_sector_to_block(struct r5l_log *log, sector_t s)
> +{
> +	return s >> log->block_sector_shift;
> +}
> +
> +static inline int r5l_page_blocks(struct r5l_log *log, int pages)
> +{
> +	return pages << log->page_block_shift;
> +}
> +
> +static u32 r5l_calculate_checksum(struct r5l_log *log, u32 crc,
> +	void *buf, size_t size)
> +{
> +	return crc32_le(crc, buf, size);
> +}
> +

Tiny little inlines and defines like this worry me.
Do they really add useful abstractions, or do they just hide important
detail?
Putting this sort of thing in a header file and making it part of an
interface is quite different from putting them in a .c file and making
the code harder to read (because I keep having to check exactly what
the function does).

If r5l_calculate_checksum copied out the old checksum, stored a zero
there, performed the calculation, then put the old checksum back, then
that might be a useful separate function.  But as it is there seems no
point.

PAGE_SECTOR_SHIFT is more typing than PAGE_SHIFT - 9 and isn't any more
clear.  And you only use it once where I'm not at all sure that you
should.


> +static r5blk_t r5l_ring_add(struct r5l_log *log, r5blk_t block, int inc)
> +{
> +	block += inc;
> +	if (block >= log->last_block)
> +		block = block - log->total_blocks;
> +	return block;
> +}
> +
> +static void r5l_wake_reclaim(struct r5l_log *log, r5blk_t space);
> +static r5blk_t r5l_free_space(struct r5l_log *log)
> +{
> +	r5blk_t used_size;
> +
> +	if (log->log_start >= log->last_checkpoint)
> +		used_size = log->log_start - log->last_checkpoint;
> +	else
> +		used_size = log->log_start + log->total_blocks -
> +			log->last_checkpoint;
> +
> +	if (log->total_blocks > used_size + log->reserved_blocks)
> +		return log->total_blocks - used_size - log->reserved_blocks;
> +	return 0;

Maybe I'm getting confused by terminology, but this looks wrong.
I would think that "last_checkpoint" was something written fairly
recently, and "log_start" was something written a long time ago.
So if the log hasn't wrapped, then the "used_size" would be
   last_checkpoint - log_start
while it if had wrapped, it would be

   last_checkpoint + total_blocks - log_start

???


> +}
> +
> +/* Make sure we have enough free space in log device */
> +static void r5l_get_reserve(struct r5l_log *log, unsigned int size)
> +{
> +	r5blk_t free;
> +
> +	BUG_ON(!mutex_is_locked(&log->io_mutex));
> +	free = r5l_free_space(log);
> +	if (free >= size) {
> +		log->reserved_blocks += size;
> +		return;
> +	}
> +
> +	log->reserved_blocks += size;
> +	mutex_unlock(&log->io_mutex);
> +
> +	r5l_wake_reclaim(log, size);
> +
> +	mutex_lock(&log->io_mutex);
> +	wait_event_cmd(log->space_waitq, r5l_free_space(log) > 0,
> +		mutex_unlock(&log->io_mutex), mutex_lock(&log->io_mutex));
> +}

As I've already said I don't think that blocking when out of space is a
good idea, but couldn't this be much simpler?

 log->reserved_blocks += size;
 wait_event_cmd(log->space_waitq, r5l_free_space(log) > 0,
                mutex_unlock(&log->io_mutex); 
                   r5l_wake_reclaim(log, size),
                mutex_lock(&log->io_mutex));

??

> +
> +static void r5l_put_reserve(struct r5l_log *log, unsigned int reserved_blocks)
> +{
> +	BUG_ON(!mutex_is_locked(&log->io_mutex));
> +
> +	log->reserved_blocks -= reserved_blocks;
> +	if (r5l_free_space(log) > 0)
> +		wake_up(&log->space_waitq);
> +}
> +
> +static struct r5l_io_unit *r5l_alloc_io_unit(struct r5l_log *log)
> +{
> +	struct r5l_io_unit *io;
> +	gfp_t gfp = GFP_NOIO | __GFP_NOFAIL;
> +
> +	io = kmem_cache_zalloc(log->io_kc, gfp);
> +	if (!io)
> +		return NULL;
> +	io->log = log;
> +	io->meta_page = alloc_page(gfp | __GFP_ZERO);
> +	if (!io->meta_page) {
> +		kmem_cache_free(log->io_kc, io);
> +		return NULL;
> +	}

This can return NULL, but the one place where you call it you do not
check for NULL.
Maybe a mempool would be appropriate, maybe something else - I haven't
examine the issue closely.


> +
> +	bio_list_init(&io->bios);
> +	atomic_set(&io->pending_io, 1);
> +	INIT_LIST_HEAD(&io->log_sibling);
> +	INIT_LIST_HEAD(&io->stripe_list);
> +	io->state = IO_UNIT_RUNNING;
> +	init_waitqueue_head(&io->wait_state);
> +	return io;
> +}
> +
> +static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io)
> +{
> +	__free_page(io->meta_page);
> +	kmem_cache_free(log->io_kc, io);
> +}
> +
> +static void r5l_wait_io_unit_state(struct r5l_io_unit *io, int state)
> +{
> +	wait_event(io->wait_state, io->state >= state);
> +}

This is another of those tiny inlines which I think just make the code
harder to read.


> +
> +static void r5l_set_io_unit_state(struct r5l_io_unit *io, int state)
> +{
> +	if (io->state >= state)
> +		return;

Should this ever actually 'return'?
    if (WARN_ON(..)) return; ???
Not sure - I haven't examined closely.


> +	io->state = state;
> +	wake_up(&io->wait_state);
> +}
> +
> +static void r5l_submit_bio(struct r5l_log *log, int rw, struct bio *bio)
> +{
> +	/* all IO must start from rdev->data_offset */
> +	bio->bi_iter.bi_sector += log->rdev->data_offset;
> +	submit_bio(rw, bio);
> +}

This might almost be worthy of its own function if it was called more
than once.  But as it is...


> +
> +static void r5l_io_unit_ioend(struct r5l_io_unit *io, int error)
> +{
> +	struct r5l_log *log = io->log;
> +	if (!atomic_dec_and_test(&io->pending_io))
> +		return;
> +
> +	r5l_set_io_unit_state(io, IO_UNIT_IO_END);
> +	md_wakeup_thread(log->mddev->thread);
> +}
> +
> +static void r5l_log_endio(struct bio *bio, int error)
> +{
> +	struct r5l_io_unit *io = bio->bi_private;
> +
> +	bio_put(bio);
> +	r5l_io_unit_ioend(io, error);
> +}
> +
> +static void r5l_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
> +{
> +	struct bio *bio;
> +	while ((bio = bio_list_pop(&io->bios)))
> +		r5l_submit_bio(log, WRITE, bio);
> +
> +	r5l_io_unit_ioend(io, 0);
> +}
> +
> +static void r5l_submit_current_io(struct r5l_log *log)
> +{
> +	struct r5l_io_unit *io = log->current_io;
> +	struct r5l_meta_block *block;
> +	u32 crc;
> +
> +	if (!io)
> +		return;
> +
> +	block = page_address(io->meta_page);
> +	block->meta_size = cpu_to_le32(io->meta_offset);
> +	crc = r5l_calculate_checksum(log, log->uuid_checksum,
> +		block, log->block_size);
> +	block->checksum = cpu_to_le32(crc);
> +
> +	log->current_io = NULL;
> +	r5l_set_io_unit_state(io, IO_UNIT_IO_START);
> +
> +	r5l_submit_io(log, io);
> +}
> +
> +static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
> +{
> +	struct r5l_io_unit *io;
> +	struct r5l_meta_block *block;
> +	struct bio *bio;
> +
> +	io = r5l_alloc_io_unit(log);
> +
> +	block = page_address(io->meta_page);
> +	block->magic = cpu_to_le32(R5LOG_MAGIC);
> +	block->version = R5LOG_VERSION;
> +	block->block_size = cpu_to_le16(log->block_size);
> +	block->seq = cpu_to_le64(log->seq);
> +	block->position = cpu_to_le64(log->log_start);
> +
> +	io->log_start = log->log_start;
> +	io->meta_offset = sizeof(struct r5l_meta_block);
> +	io->seq = log->seq;
> +
> +	bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL,
> +		bio_get_nr_vecs(r5l_bdev(log)));

bio_alloc uses a mempool so __GFP_NOFAIL is not relevant.
Maybe this should use the mddev's mempool?

> +	io->current_bio = bio;
> +	bio->bi_rw = WRITE;
> +	bio->bi_bdev = r5l_bdev(log);
> +	bio->bi_iter.bi_sector = r5l_block_to_sector(log, log->log_start);
> +	bio_add_page(bio, io->meta_page, log->block_size, 0);
> +	bio->bi_end_io = r5l_log_endio;
> +	bio->bi_private = io;
> +
> +	bio_list_add(&io->bios, bio);
> +	atomic_inc(&io->pending_io);
> +
> +	log->seq++;
> +	log->log_start = r5l_ring_add(log, log->log_start, 1);
> +	io->log_end = log->log_start;
> +	/* current bio hit disk end */
> +	if (log->log_start == log->first_block)
> +		io->current_bio = NULL;
> +
> +	spin_lock(&log->io_list_lock);
> +	list_add_tail(&io->log_sibling, &log->running_ios);
> +	spin_unlock(&log->io_list_lock);
> +
> +	return io;
> +}
> +
> +static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
> +{
> +	struct r5l_io_unit *io;
> +
> +	io = log->current_io;
> +	if (io && io->meta_offset + payload_size > log->block_size)
> +		r5l_submit_current_io(log);
> +	io = log->current_io;
> +	if (io)
> +		return 0;
> +
> +	log->current_io = r5l_new_meta(log);
> +	return 0;
> +}
> +
> +static void r5l_log_pages(struct r5l_log *log, u16 type, sector_t location,
> +	struct page *page1, u32 checksum1,
> +	struct page *page2, u32 checksum2)
> +{
> +	struct r5l_io_unit *io = log->current_io;
> +	struct r5l_payload_data_parity *payload;
> +
> +	payload = page_address(io->meta_page) + io->meta_offset;
> +	payload->header.type = cpu_to_le16(type);
> +	payload->header.flags = cpu_to_le16(0);
> +	payload->blocks = cpu_to_le32(r5l_page_blocks(log, 1 + !!page2));
> +	payload->location = cpu_to_le64(location);
> +	payload->checksum[0] = cpu_to_le32(checksum1);
> +	if (page2)
> +		payload->checksum[1] = cpu_to_le32(checksum2);
> +
> +alloc_bio:
> +	if (!io->current_bio) {
> +		struct bio *bio;
> +		bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL,
> +			bio_get_nr_vecs(r5l_bdev(log)));

ditto - __GFP_NOFAIL not appropriate.


> +		bio->bi_rw = WRITE;
> +		bio->bi_bdev = r5l_bdev(log);
> +		bio->bi_iter.bi_sector = r5l_block_to_sector(log, log->log_start);
> +		bio->bi_end_io = r5l_log_endio;
> +		bio->bi_private = io;
> +		bio_list_add(&io->bios, bio);
> +		atomic_inc(&io->pending_io);
> +		io->current_bio = bio;
> +	}
> +	if (page1) {
> +		if (!bio_add_page(io->current_bio, page1, PAGE_SIZE, 0)) {
> +			io->current_bio = NULL;
> +			goto alloc_bio;
> +		}
> +		log->log_start = r5l_ring_add(log, log->log_start,
> +			r5l_page_blocks(log, 1));
> +		/* current bio hit disk end */
> +		if (log->log_start == log->first_block)
> +			io->current_bio = NULL;
> +	}
> +
> +	page1 = NULL;
> +	if (page2) {
> +		if (io->current_bio == NULL)
> +			goto alloc_bio;
> +		if (!bio_add_page(io->current_bio, page2, PAGE_SIZE, 0)) {
> +			io->current_bio = NULL;
> +			goto alloc_bio;
> +		}
> +		log->log_start = r5l_ring_add(log, log->log_start,
> +			r5l_page_blocks(log, 1));
> +		/* current bio hit disk end */
> +		if (log->log_start == log->first_block)
> +			io->current_bio = NULL;
> +	}
> +
> +	io->meta_offset += sizeof(struct r5l_payload_data_parity) +
> +		sizeof(__le32) * (1 + !!page2);
> +	io->log_end = log->log_start;
> +}

Allowing either 1 or 2 pages to be passed to this function makes it
more complex than I would like.
It would be much nicer if you could just call r5l_log_pages() twice for
the P and Q blocks.


> +
> +static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh)
> +{
> +	int i;
> +	int meta_size;
> +	int write_disks = 0;
> +	int data_pages, parity_pages;
> +	struct r5l_io_unit *io;
> +	int reserve;
> +
> +	for (i = 0; i < sh->disks; i++) {
> +		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
> +			continue;
> +		write_disks++;
> +	}
> +	parity_pages = 1 + !!(sh->qd_idx >= 0);
> +	data_pages = write_disks - parity_pages;
> +
> +	meta_size = (sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) *
> +		data_pages + sizeof(struct r5l_payload_data_parity) +
> +		sizeof(__le32) * parity_pages;

In quite a lot of places your indentation is not correct.
When you have any sort of bracket/brace/parenthesis, and
the opening bracket is not at the end of the line, then everything
within the brackets must be immediately to the right of the opening
bracket.

I'm happy to fix most of this up when I ultimately accept that patch
(emacs makes it easy).  However this line is just wrong.  A line break
must be at a low-precedence point in the line.  So e.g. break after '+'
rather than after '*' etc.
This should be more like:

meta_size = (sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
	     * data_pages +
	     sizeof(struct r5l_payload_data_parity) +
	     sizeof(__le32) * parity_pages;

which suddenly I can actually read and understand....
Of course if you didn't combine the two P+Q blocks as suggested
earlier, this would become even simpler.


> +
> +	/* meta + data */
> +	reserve = 1 + r5l_page_blocks(log, write_disks);
> +	r5l_get_reserve(log, reserve);
> +
> +	r5l_get_meta(log, meta_size);
> +	io = log->current_io;
> +
> +	for (i = 0; i < sh->disks; i++) {
> +		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
> +			continue;
> +		if (i == sh->pd_idx || i == sh->qd_idx)
> +			continue;
> +		r5l_log_pages(log, R5LOG_PAYLOAD_DATA,
> +			compute_blocknr(sh, i, 0),
> +			sh->dev[i].page, sh->dev[i].log_checksum,
> +			NULL, 0);
> +	}
> +	r5l_log_pages(log, R5LOG_PAYLOAD_PARITY,
> +		sh->sector, sh->dev[sh->pd_idx].page,
> +		sh->dev[sh->pd_idx].log_checksum,
> +		sh->qd_idx >= 0 ? sh->dev[sh->qd_idx].page : NULL,
> +		sh->qd_idx >= 0 ? sh->dev[sh->qd_idx].log_checksum : 0);
> +
> +	list_add_tail(&sh->log_list, &io->stripe_list);
> +	atomic_inc(&io->pending_stripe);
> +	sh->log_io = io;
> +
> +	r5l_put_reserve(log, reserve);
> +}
> +
> +static void r5l_log_thread(struct md_thread *thread)
> +{
> +	struct mddev *mddev = thread->mddev;
> +	struct r5conf *conf = mddev->private;
> +	struct r5l_log *log = conf->log;
> +	struct stripe_head *sh;
> +	LIST_HEAD(list);
> +	struct blk_plug plug;
> +
> +	if (!log)
> +		return;
> +
> +	spin_lock(&log->log_stripes_lock);
> +	list_splice_init(&log->log_stripes, &list);
> +	spin_unlock(&log->log_stripes_lock);
> +
> +	if (list_empty(&list))
> +		return;
> +	mutex_lock(&log->io_mutex);
> +	blk_start_plug(&plug);
> +	while (!list_empty(&list)) {
> +		sh = list_first_entry(&list, struct stripe_head, log_list);
> +		list_del_init(&sh->log_list);
> +		r5l_log_stripe(log, sh);
> +	}
> +	r5l_submit_current_io(log);
> +	blk_finish_plug(&plug);
> +	mutex_unlock(&log->io_mutex);
> +}
> +
> +/*
> + * running in raid5d, where reclaim could wait for raid5d too (when it flushes
> + * data from log to raid disks), so we shouldn't wait for reclaim here
> + * */
> +int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
> +{
> +	int write_disks = 0;
> +	int data_pages, parity_pages;
> +	int meta_size;
> +	int i;
> +
> +	if (!log)
> +		return -EAGAIN;
> +	/* Don't support stripe batch */
> +	if (sh->log_io ||!test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
> +	    test_bit(STRIPE_SYNCING, &sh->state))
> +		return -EAGAIN;
> +
> +	for (i = 0; i < sh->disks; i++) {
> +		void *addr;
> +		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
> +			continue;
> +		write_disks++;
> +		addr = kmap_atomic(sh->dev[i].page);
> +		sh->dev[i].log_checksum = r5l_calculate_checksum(log,
> +			log->uuid_checksum, addr, PAGE_SIZE);
> +		kunmap_atomic(addr);
> +	}
> +	parity_pages = 1 + !!(sh->qd_idx >= 0);
> +	data_pages = write_disks - parity_pages;
> +
> +	meta_size = (sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) *
> +		data_pages + sizeof(struct r5l_payload_data_parity) +
> +		sizeof(__le32) * parity_pages;
> +	/* Doesn't work with very big raid array */
> +	if (meta_size + sizeof(struct r5l_meta_block) >
> +			log->block_size)
> +		return -EINVAL;
> +
> +	atomic_inc(&sh->count);
> +
> +	spin_lock(&log->log_stripes_lock);
> +	list_add_tail(&sh->log_list, &log->log_stripes);
> +	spin_unlock(&log->log_stripes_lock);
> +	return 0;
> +}
> +
> +void r5l_write_stripe_run(struct r5l_log *log)
> +{
> +	if (!log)
> +		return;
> +	md_wakeup_thread(log->log_thread);
> +}

if (log)
   md_wakeup_thread(log->log_thread);
??
Does it need a separate function?

> +
> +static void r5l_wake_reclaim(struct r5l_log *log, r5blk_t space)
> +{
> +}
> +

A comment that this will be fleshed out it subsequent patch wouldn't
hurt, but isn't entirely necessary.


> +static int r5l_recovery_log(struct r5l_log *log)
> +{
> +	/* fake recovery */
> +	log->seq = log->last_cp_seq + 1;
> +	log->log_start = r5l_ring_add(log, log->last_checkpoint, 1);
> +	return 0;
> +}
> +
> +static void r5l_write_super(struct r5l_log *log, sector_t cp)
> +{
> +	log->rdev->recovery_offset = cp;
> +	md_update_sb(log->mddev, 1);
> +}

This is only called from run() when the log is first initialised.  At
this point there is nothing useful in the log, so recording it's
location is pointless.  At most you  could set the MD_SB_DIRTY flag (or
whatever it is).
So it really doesn't need to be a separate function.



> +
> +static int r5l_load_log(struct r5l_log *log)
> +{
> +	struct md_rdev *rdev = log->rdev;
> +	struct page *page;
> +	struct r5l_meta_block *mb;
> +	sector_t cp = log->rdev->recovery_offset;
> +	u32 stored_crc, expected_crc;
> +	bool create_super = false;
> +	int ret;
> +
> +	/* Make sure it's valid */
> +	if (cp >= rdev->sectors)
> +		cp = 0;
> +	page = alloc_page(GFP_KERNEL);
> +	if (!page)
> +		return -ENOMEM;
> +
> +	if (!sync_page_io(rdev, cp, PAGE_SIZE, page, READ, false)) {
> +		ret = -EIO;
> +		goto ioerr;
> +	}
> +	mb = page_address(page);
> +
> +	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
> +		mb->version != R5LOG_VERSION ||
> +		le16_to_cpu(mb->block_size) > PAGE_SIZE) {
> +		create_super = true;
> +		goto create;
> +	}
> +	stored_crc = le32_to_cpu(mb->checksum);
> +	mb->checksum = 0;
> +	expected_crc = r5l_calculate_checksum(log, log->uuid_checksum,
> +		mb, le16_to_cpu(mb->block_size));
> +	if (stored_crc != expected_crc) {
> +		create_super = true;
> +		goto create;
> +	}
> +	if (le64_to_cpu(mb->position) * (le16_to_cpu(mb->block_size) >> 9) !=
> +		cp) {
> +		create_super = true;
> +		goto create;
> +	}
> +create:
> +	if (create_super) {
> +		log->block_size = PAGE_SIZE;
> +		log->last_cp_seq = prandom_u32();
> +		cp = (cp >> PAGE_SECTOR_SHIFT) << PAGE_SECTOR_SHIFT;

This isn't the way we normally round down - we have the "round_down()"
macro for that.

But if you are creating a new log, why not just set 'cp' to zero??


> +		/* Make sure super points to correct address */
> +		r5l_write_super(log, cp);
> +	} else {
> +		log->block_size = le16_to_cpu(mb->block_size);
> +		log->last_cp_seq = le64_to_cpu(mb->seq);
> +	}
> +	log->block_sector_shift = ilog2(log->block_size >> 9);
> +	log->page_block_shift = PAGE_SHIFT - ilog2(log->block_size);
> +
> +	log->first_block = 0;
> +	log->total_blocks = r5l_sector_to_block(log, rdev->sectors);
> +	log->last_block = log->first_block + log->total_blocks;
> +	log->last_checkpoint = r5l_sector_to_block(log, cp);
> +
> +	__free_page(page);
> +
> +	return r5l_recovery_log(log);
> +ioerr:
> +	__free_page(page);
> +	return ret;
> +}
> +
> +int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
> +{
> +	struct r5l_log *log;
> +
> +	log = kzalloc(sizeof(*log), GFP_KERNEL);
> +	if (!log)
> +		return -ENOMEM;
> +	log->mddev = rdev->mddev;
> +	log->rdev = rdev;
> +
> +	log->uuid_checksum = r5l_calculate_checksum(log, ~0, rdev->mddev->uuid,
> +		sizeof(rdev->mddev->uuid));
> +
> +	init_waitqueue_head(&log->space_waitq);
> +	mutex_init(&log->io_mutex);
> +
> +	spin_lock_init(&log->io_list_lock);
> +	INIT_LIST_HEAD(&log->running_ios);
> +
> +	log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
> +	if (!log->io_kc)
> +		goto io_kc;
> +
> +	INIT_LIST_HEAD(&log->log_stripes);
> +	spin_lock_init(&log->log_stripes_lock);
> +	log->log_thread = md_register_thread(r5l_log_thread,
> +		log->mddev, "log");
> +	if (!log->log_thread)
> +		goto log_thread;
> +
> +	if (r5l_load_log(log))
> +		goto error;
> +
> +	conf->log = log;
> +	return 0;
> +error:
> +	md_unregister_thread(&log->log_thread);
> +log_thread:
> +	kmem_cache_destroy(log->io_kc);
> +io_kc:
> +	kfree(log);
> +	return -EINVAL;
> +}
> +
> +void r5l_exit_log(struct r5l_log *log)
> +{
> +	md_unregister_thread(&log->log_thread);
> +
> +	kmem_cache_destroy(log->io_kc);
> +	kfree(log);
> +}
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 59e44e9..9608a44 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -899,6 +899,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
>  
>  	might_sleep();
>  
> +	if (!r5l_write_stripe(conf->log, sh))
> +		return;

If no log is configured, r5l_write_stripe will return -EAGAIN, and so
ops_run_io will never submit any IO....



>  	for (i = disks; i--; ) {
>  		int rw;
>  		int replace_only = 0;
> @@ -2478,8 +2480,6 @@ static void raid5_end_write_request(struct bio *bi, int error)
>  		release_stripe(sh->batch_head);
>  }
>  
> -static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
> -
>  static void raid5_build_block(struct stripe_head *sh, int i, int previous)
>  {
>  	struct r5dev *dev = &sh->dev[i];
> @@ -2729,7 +2729,7 @@ static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
>  	return new_sector;
>  }
>  
> -static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
> +sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
>  {
>  	struct r5conf *conf = sh->raid_conf;
>  	int raid_disks = sh->disks;
> @@ -3498,6 +3498,7 @@ static void handle_stripe_clean_event(struct r5conf *conf,
>  			WARN_ON(test_bit(R5_SkipCopy, &dev->flags));
>  			WARN_ON(dev->page != dev->orig_page);
>  		}
> +
>  	if (!discard_pending &&
>  	    test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
>  		clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
> @@ -5746,6 +5747,7 @@ static int handle_active_stripes(struct r5conf *conf, int group,
>  
>  	for (i = 0; i < batch_size; i++)
>  		handle_stripe(batch[i]);
> +	r5l_write_stripe_run(conf->log);
>  
>  	cond_resched();
>  
> diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
> index 02c3bf8..a8daf39 100644
> --- a/drivers/md/raid5.h
> +++ b/drivers/md/raid5.h
> @@ -223,6 +223,9 @@ struct stripe_head {
>  	struct stripe_head	*batch_head; /* protected by stripe lock */
>  	spinlock_t		batch_lock; /* only header's lock is useful */
>  	struct list_head	batch_list; /* protected by head's batch lock*/
> +
> +	struct r5l_io_unit	*log_io;
> +	struct list_head	log_list;
>  	/**
>  	 * struct stripe_operations
>  	 * @target - STRIPE_OP_COMPUTE_BLK target

I wonder if we really need yet another 'list_head' in 'stripe_head'.
I guess one more is no great cost.



> @@ -244,6 +247,7 @@ struct stripe_head {
>  		struct bio	*toread, *read, *towrite, *written;
>  		sector_t	sector;			/* sector of this page */
>  		unsigned long	flags;
> +		u32		log_checksum;
>  	} dev[1]; /* allocated with extra space depending of RAID geometry */
>  };
>  
> @@ -539,6 +543,7 @@ struct r5conf {
>  	struct r5worker_group	*worker_groups;
>  	int			group_cnt;
>  	int			worker_cnt_per_group;
> +	struct r5l_log		*log;
>  };
>  
>  
> @@ -605,4 +610,9 @@ static inline int algorithm_is_DDF(int layout)
>  
>  extern void md_raid5_kick_device(struct r5conf *conf);
>  extern int raid5_set_cache_size(struct mddev *mddev, int size);
> +extern sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);

When making static functions extern, you need to make sure the module
name is mentioned somehow.  So raid5_compute_blocknr, or r5_.. or
md_... 

> +extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
> +extern void r5l_exit_log(struct r5l_log *log);
> +extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
> +extern void r5l_write_stripe_run(struct r5l_log *log);
>  #endif
> diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
> index 8c8e12c..418b1ba 100644
> --- a/include/uapi/linux/raid/md_p.h
> +++ b/include/uapi/linux/raid/md_p.h
> @@ -315,4 +315,52 @@ struct mdp_superblock_1 {
>  					|MD_FEATURE_WRITE_CACHE		\
>  					)
>  
> +struct r5l_payload_header {
> +	__le16 type;
> +	__le16 flags;
> +} __attribute__ ((__packed__));
> +
> +enum {
> +	R5LOG_PAYLOAD_DATA = 0,
> +	R5LOG_PAYLOAD_PARITY = 1,
> +	R5LOG_PAYLOAD_FLUSH = 2,
> +};

I would really like it if this was 
   enum r5l_payload_type {

so I knew immediately where the numbers would appear.
Then have the two different 'flags' enums immediately afterwards also
with useful names.


> +
> +struct r5l_payload_data_parity {
> +	struct r5l_payload_header header;
> +	__le32 blocks; /* block. data/parity size. each 4k has a checksum */
> +	__le64 location; /* sector. For data, it's raid sector. For
> +				parity, it's stripe sector */
> +	__le32 checksum[];
> +} __attribute__ ((__packed__));
> +
> +enum {
> +	R5LOG_PAYLOAD_FLAG_DISCARD = 1,
> +};
> +
> +struct r5l_payload_flush {
> +	struct r5l_payload_header header;
> +	__le32 size; /* flush_stripes size, bytes */
> +	__le64 flush_stripes[];
> +} __attribute__ ((__packed__));
> +
> +enum {
> +	R5LOG_PAYLOAD_FLAG_FLUSH_STRIPE = 1, /* data represents whole stripe */
> +};
> +
> +struct r5l_meta_block {
> +	__le32 magic;
> +	__le32 checksum;
> +	__u8 version;
> +	__u8 __zero_pading;
> +	__le16 block_size; /* 512B - 4k */
> +	__le32 meta_size; /* whole size of the block */
> +
> +	__le64 seq;
> +	__le64 position; /* block, start from rdev->data_offset, current position */
> +	struct r5l_payload_header payloads[];
> +} __attribute__ ((__packed__));
> +
> +#define R5LOG_VERSION 0x1
> +#define R5LOG_MAGIC 0x6433c509
>  #endif

Thanks,
NeilBrown

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 4/9] raid5: log reclaim support
  2015-07-30  0:38 ` [PATCH 4/9] raid5: log reclaim support Shaohua Li
@ 2015-08-05  3:43   ` NeilBrown
  2015-08-05 21:34     ` Shaohua Li
  2015-08-05  3:52   ` NeilBrown
  1 sibling, 1 reply; 30+ messages in thread
From: NeilBrown @ 2015-08-05  3:43 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, Kernel-team, songliubraving, hch, dan.j.williams

On Wed, 29 Jul 2015 17:38:44 -0700 Shaohua Li <shli@fb.com> wrote:

> This is the reclaim support for raid5 log. A stripe write will have
> following steps:
> 
> 1. reconstruct the stripe, read data/calculate parity. ops_run_io
> prepares to write data/parity to raid disks
> 2. hijack ops_run_io. stripe data/parity is appending to log disk
> 3. flush log disk cache
> 4. ops_run_io run again and do normal operation. stripe data/parity is
> written in raid array disks. raid core can return io to upper layer.
> 5. flush cache of all raid array disks
> 6. update super block
> 7. log disk space used by the stripe can be reused
> 
> In practice, several stripes consist of an io_unit and we will batch
> several io_unit in different steps, but the whole process doesn't
> change.
> 
> It's possible io return just after data/parity hit log disk, but then
> read IO will need read from log disk. For simplicity, IO return happens
> at step 4, where read IO can directly read from raid disks.
> 
> Currently reclaim run every minute or out of space. Reclaim is just to
> free log disk spaces, it doesn't impact data consistency.

Having arbitrary times lines "every minute" is a warning sign.
"As soon as possible" and "Just it time" can both make sense easily.
"every minute" needs more justification.

I'll probably say more when I find the code.


> 
> Recovery make sure raid disks and log disk have the same data of a
> stripe. If crash happens before 4, recovery might/might not recovery
> stripe's data/parity depending on if data/parity and its checksum
> matches. In either case, this doesn't change the syntax of an IO write.
> After step 3, stripe is guaranteed recoverable, because stripe's
> data/parity is persistent in log disk. In some cases, log disk content
> and raid disks content of a stripe are the same, but recovery will still
> copy log disk content to raid disks, this doesn't impact data
> consistency. space reuse happens after superblock update and cache
> flush.
> 
> There is one situation we want to avoid. A broken meta in the middle of
> a log causes recovery can't find meta at the head of log. If operations
> require meta at the head persistent in log, we must make sure meta
> before it persistent in log too. The case is stripe data/parity is in
> log and we start write stripe to raid disks (before step 4). stripe
> data/parity must be persistent in log before we do the write to raid
> disks. The solution is we restrictly maintain io_unit list order. In
> this case, we only write stripes of an io_unit to raid disks till the
> io_unit is the first one whose data/parity is in log.
> 
> The io_unit list order is important for other cases too. For example,
> some io_unit are reclaimable and others not. They can be mixed in the
> list, we shouldn't reuse space of an unreclaimable io_unit.
> 
> Signed-off-by: Shaohua Li <shli@fb.com>
> ---
>  drivers/md/raid5-cache.c | 261 +++++++++++++++++++++++++++++++++++++++++++++++
>  drivers/md/raid5.c       |   8 +-
>  drivers/md/raid5.h       |   3 +
>  3 files changed, 271 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> index 5c9bab6..a418e45 100644
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c
> @@ -25,6 +25,7 @@
>  #include "raid5.h"
>  
>  typedef u64 r5blk_t; /* log blocks, 512B - 4k */
> +#define RECLAIM_TIMEOUT (60 * HZ) /* reclaim run every 60s */
>  
>  struct r5l_log {
>  	struct mddev *mddev;
> @@ -54,9 +55,14 @@ struct r5l_log {
>  
>  	spinlock_t io_list_lock;
>  	struct list_head running_ios; /* running io_unit list */
> +	struct list_head io_end_ios; /* io end io_unit list */
> +	struct list_head stripe_end_ios; /* stripe end io_unit list */

When the comment has nearly the same words as the code, it isn't adding
much, a bit like:
    i++;  /* increment i */

Maybe you mean:

   struct list_head running_ios; /* io_units which are still running,
                                  * and have not yet been completely
                                  * written to the log. */
   struct list_head io_end_ios;  /* io_units which have been completely
                                  * written to the log but not yet
                                  * written to the RAID */
   struct list_head stripe_end_ios;/* io_units which have been
                                    * completely written to the RAID
                                    * but have not yet been considered
                                    * for updating the start of the
                                    * log. */

>  
>  	struct kmem_cache *io_kc;
>  
> +	struct md_thread *reclaim_thread;
> +	r5blk_t reclaim_target; /* 0 means reclaiming possible io_unit */
> +

by "target" I think you mean "number of blocks that need to be
reclaimed".  I can't quite guess what a "possible io_unit" is.

    int reclaim_target; /* number of blocks that need to be reclaimed
                         * promptly.  If 0, then .....
                         */

>  	struct md_thread *log_thread;
>  	struct list_head log_stripes;
>  	spinlock_t log_stripes_lock;
> @@ -537,8 +543,246 @@ void r5l_write_stripe_run(struct r5l_log *log)
>  	md_wakeup_thread(log->log_thread);
>  }
>  
> +void r5l_stripe_write_finished(struct stripe_head *sh)
> +{
> +	struct r5l_io_unit *io;
> +
> +	/* Don't support stripe batch */
> +	io = sh->log_io;
> +	if (!io)
> +		return;
> +	sh->log_io = NULL;
> +
> +	if (!atomic_dec_and_test(&io->pending_stripe))
> +		return;
> +	r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);

 if (atomic_dec_and_test(...))
          r5l_set_io_unit_state(...);

??

> +}
> +
> +static void r5l_compress_stripe_end_list(struct r5l_log *log)
> +{
> +	struct r5l_io_unit *first, *last, *io;
> +
> +	if (list_empty(&log->stripe_end_ios))
> +		return;
> +	first = list_first_entry(&log->stripe_end_ios,
> +		struct r5l_io_unit, log_sibling);
> +	last = list_last_entry(&log->stripe_end_ios,
> +		struct r5l_io_unit, log_sibling);
> +	/* Keep 2 io_unit in the list, superblock points to the last one */
> +	if (first == last)
> +		return;
> +	list_del(&first->log_sibling);
> +	list_del(&last->log_sibling);
> +	while (!list_empty(&log->stripe_end_ios)) {
> +		io = list_first_entry(&log->stripe_end_ios,
> +			struct r5l_io_unit, log_sibling);
> +		list_del(&io->log_sibling);
> +		first->log_end = io->log_end;
> +		r5l_free_io_unit(log, io);
> +	}
> +	list_add_tail(&first->log_sibling, &log->stripe_end_ios);
> +	list_add_tail(&last->log_sibling, &log->stripe_end_ios);
> +}

A comment explaining why you might want to compress a list (which I
think means to only keep the first and last) would be quite helpful
here.

> +
> +static void r5l_move_io_unit_list(struct list_head *from, struct list_head *to,
> +	int state)
> +{
> +	struct r5l_io_unit *io;
> +
> +	while (!list_empty(from)) {
> +		io = list_first_entry(from, struct r5l_io_unit, log_sibling);
> +		/* don't change list order */
> +		if (io->state >= state)
> +			list_move_tail(&io->log_sibling, to);
> +		else
> +			break;
> +	}
> +}
> +
> +/*
> + * Starting dispatch IO to raid.
> + * io_unit(meta) consists of a log. There is one situation we want to avoid. A
> + * broken meta in the middle of a log causes recovery can't find meta at the
> + * head of log. If operations require meta at the head persistent in log, we
> + * must make sure meta before it persistent in log too. A case is:
> + *
> + * stripe data/parity is in log, we start write stripe to raid disks. stripe
> + * data/parity must be persistent in log before we do the write to raid disks.
> + *
> + * The solution is we restrictly maintain io_unit list order. In this case, we
> + * only write stripes of an io_unit to raid disks till the io_unit is the first
> + * one whose data/parity is in log.
> + * */
> +void r5l_flush_stripe_to_raid(struct r5l_log *log)
> +{
> +	struct r5l_io_unit *io;
> +	struct stripe_head *sh;
> +	bool run_stripe;
> +
> +	if (!log)
> +		return;
> +	/* find io_unit with io end but stripes are not running */
> +	spin_lock(&log->io_list_lock);
> +	r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios,
> +			IO_UNIT_IO_END);
> +	r5l_move_io_unit_list(&log->io_end_ios, &log->stripe_end_ios,
> +			IO_UNIT_STRIPE_END);
> +	r5l_compress_stripe_end_list(log);
> +	run_stripe = !list_empty(&log->io_end_ios);
> +	spin_unlock(&log->io_list_lock);
> +
> +	if (!run_stripe)
> +		return;
> +
> +	blkdev_issue_flush(r5l_bdev(log), GFP_NOIO, NULL);
> +
> +	spin_lock(&log->io_list_lock);
> +	list_for_each_entry(io, &log->io_end_ios, log_sibling) {
> +		if (io->state >= IO_UNIT_STRIPE_START)
> +			continue;
> +		r5l_set_io_unit_state(io, IO_UNIT_STRIPE_START);
> +
> +		while (!list_empty(&io->stripe_list)) {
> +			sh = list_first_entry(&io->stripe_list,
> +				struct stripe_head, log_list);
> +			list_del_init(&sh->log_list);
> +			set_bit(STRIPE_HANDLE, &sh->state);
> +			release_stripe(sh);

This code makes me a bit nervous.  handle_stripe() can potentially be
called on any stripe at any time.
Here you are scheduling a call the handle_stripe() without obviously
changing the state of the stripe.  So whatever is going to happen now
could potentially have happened before... is that safe?


> +		}
> +	}
> +	spin_unlock(&log->io_list_lock);
> +}
> +
> +static void r5l_disks_flush_end(struct bio *bio, int err)
> +{
> +	struct completion *io_complete = bio->bi_private;
> +
> +	complete(io_complete);
> +	bio_put(bio);
> +}
> +
> +static void r5l_flush_all_disks(struct r5l_log *log)
> +{
> +	struct mddev *mddev = log->mddev;
> +	struct bio *bi;
> +	DECLARE_COMPLETION_ONSTACK(io_complete);
> +
> +	bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
> +	bi->bi_end_io = r5l_disks_flush_end;
> +	bi->bi_private = &io_complete;
> +
> +	/* If bio hasn't payload, this function will just flush all disks */
> +	md_flush_request(mddev, bi);
> +
> +	wait_for_completion_io(&io_complete);
> +}
> +
> +static void r5l_kick_io_unit(struct r5l_log *log, struct r5l_io_unit *io)
> +{
> +	/* the log thread will log the io unit */
> +	r5l_wait_io_unit_state(io, IO_UNIT_IO_END);
> +	if (io->state < IO_UNIT_STRIPE_START)
> +		r5l_flush_stripe_to_raid(log);
> +	r5l_wait_io_unit_state(io, IO_UNIT_STRIPE_END);
> +}
> +
> +static void r5l_write_super(struct r5l_log *log, sector_t cp);
> +static void r5l_do_reclaim(struct r5l_log *log)
> +{
> +	struct r5l_io_unit *io, *last;
> +	LIST_HEAD(list);
> +	r5blk_t free = 0;
> +	r5blk_t reclaim_target = xchg(&log->reclaim_target, 0);
> +
> +	spin_lock(&log->io_list_lock);
> +	/*
> +	 * move proper io_unit to reclaim list. We should not change the order.
> +	 * reclaimable/unreclaimable io_unit can be mixed in the list, we
> +	 * shouldn't reuse space of an unreclaimable io_unit
> +	 * */
> +	while (1) {
> +		r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios,
> +			IO_UNIT_IO_END);
> +		r5l_move_io_unit_list(&log->io_end_ios, &log->stripe_end_ios,
> +				IO_UNIT_STRIPE_END);
> +		while (!list_empty(&log->stripe_end_ios)) {
> +			io = list_first_entry(&log->stripe_end_ios,
> +				struct r5l_io_unit, log_sibling);
> +			list_move_tail(&io->log_sibling, &list);
> +			free += (io->log_end - io->log_start +
> +				log->total_blocks) % log->total_blocks;
> +		}
> +
> +		if (free >= reclaim_target || (list_empty(&log->running_ios) &&
> +		    list_empty(&log->io_end_ios) &&
> +		    list_empty(&log->stripe_end_ios)))
> +			break;
> +
> +		if (!list_empty(&log->io_end_ios)) {
> +			io = list_first_entry(&log->io_end_ios,
> +				struct r5l_io_unit, log_sibling);
> +			spin_unlock(&log->io_list_lock);
> +			/* nobody else can delete the io, we are safe */
> +			r5l_kick_io_unit(log, io);
> +			spin_lock(&log->io_list_lock);
> +			continue;
> +		}
> +
> +		if (!list_empty(&log->running_ios)) {
> +			io = list_first_entry(&log->running_ios,
> +				struct r5l_io_unit, log_sibling);
> +			spin_unlock(&log->io_list_lock);
> +			/* nobody else can delete the io, we are safe */
> +			r5l_kick_io_unit(log, io);
> +			spin_lock(&log->io_list_lock);
> +			continue;
> +		}
> +	}
> +	spin_unlock(&log->io_list_lock);

Well, here we are with the important parts of the reclaim code...

The main result of the above section is to possibly call
r5l_flush_stripe_to_raid() a few times, and to wait until 'list'
contains enough io_units to satisfy the requirement.

As raid5d already calls r5l_flush_stripe_to_raid - which it really must
to make sure that writes complete quickly - this really comes down to
some book keeping and some waiting.
Book keeping can be done as changes happen, and waiting is best not
done at all.

To be more specific: when an io_unit transitions to IO_UNIT_STRIPE_END
it can immediately be removed from the list and if it was the first
io_unit on the list, then the log_start can immediately be updated.


> +
> +	if (list_empty(&list))
> +		return;
> +
> +	r5l_flush_all_disks(log);
> +
> +	/* super always point to last valid meta */
> +	last = list_last_entry(&list, struct r5l_io_unit, log_sibling);
> +	r5l_write_super(log, r5l_block_to_sector(log, last->log_start));

This bit flushes all the disks and then updates the metadata and writes
it.
As md_super_write already uses WRITE_FLUSH_FUA I don't think the extra
flush is needed.

I really think you should just update ->recovery_offset and set
MD_CHANGE_DEVS (or similar) and let the update happen.


> +
> +	mutex_lock(&log->io_mutex);
> +	log->last_checkpoint = last->log_start;
> +	log->last_cp_seq = last->seq;
> +	mutex_unlock(&log->io_mutex);
> +	wake_up(&log->space_waitq);
> +
> +	while (!list_empty(&list)) {
> +		io = list_first_entry(&list, struct r5l_io_unit, log_sibling);
> +		list_del(&io->log_sibling);
> +		r5l_free_io_unit(log, io);
> +	}
> +}

So I really think all of this can be done as-it-happens (the
book-keeping) or asynchronously.  There is no need to push something
every minute.



> +
> +static void r5l_reclaim_thread(struct md_thread *thread)
> +{
> +	struct mddev *mddev = thread->mddev;
> +	struct r5conf *conf = mddev->private;
> +	struct r5l_log *log = conf->log;
> +
> +	if (!log)
> +		return;
> +	r5l_do_reclaim(log);
> +}
> +
>  static void r5l_wake_reclaim(struct r5l_log *log, r5blk_t space)
>  {
> +	r5blk_t target;
> +
> +	do {
> +		target = log->reclaim_target;
> +		if (space < target)
> +			return;
> +	} while (cmpxchg(&log->reclaim_target, target, space) != target);
> +	md_wakeup_thread(log->reclaim_thread);
>  }
>  
>  static int r5l_recovery_log(struct r5l_log *log)
> @@ -642,11 +886,19 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
>  
>  	spin_lock_init(&log->io_list_lock);
>  	INIT_LIST_HEAD(&log->running_ios);
> +	INIT_LIST_HEAD(&log->io_end_ios);
> +	INIT_LIST_HEAD(&log->stripe_end_ios);
>  
>  	log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
>  	if (!log->io_kc)
>  		goto io_kc;
>  
> +	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
> +		log->mddev, "reclaim");
> +	if (!log->reclaim_thread)
> +		goto reclaim_thread;
> +	log->reclaim_thread->timeout = RECLAIM_TIMEOUT;
> +
>  	INIT_LIST_HEAD(&log->log_stripes);
>  	spin_lock_init(&log->log_stripes_lock);
>  	log->log_thread = md_register_thread(r5l_log_thread,
> @@ -662,6 +914,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
>  error:
>  	md_unregister_thread(&log->log_thread);
>  log_thread:
> +	md_unregister_thread(&log->reclaim_thread);
> +reclaim_thread:
>  	kmem_cache_destroy(log->io_kc);
>  io_kc:
>  	kfree(log);
> @@ -670,6 +924,13 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
>  
>  void r5l_exit_log(struct r5l_log *log)
>  {
> +	/*
> +	 * at this point all stripes are finished, so io_unit is at least in
> +	 * STRIPE_END state
> +	 * */
> +	r5l_wake_reclaim(log, -1L);
> +	md_unregister_thread(&log->reclaim_thread);
> +	r5l_do_reclaim(log);
>  	md_unregister_thread(&log->log_thread);
>  
>  	kmem_cache_destroy(log->io_kc);
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 9608a44..d1ddd31 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -417,7 +417,7 @@ static int release_stripe_list(struct r5conf *conf,
>  	return count;
>  }
>  
> -static void release_stripe(struct stripe_head *sh)
> +void release_stripe(struct stripe_head *sh)
>  {
>  	struct r5conf *conf = sh->raid_conf;
>  	unsigned long flags;
> @@ -3101,6 +3101,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
>  		if (bi)
>  			bitmap_end = 1;
>  
> +		r5l_stripe_write_finished(sh);
> +
>  		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
>  			wake_up(&conf->wait_for_overlap);
>  
> @@ -3499,6 +3501,8 @@ static void handle_stripe_clean_event(struct r5conf *conf,
>  			WARN_ON(dev->page != dev->orig_page);
>  		}
>  
> +	r5l_stripe_write_finished(sh);
> +
>  	if (!discard_pending &&
>  	    test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
>  		clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
> @@ -5867,6 +5871,8 @@ static void raid5d(struct md_thread *thread)
>  		set_bit(R5_DID_ALLOC, &conf->cache_state);
>  	}
>  
> +	r5l_flush_stripe_to_raid(conf->log);
> +
>  	async_tx_issue_pending_all();
>  	blk_finish_plug(&plug);
>  
> diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
> index a8daf39..23cc9c3 100644
> --- a/drivers/md/raid5.h
> +++ b/drivers/md/raid5.h
> @@ -611,8 +611,11 @@ static inline int algorithm_is_DDF(int layout)
>  extern void md_raid5_kick_device(struct r5conf *conf);
>  extern int raid5_set_cache_size(struct mddev *mddev, int size);
>  extern sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
> +extern void release_stripe(struct stripe_head *sh);
>  extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
>  extern void r5l_exit_log(struct r5l_log *log);
>  extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
>  extern void r5l_write_stripe_run(struct r5l_log *log);
> +extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
> +extern void r5l_stripe_write_finished(struct stripe_head *sh);
>  #endif

Thanks,
NeilBrown


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 4/9] raid5: log reclaim support
  2015-07-30  0:38 ` [PATCH 4/9] raid5: log reclaim support Shaohua Li
  2015-08-05  3:43   ` NeilBrown
@ 2015-08-05  3:52   ` NeilBrown
  1 sibling, 0 replies; 30+ messages in thread
From: NeilBrown @ 2015-08-05  3:52 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, Kernel-team, songliubraving, hch, dan.j.williams

On Wed, 29 Jul 2015 17:38:44 -0700 Shaohua Li <shli@fb.com> wrote:

> +		while (!list_empty(&log->stripe_end_ios)) {
> +			io = list_first_entry(&log->stripe_end_ios,
> +				struct r5l_io_unit, log_sibling);
> +			list_move_tail(&io->log_sibling, &list);
> +			free += (io->log_end - io->log_start +
> +				log->total_blocks) % log->total_blocks;
> +		}

sorry, forgot to mention this bit.

That '%' is acting on 64 but numbers, so it won't build in a 32bit
machine.
Maybe use SECTOR_DIV, maybe do an 
   if (x > y) free = x-y else free = x+total-y;

or something.

NeilBrown


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 5/9] raid5: log recovery
  2015-07-30  0:38 ` [PATCH 5/9] raid5: log recovery Shaohua Li
@ 2015-08-05  4:05   ` NeilBrown
  2015-08-05 21:39     ` Shaohua Li
  0 siblings, 1 reply; 30+ messages in thread
From: NeilBrown @ 2015-08-05  4:05 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, Kernel-team, songliubraving, hch, dan.j.williams

On Wed, 29 Jul 2015 17:38:45 -0700 Shaohua Li <shli@fb.com> wrote:

> This is the log recovery support. The process is quite straightforward.
> We scan the log and read all valid meta/data/parity into memory. If a
> stripe's data/parity checksum is correct, the stripe will be recoveried.
> Otherwise, it's discarded and we don't scan the log further. The reclaim
> process guarantees stripe which starts to be flushed raid disks has
> completed data/parity and has correct checksum. To recovery a stripe, we
> just copy its data/parity to corresponding raid disks.
> 
> The trick thing is superblock update after recovery. we can't let
> superblock point to last valid meta block. The log might look like:
> | meta 1| meta 2| meta 3|
> meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If superblock
> points to meta 1, we write a new valid meta 2n.  If crash happens again,
> new recovery will start from meta 1. Since meta 2n is valid, recovery
> will think meta 3 is valid, which is wrong.  The solution is we create a
> new meta in meta2 with its seq == meta 1's seq + 2 and let superblock
> points to meta2.  recovery will not think meta 3 is a valid meta,
> because its seq is wrong

I like the idea of using a slightly larger 'seq' to avoid collisions -
except that I would probably feel safer with a much larger seq. May add
1024 or something (at least 10).

> 
> TODO:
> -recovery should run the stripe cache state machine in case of disk
> breakage.

Why?

when you write to the log, you write all of the blocks that need
updating, whether they are destined for a failed device or not.

When you recover, you then have all the blocks that you might want to
write.  So write all the ones for which you have working devices, and
ignore the rest.

Did I miss something?

Not that I object, but if it works....



> 
> Signed-off-by: Shaohua Li <shli@fb.com>
> ---
>  drivers/md/raid5-cache.c | 310 ++++++++++++++++++++++++++++++++++++++++++++++-
>  drivers/md/raid5.c       |   4 +-
>  drivers/md/raid5.h       |   6 +
>  3 files changed, 315 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> index a418e45..17dab66 100644
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c
> @@ -785,11 +785,315 @@ static void r5l_wake_reclaim(struct r5l_log *log, r5blk_t space)
>  	md_wakeup_thread(log->reclaim_thread);
>  }
>  
> +struct r5l_recovery_ctx {
> +	struct page *meta_page;
> +	unsigned int meta_total_blocks;
> +	r5blk_t pos;
> +	u64 seq;
> +};
> +
> +static inline sector_t r5l_sector_to_stripe_sector(struct r5l_log *log,
> +       sector_t sect)
> +{
> +	struct r5conf *conf = log->mddev->private;
> +	int dd;
> +	return raid5_compute_sector(conf, sect, 0, &dd, NULL);
> +}
> +
> +static int r5l_read_meta_block(struct r5l_log *log,
> +	struct r5l_recovery_ctx *ctx)
> +{
> +	struct r5conf *conf = log->mddev->private;
> +	struct page *page = ctx->meta_page;
> +	struct r5l_meta_block *mb;
> +	u32 crc, stored_crc;
> +	struct r5l_payload_header *header;
> +	int next_type = -1;
> +	int last_type = -1;
> +	sector_t last_stripe_sector = 0;
> +	int offset;
> +
> +	if (!sync_page_io(log->rdev, r5l_block_to_sector(log, ctx->pos),
> +	    log->block_size, page, READ, false))
> +		return -EIO;
> +
> +	mb = page_address(page);
> +	stored_crc = le32_to_cpu(mb->checksum);
> +	mb->checksum = 0;
> +
> +	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
> +	    le64_to_cpu(mb->seq) != ctx->seq ||
> +	    mb->version != R5LOG_VERSION ||
> +	    le64_to_cpu(mb->position) != ctx->pos)
> +		return -EINVAL;
> +
> +	crc = r5l_calculate_checksum(log, log->uuid_checksum,
> +			mb, log->block_size);
> +	if (stored_crc != crc)
> +		return -EINVAL;
> +
> +	if (le32_to_cpu(mb->meta_size) > log->block_size)
> +		return -EINVAL;
> +
> +	ctx->meta_total_blocks = 1;
> +	offset = sizeof(struct r5l_meta_block);
> +	while (offset < le32_to_cpu(mb->meta_size)) {
> +		u16 type;
> +		header = page_address(page) + offset;
> +		type = le16_to_cpu(header->type);
> +
> +		if (next_type != -1 && type != next_type)
> +			return -EINVAL;
> +		if (type == R5LOG_PAYLOAD_DATA) {
> +			struct r5l_payload_data_parity *payload;
> +
> +			payload = (struct r5l_payload_data_parity *)header;
> +			if (le32_to_cpu(payload->blocks) != r5l_page_blocks(log, 1))
> +				return -EINVAL;
> +			if (last_type != -1) {
> +				if (r5l_sector_to_stripe_sector(log,
> +				    le64_to_cpu(payload->location)) !=
> +				    last_stripe_sector)
> +					return -EINVAL;
> +			} else
> +				last_stripe_sector =
> +					r5l_sector_to_stripe_sector(log,
> +						le64_to_cpu(payload->location));
> +
> +			ctx->meta_total_blocks += r5l_page_blocks(log, 1);
> +			next_type = -1;
> +			last_type = type;
> +			offset += sizeof(struct r5l_payload_data_parity) +
> +				sizeof(__le32);
> +		} else if (type == R5LOG_PAYLOAD_PARITY) {
> +			struct r5l_payload_data_parity *payload;
> +
> +			payload = (struct r5l_payload_data_parity *)header;
> +			if (last_type == -1)
> +				return -EINVAL;
> +
> +			if (le32_to_cpu(payload->blocks) !=
> +			    r5l_page_blocks(log, conf->max_degraded))
> +				return -EINVAL;
> +			if (le64_to_cpu(payload->location) != last_stripe_sector)
> +				return -EINVAL;
> +
> +			ctx->meta_total_blocks += r5l_page_blocks(log,
> +				conf->max_degraded);
> +			next_type = R5LOG_PAYLOAD_DATA;
> +			last_type = -1;
> +			offset += sizeof(struct r5l_payload_data_parity) +
> +				sizeof(__le32) * conf->max_degraded;
> +		} else
> +			return -EINVAL;
> +	}
> +	if (offset > le32_to_cpu(mb->meta_size))
> +		return -EINVAL;
> +
> +	return 0;
> +}

I'm not sure the next_type/last_type stuff really helps.  You are just
checking that there is at least one data block between pairs of parity
blocks, but that is a fairly weak test - there are plenty of other ways
that things could go wrong.
I think just trust that the data was written correctly, and that
checksum will detect the unlikely corruption.

> +
> +static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
> +	struct r5l_recovery_ctx *ctx, sector_t stripe_sect,
> +	int *offset, r5blk_t *log_offset)
> +{
> +	struct r5conf *conf = log->mddev->private;
> +	struct stripe_head *dummy;
> +	struct r5l_payload_data_parity *payload;
> +	int disk_index;
> +
> +	dummy = get_active_stripe(conf, stripe_sect, 0, 0, 0);

Why is this called 'dummy'.  It is a real stripe and should be called
'sh'

> +	while (1) {
> +		payload = page_address(ctx->meta_page) + *offset;
> +
> +		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
> +			raid5_compute_sector(conf,
> +				le64_to_cpu(payload->location), 0,
> +				&disk_index, dummy);
> +
> +			sync_page_io(log->rdev, r5l_block_to_sector(log,
> +				*log_offset), PAGE_SIZE,
> +				dummy->dev[disk_index].page, READ, false);
> +			dummy->dev[disk_index].log_checksum =
> +				le32_to_cpu(payload->checksum[0]);
> +			set_bit(R5_Wantwrite, &dummy->dev[disk_index].flags);
> +		} else {
> +			disk_index = dummy->pd_idx;
> +			sync_page_io(log->rdev, r5l_block_to_sector(log,
> +				*log_offset), PAGE_SIZE,
> +				dummy->dev[disk_index].page, READ, false);
> +			dummy->dev[disk_index].log_checksum =
> +				le32_to_cpu(payload->checksum[0]);
> +			set_bit(R5_Wantwrite, &dummy->dev[disk_index].flags);
> +
> +			if (dummy->qd_idx >= 0) {
> +				disk_index = dummy->qd_idx;
> +				sync_page_io(log->rdev, r5l_block_to_sector(log,
> +					r5l_ring_add(log, *log_offset,
> +						r5l_page_blocks(log, 1))),
> +					PAGE_SIZE,
> +					dummy->dev[disk_index].page,
> +					READ, false);
> +				dummy->dev[disk_index].log_checksum =
> +					le32_to_cpu(payload->checksum[1]);
> +				set_bit(R5_Wantwrite,
> +					&dummy->dev[disk_index].flags);
> +			}
> +		}
> +
> +		*log_offset = r5l_ring_add(log, *log_offset,
> +			le32_to_cpu(payload->blocks));
> +		*offset += sizeof(struct r5l_payload_data_parity) +
> +			sizeof(__le32) * (le32_to_cpu(payload->blocks) >>
> +			log->page_block_shift);
> +		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
> +			break;
> +	}
> +
> +	for (disk_index = 0; disk_index < dummy->disks; disk_index++) {
> +		void *addr;
> +		u32 checksum;
> +
> +		if (!test_bit(R5_Wantwrite, &dummy->dev[disk_index].flags))
> +			continue;
> +		addr = kmap_atomic(dummy->dev[disk_index].page);
> +		checksum = r5l_calculate_checksum(log,
> +			log->uuid_checksum, addr, PAGE_SIZE);
> +		kunmap_atomic(addr);
> +		if (checksum != dummy->dev[disk_index].log_checksum)
> +			goto error;
> +	}
> +
> +	/* FIXME: let raid core to handle the stripe */
> +	for (disk_index = 0; disk_index < dummy->disks; disk_index++) {
> +		struct md_rdev *rdev, *rrdev;
> +		if (!test_and_clear_bit(R5_Wantwrite,
> +				&dummy->dev[disk_index].flags))
> +			continue;
> +
> +		rdev = rcu_dereference(conf->disks[disk_index].rdev);
> +		sync_page_io(rdev, stripe_sect, PAGE_SIZE,
> +			dummy->dev[disk_index].page, WRITE, false);
> +		rrdev = rcu_dereference(conf->disks[disk_index].replacement);
> +		if (rrdev)
> +			sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
> +				dummy->dev[disk_index].page, WRITE, false);
> +	}
> +	release_stripe(dummy);
> +	return 0;
> +
> +error:
> +	for (disk_index = 0; disk_index < dummy->disks; disk_index++)
> +		dummy->dev[disk_index].flags = 0;
> +	release_stripe(dummy);
> +	return -EINVAL;
> +}
> +
> +static int r5l_recovery_flush_one_meta(struct r5l_log *log,
> +	struct r5l_recovery_ctx *ctx)
> +{
> +	struct r5l_payload_data_parity *payload;
> +	struct r5l_meta_block *mb;
> +	int offset;
> +	r5blk_t log_offset;
> +	sector_t stripe_sector;
> +
> +	mb = page_address(ctx->meta_page);
> +	offset = sizeof(struct r5l_meta_block);
> +	log_offset = r5l_ring_add(log, ctx->pos, 1);
> +
> +	while (offset < le32_to_cpu(mb->meta_size)) {
> +		payload = (void *)mb + offset;
> +		stripe_sector = r5l_sector_to_stripe_sector(log,
> +					le64_to_cpu(payload->location));
> +		if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
> +		    &offset, &log_offset))
> +			return -EINVAL;
> +	}
> +	return 0;
> +}
> +
> +/* copy data/parity from log to raid disks */
> +static void r5l_recovery_flush_log(struct r5l_log *log,
> +	struct r5l_recovery_ctx *ctx)
> +{
> +	while (1) {
> +		if (r5l_read_meta_block(log, ctx))
> +			return;
> +		if (r5l_recovery_flush_one_meta(log, ctx))
> +			return;
> +		ctx->seq++;
> +		ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
> +	}
> +}
> +
> +static int r5l_log_write_empty_meta_block(struct r5l_log *log, r5blk_t pos,
> +	u64 seq)
> +{
> +	struct page *page;
> +	struct r5l_meta_block *mb;
> +	u32 crc;
> +
> +	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
> +	if (!page)
> +		return -ENOMEM;
> +	mb = page_address(page);
> +	mb->magic = cpu_to_le32(R5LOG_MAGIC);
> +	mb->version = R5LOG_VERSION;
> +	mb->block_size = cpu_to_le16(log->block_size);
> +	mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
> +	mb->seq = cpu_to_le64(seq);
> +	mb->position = cpu_to_le64(pos);
> +	crc = r5l_calculate_checksum(log, log->uuid_checksum, mb,
> +			log->block_size);
> +	mb->checksum = cpu_to_le32(crc);
> +
> +	if (!sync_page_io(log->rdev, r5l_block_to_sector(log, pos),
> +	    log->block_size, page, WRITE_FUA, false)) {
> +		__free_page(page);
> +		return -EIO;
> +	}
> +	__free_page(page);
> +	return 0;
> +}
> +
>  static int r5l_recovery_log(struct r5l_log *log)
>  {
> -	/* fake recovery */
> -	log->seq = log->last_cp_seq + 1;
> -	log->log_start = r5l_ring_add(log, log->last_checkpoint, 1);
> +	struct r5l_recovery_ctx ctx;
> +
> +	ctx.pos = log->last_checkpoint;
> +	ctx.seq = log->last_cp_seq;
> +	ctx.meta_page = alloc_page(GFP_KERNEL);
> +	if (!ctx.meta_page)
> +		return -ENOMEM;
> +
> +	r5l_recovery_flush_log(log, &ctx);
> +	__free_page(ctx.meta_page);
> +
> +	/*
> +	 * we did a recovery. Now ctx.pos points to an invalid meta block. New
> +	 * log will start here. but we can't let superblock point to last valid
> +	 * meta block. The log might looks like:
> +	 * | meta 1| meta 2| meta 3|
> +	 * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If
> +	 * superblock points to meta 1, we write a new valid meta 2n.  if crash
> +	 * happens again, new recovery will start from meta 1. Since meta 2n is
> +	 * valid now, recovery will think meta 3 is valid, which is wrong.
> +	 * The solution is we create a new meta in meta2 with its seq == meta
> +	 * 1's seq + 2 and let superblock points to meta2. The same recovery will
> +	 * not think meta 3 is a valid meta, because its seq doesn't match
> +	 */
> +	if (ctx.seq > log->last_cp_seq + 1) {
> +		int ret;
> +		r5l_flush_all_disks(log);
> +
> +		ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 1);
> +		if (ret)
> +			return ret;
> +		log->seq = ctx.seq + 2;
> +		log->log_start = r5l_ring_add(log, ctx.pos, 1);
> +		r5l_write_super(log, r5l_block_to_sector(log, ctx.pos));
> +	}
>  	return 0;
>  }
>  
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index d1ddd31..77af7f0 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -662,7 +662,7 @@ static int has_failed(struct r5conf *conf)
>  	return 0;
>  }
>  
> -static struct stripe_head *
> +struct stripe_head *
>  get_active_stripe(struct r5conf *conf, sector_t sector,
>  		  int previous, int noblock, int noquiesce)
>  {
> @@ -2527,7 +2527,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
>   * Input: a 'big' sector number,
>   * Output: index of the data and parity disk, and the sector # in them.
>   */
> -static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
> +sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
>  				     int previous, int *dd_idx,
>  				     struct stripe_head *sh)
>  {
> diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
> index 23cc9c3..fd10d29 100644
> --- a/drivers/md/raid5.h
> +++ b/drivers/md/raid5.h
> @@ -618,4 +618,10 @@ extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
>  extern void r5l_write_stripe_run(struct r5l_log *log);
>  extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
>  extern void r5l_stripe_write_finished(struct stripe_head *sh);
> +extern sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
> +				     int previous, int *dd_idx,
> +				     struct stripe_head *sh);
> +extern struct stripe_head *
> +get_active_stripe(struct r5conf *conf, sector_t sector,
> +		  int previous, int noblock, int noquiesce);
>  #endif


Thanks,
NeilBrown


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 7/9] raid5: don't allow resize/reshape with cache(log) support
  2015-07-30  0:38 ` [PATCH 7/9] raid5: don't allow resize/reshape with cache(log) support Shaohua Li
@ 2015-08-05  4:13   ` NeilBrown
  2015-08-05 21:42     ` Shaohua Li
  0 siblings, 1 reply; 30+ messages in thread
From: NeilBrown @ 2015-08-05  4:13 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, Kernel-team, songliubraving, hch, dan.j.williams

On Wed, 29 Jul 2015 17:38:47 -0700 Shaohua Li <shli@fb.com> wrote:

> If cache(log) support is enabled, don't allow resize/reshape in current
> stage. In the future, we can flush all data from cache(log) to raid
> before resize/reshape and then allow resize/reshape.

Just to be on the safe side, you could probably add code to refuse to
start an array that is in the middle of a reshape and also have a log
configured.

I think it makes sense to plan ahead a little and make sure we can
handle a cache on a reshaping array properly.

If the log metadata block includes a before/after flag for each stripe,
which recorded whether the stripe was "before" or "after"
reshape_position when it was written, then when recovering the log we
can check if the given addresses are still on that side.  If they are,
just recover using the appropriate geometry info from the superblock.
If not, then reshape has passed over that stripe and it must now be
fully up-to-date on the RAID so the data in the log can be discarded.

There may be some details I missed, but I think it is worth thinking
through properly.  I don't expect the code to handle this straight
away, but we need a clear plan to be sure there is sufficient
information stored in the log.

Thanks,
NeilBrown

> 
> Signed-off-by: Shaohua Li <shli@fb.com>
> ---
>  drivers/md/raid5.c | 6 ++++++
>  1 file changed, 6 insertions(+)
> 
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 59f9312..b694d06 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -7184,6 +7184,10 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
>  	 * worth it.
>  	 */
>  	sector_t newsize;
> +	struct r5conf *conf = mddev->private;
> +
> +	if (conf->log)
> +		return -EINVAL;
>  	sectors &= ~((sector_t)mddev->chunk_sectors - 1);
>  	newsize = raid5_size(mddev, sectors, mddev->raid_disks);
>  	if (mddev->external_size &&
> @@ -7235,6 +7239,8 @@ static int check_reshape(struct mddev *mddev)
>  {
>  	struct r5conf *conf = mddev->private;
>  
> +	if (conf->log)
> +		return -EINVAL;
>  	if (mddev->delta_disks == 0 &&
>  	    mddev->new_layout == mddev->layout &&
>  	    mddev->new_chunk_sectors == mddev->chunk_sectors)


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 9/9] raid5: skip resync if cache(log) is enabled
  2015-07-30  0:38 ` [PATCH 9/9] raid5: skip resync if cache(log) is enabled Shaohua Li
@ 2015-08-05  4:16   ` NeilBrown
  0 siblings, 0 replies; 30+ messages in thread
From: NeilBrown @ 2015-08-05  4:16 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, Kernel-team, songliubraving, hch, dan.j.williams

On Wed, 29 Jul 2015 17:38:49 -0700 Shaohua Li <shli@fb.com> wrote:

> If cache(log) is enabled, the log structure will guarantee data
> consistency, so skip resync for unclean shutdown
> 
> Signed-off-by: Shaohua Li <shli@fb.com>
> ---
>  drivers/md/raid5.c | 9 +++++++++
>  1 file changed, 9 insertions(+)
> 
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 26ea100..330550a 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -6967,6 +6967,15 @@ static int run(struct mddev *mddev)
>  		}
>  	}
>  
> +	if (conf->log) {
> +		if (mddev->recovery_cp == 0) {
> +			printk(KERN_NOTICE
> +				"md/raid:%s: skip resync with caching enabled\n",
> +				mdname(mddev));
> +			mddev->recovery_cp = MaxSector;
> +		}
> +	}
> +
>  	return 0;
>  abort:
>  	md_unregister_thread(&mddev->thread);

I don't think this is correct.  When a RAID6 is started, recovery_cp
will be zero and we really do want a resync to happen even if there is
a log.

Rather, I think that recovery_cp should not be set to zero when a log
is being used.  So keep the array appearing to be "clean".
If recovery_cp ever does get set to zero (mdadm --assemble
--update=resync can do this), then do the resync.

Thanks,
NeilBrown

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 3/9] raid5: add basic stripe log
  2015-08-05  3:07   ` NeilBrown
@ 2015-08-05 21:19     ` Shaohua Li
  2015-08-12  3:20       ` NeilBrown
  0 siblings, 1 reply; 30+ messages in thread
From: Shaohua Li @ 2015-08-05 21:19 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid, Kernel-team, songliubraving, hch, dan.j.williams

On Wed, Aug 05, 2015 at 01:07:36PM +1000, NeilBrown wrote:
> On Wed, 29 Jul 2015 17:38:43 -0700 Shaohua Li <shli@fb.com> wrote:
> 
> > This introduces a simple log for raid5. Data/parity writting to raid
> > array first writes to the log, then write to raid array disks. If crash
> > happens, we can recovery data from the log. This can speed up raid
> > resync and fix write hole issue.
> > 
> > The log structure is pretty simple. Data/meta data is stored in block
> > unit, which is 4k generally. It has only one type of meta data block.
> > The meta data block can track 3 types of data, stripe data, stripe
> > parity and flush block. MD superblock will point to the last valid meta
> > data block. Each meta data block has checksum/seq number, so recovery
> > can scan the log correctly. We store a checksum of stripe data/parity to
> > the metadata block, so meta data and stripe data/parity can be written
> > to log disk together. otherwise, meta data write must wait till stripe
> > data/parity is finished.
> > 
> > For stripe data, meta data block will record stripe data sector and
> > size. Currently the size is always 4k. This meta data record can be made
> > simpler if we just fix write hole (eg, we can record data of a stripe's
> > different disks together), but this format can be extended to support
> > caching in the future, which must record data address/size.
> > 
> > For stripe parity, meta data block will record stripe sector. It's size
> > should be 4k (for raid5) or 8k (for raid6). We always store p parity
> > first. This format should work for caching too.
> 
> It feels a bit odd have a 8K parity block for RAID6 as it is really two
> blocks: a parity block and a Q-syndrome block.  What would you think of
> introducing another block type for Q?  So there are Data blocks, Parity
> blocks, and Q blocks ???
> 
> Not a big issue, but I thought I would mention it.

I'd prefer not adding the complexity, it's just a naming.

> > 
> > flush block indicates a stripe is in raid array disks. Fixing write hole
> > doesn't need this type of meta data, it's for caching extention.
> > 
> > We should be careful about deadlock. Appending stripe data/parity to log
> > is done in raid5d. The append need log space, which can trigger log
> > reclaim, which might wait for raid5d to run (to flush data from log to
> > raid array disks). Here we always do the log write in a separate thread.
> 
> I'm not convinced about the need for a separate thread.  As
> raid5d/handle_stripe works as a state machine, and as all IO is async,
> we should at most need an extra state, not an extra thread.
> 
> I think a key issue is that you don't call r5l_get_reserve() until you
> are about to submit writes to the log.
> If instead you reserved the space in r5l_write_stripe and delay the
> stripe if space is not available, there there would be no deadlock.
> Then when available space crosses some threshold, re-activate those
> delayed stripes.

ok, that way works too. As I said before, I really hate making the
log/cache stuff tie tightly together with stripe cache state machine if
possible (eg, adding new state/list/stripe analysis etc). I'd rather to
keep current logic if you don't strongly object.

> > +#include <linux/kernel.h>
> > +#include <linux/wait.h>
> > +#include <linux/blkdev.h>
> > +#include <linux/slab.h>
> > +#include <linux/raid/md_p.h>
> > +#include <linux/crc32.h>
> > +#include <linux/random.h>
> > +#include "md.h"
> > +#include "raid5.h"
> > +
> > +typedef u64 r5blk_t; /* log blocks, 512B - 4k */
> 
> I would much rather use sector_t throughout and keep all addresses as
> sector addresses.  Much less room for confusion that way.
> 
> If we wanted to pack lots of addresses into limited space then using
> block addresses might be justified (filesystems do that), but I don't
> think it is called for here at all.

The original idea is metadata block size could be changed between 512B
to 4k. Sometimes the metadata block can't have enough data if we don't
want to delay IO. When the block size isn't a sector, the type helps me
avoid coding error (eg, remind it's a block instead of a sector). Do you
prefer metadata block size one sector?

> > +static void r5l_put_reserve(struct r5l_log *log, unsigned int reserved_blocks)
> > +{
> > +	BUG_ON(!mutex_is_locked(&log->io_mutex));
> > +
> > +	log->reserved_blocks -= reserved_blocks;
> > +	if (r5l_free_space(log) > 0)
> > +		wake_up(&log->space_waitq);
> > +}
> > +
> > +static struct r5l_io_unit *r5l_alloc_io_unit(struct r5l_log *log)
> > +{
> > +	struct r5l_io_unit *io;
> > +	gfp_t gfp = GFP_NOIO | __GFP_NOFAIL;
> > +
> > +	io = kmem_cache_zalloc(log->io_kc, gfp);
> > +	if (!io)
> > +		return NULL;
> > +	io->log = log;
> > +	io->meta_page = alloc_page(gfp | __GFP_ZERO);
> > +	if (!io->meta_page) {
> > +		kmem_cache_free(log->io_kc, io);
> > +		return NULL;
> > +	}
> 
> This can return NULL, but the one place where you call it you do not
> check for NULL.
> Maybe a mempool would be appropriate, maybe something else - I haven't
> examine the issue closely.

I use mempool for the cache patch, but eventually choose to use NOFAIL
allocation here, because don't know what the minimal mempool element
size should be. incorrect mempool element size could introduce trouble,
eg, the allocation expects we free an element, which might not possible
without increasing complexity in reclaim. Maybe I should just ignore the
NULL, it's a NOFAIL allocation. The log code isn't ready with allocation
failure.
 
> > +static int r5l_recovery_log(struct r5l_log *log)
> > +{
> > +	/* fake recovery */
> > +	log->seq = log->last_cp_seq + 1;
> > +	log->log_start = r5l_ring_add(log, log->last_checkpoint, 1);
> > +	return 0;
> > +}
> > +
> > +static void r5l_write_super(struct r5l_log *log, sector_t cp)
> > +{
> > +	log->rdev->recovery_offset = cp;
> > +	md_update_sb(log->mddev, 1);
> > +}
> 
> This is only called from run() when the log is first initialised.  At
> this point there is nothing useful in the log, so recording it's
> location is pointless.  At most you  could set the MD_SB_DIRTY flag (or
> whatever it is).
> So it really doesn't need to be a separate function.

We must write super here. If we start append data to the log and super
doesn't point to correct postion (reclaim might not run once yet),
recovery doesn't know where to find the log.

> > +void r5l_exit_log(struct r5l_log *log)
> > +{
> > +	md_unregister_thread(&log->log_thread);
> > +
> > +	kmem_cache_destroy(log->io_kc);
> > +	kfree(log);
> > +}
> > diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> > index 59e44e9..9608a44 100644
> > --- a/drivers/md/raid5.c
> > +++ b/drivers/md/raid5.c
> > @@ -899,6 +899,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
> >  
> >  	might_sleep();
> >  
> > +	if (!r5l_write_stripe(conf->log, sh))
> > +		return;
> 
> If no log is configured, r5l_write_stripe will return -EAGAIN, and so
> ops_run_io will never submit any IO....

I think you read it wrong, we don't return in that case.

> > diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
> > index 02c3bf8..a8daf39 100644
> > --- a/drivers/md/raid5.h
> > +++ b/drivers/md/raid5.h
> > @@ -223,6 +223,9 @@ struct stripe_head {
> >  	struct stripe_head	*batch_head; /* protected by stripe lock */
> >  	spinlock_t		batch_lock; /* only header's lock is useful */
> >  	struct list_head	batch_list; /* protected by head's batch lock*/
> > +
> > +	struct r5l_io_unit	*log_io;
> > +	struct list_head	log_list;
> >  	/**
> >  	 * struct stripe_operations
> >  	 * @target - STRIPE_OP_COMPUTE_BLK target
> 
> I wonder if we really need yet another 'list_head' in 'stripe_head'.
> I guess one more is no great cost.

I'm pretty sure using the lru list of stripe_head is broken, that's why
I added the new list. 

I'll fix other issues or add more comments.

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 4/9] raid5: log reclaim support
  2015-08-05  3:43   ` NeilBrown
@ 2015-08-05 21:34     ` Shaohua Li
  2015-08-12  3:50       ` NeilBrown
  0 siblings, 1 reply; 30+ messages in thread
From: Shaohua Li @ 2015-08-05 21:34 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid, Kernel-team, songliubraving, hch, dan.j.williams

On Wed, Aug 05, 2015 at 01:43:30PM +1000, NeilBrown wrote:
> On Wed, 29 Jul 2015 17:38:44 -0700 Shaohua Li <shli@fb.com> wrote:
> 
> > This is the reclaim support for raid5 log. A stripe write will have
> > following steps:
> > 
> > 1. reconstruct the stripe, read data/calculate parity. ops_run_io
> > prepares to write data/parity to raid disks
> > 2. hijack ops_run_io. stripe data/parity is appending to log disk
> > 3. flush log disk cache
> > 4. ops_run_io run again and do normal operation. stripe data/parity is
> > written in raid array disks. raid core can return io to upper layer.
> > 5. flush cache of all raid array disks
> > 6. update super block
> > 7. log disk space used by the stripe can be reused
> > 
> > In practice, several stripes consist of an io_unit and we will batch
> > several io_unit in different steps, but the whole process doesn't
> > change.
> > 
> > It's possible io return just after data/parity hit log disk, but then
> > read IO will need read from log disk. For simplicity, IO return happens
> > at step 4, where read IO can directly read from raid disks.
> > 
> > Currently reclaim run every minute or out of space. Reclaim is just to
> > free log disk spaces, it doesn't impact data consistency.
> 
> Having arbitrary times lines "every minute" is a warning sign.
> "As soon as possible" and "Just it time" can both make sense easily.
> "every minute" needs more justification.
> 
> I'll probably say more when I find the code.

The idea is if we reclaim periodically, recovery could scan less log
space. It's insane recovery scans a 1T disk. As I said this is just to
free disk spaces. It's not a signal we will lose data in minute
interval. I can change the relaim to run every 1G reclaimable space for
example.

> > +	r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios,
> > +			IO_UNIT_IO_END);
> > +	r5l_move_io_unit_list(&log->io_end_ios, &log->stripe_end_ios,
> > +			IO_UNIT_STRIPE_END);
> > +	r5l_compress_stripe_end_list(log);
> > +	run_stripe = !list_empty(&log->io_end_ios);
> > +	spin_unlock(&log->io_list_lock);
> > +
> > +	if (!run_stripe)
> > +		return;
> > +
> > +	blkdev_issue_flush(r5l_bdev(log), GFP_NOIO, NULL);
> > +
> > +	spin_lock(&log->io_list_lock);
> > +	list_for_each_entry(io, &log->io_end_ios, log_sibling) {
> > +		if (io->state >= IO_UNIT_STRIPE_START)
> > +			continue;
> > +		r5l_set_io_unit_state(io, IO_UNIT_STRIPE_START);
> > +
> > +		while (!list_empty(&io->stripe_list)) {
> > +			sh = list_first_entry(&io->stripe_list,
> > +				struct stripe_head, log_list);
> > +			list_del_init(&sh->log_list);
> > +			set_bit(STRIPE_HANDLE, &sh->state);
> > +			release_stripe(sh);
> 
> This code makes me a bit nervous.  handle_stripe() can potentially be
> called on any stripe at any time.
> Here you are scheduling a call the handle_stripe() without obviously
> changing the state of the stripe.  So whatever is going to happen now
> could potentially have happened before... is that safe?

I'm not fully sure, but it's ok in my test.
 
> > +	 * move proper io_unit to reclaim list. We should not change the order.
> > +	 * reclaimable/unreclaimable io_unit can be mixed in the list, we
> > +	 * shouldn't reuse space of an unreclaimable io_unit
> > +	 * */
> > +	while (1) {
> > +		r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios,
> > +			IO_UNIT_IO_END);
> > +		r5l_move_io_unit_list(&log->io_end_ios, &log->stripe_end_ios,
> > +				IO_UNIT_STRIPE_END);
> > +		while (!list_empty(&log->stripe_end_ios)) {
> > +			io = list_first_entry(&log->stripe_end_ios,
> > +				struct r5l_io_unit, log_sibling);
> > +			list_move_tail(&io->log_sibling, &list);
> > +			free += (io->log_end - io->log_start +
> > +				log->total_blocks) % log->total_blocks;
> > +		}
> > +
> > +		if (free >= reclaim_target || (list_empty(&log->running_ios) &&
> > +		    list_empty(&log->io_end_ios) &&
> > +		    list_empty(&log->stripe_end_ios)))
> > +			break;
> > +
> > +		if (!list_empty(&log->io_end_ios)) {
> > +			io = list_first_entry(&log->io_end_ios,
> > +				struct r5l_io_unit, log_sibling);
> > +			spin_unlock(&log->io_list_lock);
> > +			/* nobody else can delete the io, we are safe */
> > +			r5l_kick_io_unit(log, io);
> > +			spin_lock(&log->io_list_lock);
> > +			continue;
> > +		}
> > +
> > +		if (!list_empty(&log->running_ios)) {
> > +			io = list_first_entry(&log->running_ios,
> > +				struct r5l_io_unit, log_sibling);
> > +			spin_unlock(&log->io_list_lock);
> > +			/* nobody else can delete the io, we are safe */
> > +			r5l_kick_io_unit(log, io);
> > +			spin_lock(&log->io_list_lock);
> > +			continue;
> > +		}
> > +	}
> > +	spin_unlock(&log->io_list_lock);
> 
> Well, here we are with the important parts of the reclaim code...
> 
> The main result of the above section is to possibly call
> r5l_flush_stripe_to_raid() a few times, and to wait until 'list'
> contains enough io_units to satisfy the requirement.
> 
> As raid5d already calls r5l_flush_stripe_to_raid - which it really must
> to make sure that writes complete quickly - this really comes down to
> some book keeping and some waiting.
> Book keeping can be done as changes happen, and waiting is best not
> done at all.
> 
> To be more specific: when an io_unit transitions to IO_UNIT_STRIPE_END
> it can immediately be removed from the list and if it was the first
> io_unit on the list, then the log_start can immediately be updated.

Ok, the original idea is to avoid holding the io_list_lock for every IO
end/stripe end. Maybe over-designed, I'll fix this.

> > +
> > +	if (list_empty(&list))
> > +		return;
> > +
> > +	r5l_flush_all_disks(log);
> > +
> > +	/* super always point to last valid meta */
> > +	last = list_last_entry(&list, struct r5l_io_unit, log_sibling);
> > +	r5l_write_super(log, r5l_block_to_sector(log, last->log_start));
> 
> This bit flushes all the disks and then updates the metadata and writes
> it.
> As md_super_write already uses WRITE_FLUSH_FUA I don't think the extra
> flush is needed.
great.

> I really think you should just update ->recovery_offset and set
> MD_CHANGE_DEVS (or similar) and let the update happen.

I think we should write super here. The reclaimed space might be reused
immediately, we don't want to confuse recovery.

> > +
> > +	mutex_lock(&log->io_mutex);
> > +	log->last_checkpoint = last->log_start;
> > +	log->last_cp_seq = last->seq;
> > +	mutex_unlock(&log->io_mutex);
> > +	wake_up(&log->space_waitq);
> > +
> > +	while (!list_empty(&list)) {
> > +		io = list_first_entry(&list, struct r5l_io_unit, log_sibling);
> > +		list_del(&io->log_sibling);
> > +		r5l_free_io_unit(log, io);
> > +	}
> > +}
> 
> So I really think all of this can be done as-it-happens (the
> book-keeping) or asynchronously.  There is no need to push something
> every minute.

Because we need flush raid disks cache, to avoid the overhead, we do
batch operation. Once I changed the reclaim to run every specific
reclaimable space, this should be ok.

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 5/9] raid5: log recovery
  2015-08-05  4:05   ` NeilBrown
@ 2015-08-05 21:39     ` Shaohua Li
  2015-08-12  3:51       ` NeilBrown
  0 siblings, 1 reply; 30+ messages in thread
From: Shaohua Li @ 2015-08-05 21:39 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid, Kernel-team, songliubraving, hch, dan.j.williams

On Wed, Aug 05, 2015 at 02:05:25PM +1000, NeilBrown wrote:
> On Wed, 29 Jul 2015 17:38:45 -0700 Shaohua Li <shli@fb.com> wrote:
> 
> > This is the log recovery support. The process is quite straightforward.
> > We scan the log and read all valid meta/data/parity into memory. If a
> > stripe's data/parity checksum is correct, the stripe will be recoveried.
> > Otherwise, it's discarded and we don't scan the log further. The reclaim
> > process guarantees stripe which starts to be flushed raid disks has
> > completed data/parity and has correct checksum. To recovery a stripe, we
> > just copy its data/parity to corresponding raid disks.
> > 
> > The trick thing is superblock update after recovery. we can't let
> > superblock point to last valid meta block. The log might look like:
> > | meta 1| meta 2| meta 3|
> > meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If superblock
> > points to meta 1, we write a new valid meta 2n.  If crash happens again,
> > new recovery will start from meta 1. Since meta 2n is valid, recovery
> > will think meta 3 is valid, which is wrong.  The solution is we create a
> > new meta in meta2 with its seq == meta 1's seq + 2 and let superblock
> > points to meta2.  recovery will not think meta 3 is a valid meta,
> > because its seq is wrong
> 
> I like the idea of using a slightly larger 'seq' to avoid collisions -
> except that I would probably feel safer with a much larger seq. May add
> 1024 or something (at least 10).

ok 
> > 
> > TODO:
> > -recovery should run the stripe cache state machine in case of disk
> > breakage.
> 
> Why?
> 
> when you write to the log, you write all of the blocks that need
> updating, whether they are destined for a failed device or not.
> 
> When you recover, you then have all the blocks that you might want to
> write.  So write all the ones for which you have working devices, and
> ignore the rest.
> 
> Did I miss something?
> 
> Not that I object, but if it works....

I mean the case of disk is broken. For example, log has a stripe with
data for disk 1, 2, 4. In recovery, disk 2 is broken. Just write 1, 4
isn't good. If we run the state machine, we can read disk 3 and have an
eventually consistent stripe.

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 7/9] raid5: don't allow resize/reshape with cache(log) support
  2015-08-05  4:13   ` NeilBrown
@ 2015-08-05 21:42     ` Shaohua Li
  2015-08-12  3:57       ` NeilBrown
  0 siblings, 1 reply; 30+ messages in thread
From: Shaohua Li @ 2015-08-05 21:42 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid, Kernel-team, songliubraving, hch, dan.j.williams

On Wed, Aug 05, 2015 at 02:13:51PM +1000, NeilBrown wrote:
> On Wed, 29 Jul 2015 17:38:47 -0700 Shaohua Li <shli@fb.com> wrote:
> 
> > If cache(log) support is enabled, don't allow resize/reshape in current
> > stage. In the future, we can flush all data from cache(log) to raid
> > before resize/reshape and then allow resize/reshape.
> 
> Just to be on the safe side, you could probably add code to refuse to
> start an array that is in the middle of a reshape and also have a log
> configured.
ok
 
> I think it makes sense to plan ahead a little and make sure we can
> handle a cache on a reshaping array properly.
> 
> If the log metadata block includes a before/after flag for each stripe,
> which recorded whether the stripe was "before" or "after"
> reshape_position when it was written, then when recovering the log we
> can check if the given addresses are still on that side.  If they are,
> just recover using the appropriate geometry info from the superblock.
> If not, then reshape has passed over that stripe and it must now be
> fully up-to-date on the RAID so the data in the log can be discarded.
> 
> There may be some details I missed, but I think it is worth thinking
> through properly.  I don't expect the code to handle this straight
> away, but we need a clear plan to be sure there is sufficient
> information stored in the log.

Just adding a flag is enough? Sounds there is no way to avoid the write
hole issue if the array is reshapping. The data stored in log is valid,
but if a reshape runs, we can't guarantee the parity is valid.

Thanks,
Shaohua

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 3/9] raid5: add basic stripe log
  2015-08-05 21:19     ` Shaohua Li
@ 2015-08-12  3:20       ` NeilBrown
  0 siblings, 0 replies; 30+ messages in thread
From: NeilBrown @ 2015-08-12  3:20 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, Kernel-team, songliubraving, hch, dan.j.williams

On Wed, 5 Aug 2015 14:19:20 -0700 Shaohua Li <shli@fb.com> wrote:

> On Wed, Aug 05, 2015 at 01:07:36PM +1000, NeilBrown wrote:
> > On Wed, 29 Jul 2015 17:38:43 -0700 Shaohua Li <shli@fb.com> wrote:
> > 
> > > This introduces a simple log for raid5. Data/parity writting to raid
> > > array first writes to the log, then write to raid array disks. If crash
> > > happens, we can recovery data from the log. This can speed up raid
> > > resync and fix write hole issue.
> > > 
> > > The log structure is pretty simple. Data/meta data is stored in block
> > > unit, which is 4k generally. It has only one type of meta data block.
> > > The meta data block can track 3 types of data, stripe data, stripe
> > > parity and flush block. MD superblock will point to the last valid meta
> > > data block. Each meta data block has checksum/seq number, so recovery
> > > can scan the log correctly. We store a checksum of stripe data/parity to
> > > the metadata block, so meta data and stripe data/parity can be written
> > > to log disk together. otherwise, meta data write must wait till stripe
> > > data/parity is finished.
> > > 
> > > For stripe data, meta data block will record stripe data sector and
> > > size. Currently the size is always 4k. This meta data record can be made
> > > simpler if we just fix write hole (eg, we can record data of a stripe's
> > > different disks together), but this format can be extended to support
> > > caching in the future, which must record data address/size.
> > > 
> > > For stripe parity, meta data block will record stripe sector. It's size
> > > should be 4k (for raid5) or 8k (for raid6). We always store p parity
> > > first. This format should work for caching too.
> > 
> > It feels a bit odd have a 8K parity block for RAID6 as it is really two
> > blocks: a parity block and a Q-syndrome block.  What would you think of
> > introducing another block type for Q?  So there are Data blocks, Parity
> > blocks, and Q blocks ???
> > 
> > Not a big issue, but I thought I would mention it.
> 
> I'd prefer not adding the complexity, it's just a naming.

It's not really "just naming" - it affects the format of the metadata
block.
And I think your approach adds complexity.  It means the parity blocks
can be a different size to the data blocks, and it makes
r5l_log_pages() more complex than needed.



> 
> > > 
> > > flush block indicates a stripe is in raid array disks. Fixing write hole
> > > doesn't need this type of meta data, it's for caching extention.
> > > 
> > > We should be careful about deadlock. Appending stripe data/parity to log
> > > is done in raid5d. The append need log space, which can trigger log
> > > reclaim, which might wait for raid5d to run (to flush data from log to
> > > raid array disks). Here we always do the log write in a separate thread.
> > 
> > I'm not convinced about the need for a separate thread.  As
> > raid5d/handle_stripe works as a state machine, and as all IO is async,
> > we should at most need an extra state, not an extra thread.
> > 
> > I think a key issue is that you don't call r5l_get_reserve() until you
> > are about to submit writes to the log.
> > If instead you reserved the space in r5l_write_stripe and delay the
> > stripe if space is not available, there there would be no deadlock.
> > Then when available space crosses some threshold, re-activate those
> > delayed stripes.
> 
> ok, that way works too. As I said before, I really hate making the
> log/cache stuff tie tightly together with stripe cache state machine if
> possible (eg, adding new state/list/stripe analysis etc). I'd rather to
> keep current logic if you don't strongly object.

I think I do strongly object.

The stripe cache and state machine are the heart-and-soul of md/raid5.
If everything goes through the stripe cache then there are fewer
special cases to think about.

Having to create a separate thread to avoid a deadlock is a fairly
clear indicator that the design isn't very clean.


> 
> > > +#include <linux/kernel.h>
> > > +#include <linux/wait.h>
> > > +#include <linux/blkdev.h>
> > > +#include <linux/slab.h>
> > > +#include <linux/raid/md_p.h>
> > > +#include <linux/crc32.h>
> > > +#include <linux/random.h>
> > > +#include "md.h"
> > > +#include "raid5.h"
> > > +
> > > +typedef u64 r5blk_t; /* log blocks, 512B - 4k */
> > 
> > I would much rather use sector_t throughout and keep all addresses as
> > sector addresses.  Much less room for confusion that way.
> > 
> > If we wanted to pack lots of addresses into limited space then using
> > block addresses might be justified (filesystems do that), but I don't
> > think it is called for here at all.
> 
> The original idea is metadata block size could be changed between 512B
> to 4k. Sometimes the metadata block can't have enough data if we don't
> want to delay IO. When the block size isn't a sector, the type helps me
> avoid coding error (eg, remind it's a block instead of a sector). Do you
> prefer metadata block size one sector?

No, I don't prefer one-sector metadata blocks.  Given that there is a
growing amount of hardware that works best with 4K IO, I think a 4K
metadata block size is a good choice.

I certainly agree that having a separate type to store block numbers
makes sense as it could help avoid coding errors - except that I don't
think we should be storing block numbers at all.  EVery.

Just use sector numbers everywhere.

> 
> > > +static void r5l_put_reserve(struct r5l_log *log, unsigned int reserved_blocks)
> > > +{
> > > +	BUG_ON(!mutex_is_locked(&log->io_mutex));
> > > +
> > > +	log->reserved_blocks -= reserved_blocks;
> > > +	if (r5l_free_space(log) > 0)
> > > +		wake_up(&log->space_waitq);
> > > +}
> > > +
> > > +static struct r5l_io_unit *r5l_alloc_io_unit(struct r5l_log *log)
> > > +{
> > > +	struct r5l_io_unit *io;
> > > +	gfp_t gfp = GFP_NOIO | __GFP_NOFAIL;
> > > +
> > > +	io = kmem_cache_zalloc(log->io_kc, gfp);
> > > +	if (!io)
> > > +		return NULL;
> > > +	io->log = log;
> > > +	io->meta_page = alloc_page(gfp | __GFP_ZERO);
> > > +	if (!io->meta_page) {
> > > +		kmem_cache_free(log->io_kc, io);
> > > +		return NULL;
> > > +	}
> > 
> > This can return NULL, but the one place where you call it you do not
> > check for NULL.
> > Maybe a mempool would be appropriate, maybe something else - I haven't
> > examine the issue closely.
> 
> I use mempool for the cache patch, but eventually choose to use NOFAIL
> allocation here, because don't know what the minimal mempool element
> size should be. incorrect mempool element size could introduce trouble,
> eg, the allocation expects we free an element, which might not possible
> without increasing complexity in reclaim. Maybe I should just ignore the
> NULL, it's a NOFAIL allocation. The log code isn't ready with allocation
> failure.

I hadn't noticed the __GFP_NOFAIL, only the "return NULL".
But I don't really like __GFP_NOFAIL, and there seem to be people in
the mm community who don't either, so I'd rather not rely on it.

2 entries in a mempool should be plenty.  The freeing of old entries
through reclaim should be completely independent of the allocation of
new entries, so that latter can safely wait for the former.


>  
> > > +static int r5l_recovery_log(struct r5l_log *log)
> > > +{
> > > +	/* fake recovery */
> > > +	log->seq = log->last_cp_seq + 1;
> > > +	log->log_start = r5l_ring_add(log, log->last_checkpoint, 1);
> > > +	return 0;
> > > +}
> > > +
> > > +static void r5l_write_super(struct r5l_log *log, sector_t cp)
> > > +{
> > > +	log->rdev->recovery_offset = cp;
> > > +	md_update_sb(log->mddev, 1);
> > > +}
> > 
> > This is only called from run() when the log is first initialised.  At
> > this point there is nothing useful in the log, so recording it's
> > location is pointless.  At most you  could set the MD_SB_DIRTY flag (or
> > whatever it is).
> > So it really doesn't need to be a separate function.
> 
> We must write super here. If we start append data to the log and super
> doesn't point to correct postion (reclaim might not run once yet),
> recovery doesn't know where to find the log.

Yes, we do need to store the address of the log in the superblock
before the first write.

We already have code to ensure the superblock is written on the first
write: md_write_start.
If mddev->in_sync is true when the array starts, then we are already
certain that the metadata will be updated before any write is sent.
Can you just make use of that?



> 
> > > +void r5l_exit_log(struct r5l_log *log)
> > > +{
> > > +	md_unregister_thread(&log->log_thread);
> > > +
> > > +	kmem_cache_destroy(log->io_kc);
> > > +	kfree(log);
> > > +}
> > > diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> > > index 59e44e9..9608a44 100644
> > > --- a/drivers/md/raid5.c
> > > +++ b/drivers/md/raid5.c
> > > @@ -899,6 +899,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
> > >  
> > >  	might_sleep();
> > >  
> > > +	if (!r5l_write_stripe(conf->log, sh))
> > > +		return;
> > 
> > If no log is configured, r5l_write_stripe will return -EAGAIN, and so
> > ops_run_io will never submit any IO....
> 
> I think you read it wrong, we don't return in that case.

You are correct - sorry.
If often get confused when "!" is used like that.  It looks like it
says "If not r5l_write_stripe" - i.e. if we didn't write to the log.
I personally much prefer
   if (r5l_write_stripe(..) == 0)

just like I hate "if (!strcmp(x,y))" and prefer "if (strcmp(x,y) == 0)".
But maybe that it just me.



> 
> > > diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
> > > index 02c3bf8..a8daf39 100644
> > > --- a/drivers/md/raid5.h
> > > +++ b/drivers/md/raid5.h
> > > @@ -223,6 +223,9 @@ struct stripe_head {
> > >  	struct stripe_head	*batch_head; /* protected by stripe lock */
> > >  	spinlock_t		batch_lock; /* only header's lock is useful */
> > >  	struct list_head	batch_list; /* protected by head's batch lock*/
> > > +
> > > +	struct r5l_io_unit	*log_io;
> > > +	struct list_head	log_list;
> > >  	/**
> > >  	 * struct stripe_operations
> > >  	 * @target - STRIPE_OP_COMPUTE_BLK target
> > 
> > I wonder if we really need yet another 'list_head' in 'stripe_head'.
> > I guess one more is no great cost.
> 
> I'm pretty sure using the lru list of stripe_head is broken, that's why
> I added the new list. 

You're probably right.

Thanks,
NeilBrown


> 
> I'll fix other issues or add more comments.
> 
> Thanks,
> Shaohua


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 4/9] raid5: log reclaim support
  2015-08-05 21:34     ` Shaohua Li
@ 2015-08-12  3:50       ` NeilBrown
  0 siblings, 0 replies; 30+ messages in thread
From: NeilBrown @ 2015-08-12  3:50 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, Kernel-team, songliubraving, hch, dan.j.williams

On Wed, 5 Aug 2015 14:34:21 -0700 Shaohua Li <shli@fb.com> wrote:

> On Wed, Aug 05, 2015 at 01:43:30PM +1000, NeilBrown wrote:
> > On Wed, 29 Jul 2015 17:38:44 -0700 Shaohua Li <shli@fb.com> wrote:
> > 
> > > This is the reclaim support for raid5 log. A stripe write will have
> > > following steps:
> > > 
> > > 1. reconstruct the stripe, read data/calculate parity. ops_run_io
> > > prepares to write data/parity to raid disks
> > > 2. hijack ops_run_io. stripe data/parity is appending to log disk
> > > 3. flush log disk cache
> > > 4. ops_run_io run again and do normal operation. stripe data/parity is
> > > written in raid array disks. raid core can return io to upper layer.
> > > 5. flush cache of all raid array disks
> > > 6. update super block
> > > 7. log disk space used by the stripe can be reused
> > > 
> > > In practice, several stripes consist of an io_unit and we will batch
> > > several io_unit in different steps, but the whole process doesn't
> > > change.
> > > 
> > > It's possible io return just after data/parity hit log disk, but then
> > > read IO will need read from log disk. For simplicity, IO return happens
> > > at step 4, where read IO can directly read from raid disks.
> > > 
> > > Currently reclaim run every minute or out of space. Reclaim is just to
> > > free log disk spaces, it doesn't impact data consistency.
> > 
> > Having arbitrary times lines "every minute" is a warning sign.
> > "As soon as possible" and "Just it time" can both make sense easily.
> > "every minute" needs more justification.
> > 
> > I'll probably say more when I find the code.
> 
> The idea is if we reclaim periodically, recovery could scan less log
> space. It's insane recovery scans a 1T disk. As I said this is just to
> free disk spaces. It's not a signal we will lose data in minute
> interval. I can change the relaim to run every 1G reclaimable space for
> example.

There seem to be two issues here and I might be confusing them.

Firstly there is the question of when a stripe gets written back to the
array.  Once the data is safe in the log this doesn't have to happen in
any great hurry, but I suspect it should still happen sooner rather
than later.

Presumably as soon as data/parity of a stripe is safe in the log,
that stripe will be scheduled to be written to the array - is that
correct?

As these writes-to-the-array complete the counter in the io_unit will
decrease.  when it reaches zero the io_unit can be freed and the
recovery_offset in the superblock can, potentially be updated.

Secondly there is the question of how often the superblock is updated.
As you say; delaying the updates indefinitely could lead to a recovery
having to examine a very large part of the log - maybe more than
necessary (though if that might be a problem, the simple solution is to
use a smaller log).

I would probably feel most comfortable scheduling a superblock update
whenever the amount of log space that it would reclaim exceeds 1/4 of
the log size.  That should be often enough without imposing a
completely arbitrary number.

Make sense?

thanks,
NeilBrown

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 5/9] raid5: log recovery
  2015-08-05 21:39     ` Shaohua Li
@ 2015-08-12  3:51       ` NeilBrown
  0 siblings, 0 replies; 30+ messages in thread
From: NeilBrown @ 2015-08-12  3:51 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, Kernel-team, songliubraving, hch, dan.j.williams

On Wed, 5 Aug 2015 14:39:09 -0700 Shaohua Li <shli@fb.com> wrote:

> On Wed, Aug 05, 2015 at 02:05:25PM +1000, NeilBrown wrote:
> > On Wed, 29 Jul 2015 17:38:45 -0700 Shaohua Li <shli@fb.com> wrote:
> > 
> > > This is the log recovery support. The process is quite straightforward.
> > > We scan the log and read all valid meta/data/parity into memory. If a
> > > stripe's data/parity checksum is correct, the stripe will be recoveried.
> > > Otherwise, it's discarded and we don't scan the log further. The reclaim
> > > process guarantees stripe which starts to be flushed raid disks has
> > > completed data/parity and has correct checksum. To recovery a stripe, we
> > > just copy its data/parity to corresponding raid disks.
> > > 
> > > The trick thing is superblock update after recovery. we can't let
> > > superblock point to last valid meta block. The log might look like:
> > > | meta 1| meta 2| meta 3|
> > > meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If superblock
> > > points to meta 1, we write a new valid meta 2n.  If crash happens again,
> > > new recovery will start from meta 1. Since meta 2n is valid, recovery
> > > will think meta 3 is valid, which is wrong.  The solution is we create a
> > > new meta in meta2 with its seq == meta 1's seq + 2 and let superblock
> > > points to meta2.  recovery will not think meta 3 is a valid meta,
> > > because its seq is wrong
> > 
> > I like the idea of using a slightly larger 'seq' to avoid collisions -
> > except that I would probably feel safer with a much larger seq. May add
> > 1024 or something (at least 10).
> 
> ok 
> > > 
> > > TODO:
> > > -recovery should run the stripe cache state machine in case of disk
> > > breakage.
> > 
> > Why?
> > 
> > when you write to the log, you write all of the blocks that need
> > updating, whether they are destined for a failed device or not.
> > 
> > When you recover, you then have all the blocks that you might want to
> > write.  So write all the ones for which you have working devices, and
> > ignore the rest.
> > 
> > Did I miss something?
> > 
> > Not that I object, but if it works....
> 
> I mean the case of disk is broken. For example, log has a stripe with
> data for disk 1, 2, 4. In recovery, disk 2 is broken. Just write 1, 4
> isn't good. If we run the state machine, we can read disk 3 and have an
> eventually consistent stripe.

But the log will have date for disk 1, 2, 4, and P and Q.
So if disk 2 is broken, we just write 1, 4, P, and Q and the data is
safe.

NeilBrown


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 7/9] raid5: don't allow resize/reshape with cache(log) support
  2015-08-05 21:42     ` Shaohua Li
@ 2015-08-12  3:57       ` NeilBrown
  0 siblings, 0 replies; 30+ messages in thread
From: NeilBrown @ 2015-08-12  3:57 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, Kernel-team, songliubraving, hch, dan.j.williams

On Wed, 5 Aug 2015 14:42:21 -0700 Shaohua Li <shli@fb.com> wrote:

> On Wed, Aug 05, 2015 at 02:13:51PM +1000, NeilBrown wrote:
> > On Wed, 29 Jul 2015 17:38:47 -0700 Shaohua Li <shli@fb.com> wrote:
> > 
> > > If cache(log) support is enabled, don't allow resize/reshape in current
> > > stage. In the future, we can flush all data from cache(log) to raid
> > > before resize/reshape and then allow resize/reshape.
> > 
> > Just to be on the safe side, you could probably add code to refuse to
> > start an array that is in the middle of a reshape and also have a log
> > configured.
> ok
>  
> > I think it makes sense to plan ahead a little and make sure we can
> > handle a cache on a reshaping array properly.
> > 
> > If the log metadata block includes a before/after flag for each stripe,
> > which recorded whether the stripe was "before" or "after"
> > reshape_position when it was written, then when recovering the log we
> > can check if the given addresses are still on that side.  If they are,
> > just recover using the appropriate geometry info from the superblock.
> > If not, then reshape has passed over that stripe and it must now be
> > fully up-to-date on the RAID so the data in the log can be discarded.
> > 
> > There may be some details I missed, but I think it is worth thinking
> > through properly.  I don't expect the code to handle this straight
> > away, but we need a clear plan to be sure there is sufficient
> > information stored in the log.
> 
> Just adding a flag is enough? Sounds there is no way to avoid the write
> hole issue if the array is reshapping. The data stored in log is valid,
> but if a reshape runs, we can't guarantee the parity is valid.

When a region of the array is reshaped, the current data is read (if
not already in the stripe cache), then written out to a location on disk
which is not currently in use, and then the superblock is updated to
record that the new area should be used and the old area is no longer
relevant.
So we have atomic updates from 'old' to 'new'.  There is no write-hole
issue for these writes.

So if a particular address in the array was in the 'old' section when
data was written to the log, and if on restart those addresses are in
the 'new' section, then we can be sure that the data, including parity,
was completely written to the new section - so the data in the log is
not needed.

So if we just add a before/after flag to entries in the log then we can
safe recovery around a reshape.
When reading the log on restart in some data is marked as being in the
'old' section but the address now maps to the 'new' section, then we
safely ignore that data.


Thanks,
NeilBrown

^ permalink raw reply	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2015-08-12  3:57 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-07-30  0:38 [PATCH 0/9]raid5: fix write hole Shaohua Li
2015-07-30  0:38 ` [PATCH 1/9] MD: add a new disk role to present cache device Shaohua Li
2015-08-04 14:28   ` Christoph Hellwig
2015-08-04 18:17     ` Song Liu
2015-08-05  0:25       ` NeilBrown
2015-08-05  1:05   ` NeilBrown
2015-07-30  0:38 ` [PATCH 2/9] md: override md superblock recovery_offset for " Shaohua Li
2015-08-04 14:30   ` Christoph Hellwig
2015-08-05  1:08   ` NeilBrown
2015-07-30  0:38 ` [PATCH 3/9] raid5: add basic stripe log Shaohua Li
2015-08-05  3:07   ` NeilBrown
2015-08-05 21:19     ` Shaohua Li
2015-08-12  3:20       ` NeilBrown
2015-07-30  0:38 ` [PATCH 4/9] raid5: log reclaim support Shaohua Li
2015-08-05  3:43   ` NeilBrown
2015-08-05 21:34     ` Shaohua Li
2015-08-12  3:50       ` NeilBrown
2015-08-05  3:52   ` NeilBrown
2015-07-30  0:38 ` [PATCH 5/9] raid5: log recovery Shaohua Li
2015-08-05  4:05   ` NeilBrown
2015-08-05 21:39     ` Shaohua Li
2015-08-12  3:51       ` NeilBrown
2015-07-30  0:38 ` [PATCH 6/9] raid5: disable batch with log enabled Shaohua Li
2015-07-30  0:38 ` [PATCH 7/9] raid5: don't allow resize/reshape with cache(log) support Shaohua Li
2015-08-05  4:13   ` NeilBrown
2015-08-05 21:42     ` Shaohua Li
2015-08-12  3:57       ` NeilBrown
2015-07-30  0:38 ` [PATCH 8/9] raid5: enable log for raid array with cache disk Shaohua Li
2015-07-30  0:38 ` [PATCH 9/9] raid5: skip resync if cache(log) is enabled Shaohua Li
2015-08-05  4:16   ` NeilBrown

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.