linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] f2fs: support multiple devices
@ 2016-11-09 20:56 Jaegeuk Kim
  2016-11-09 22:57 ` Andreas Dilger
  0 siblings, 1 reply; 7+ messages in thread
From: Jaegeuk Kim @ 2016-11-09 20:56 UTC (permalink / raw)
  To: linux-kernel, linux-fsdevel, linux-f2fs-devel; +Cc: Jaegeuk Kim

This patch implements multiple devices support for f2fs.
Given multiple devices by mkfs.f2fs, f2fs shows them entirely as one big
volume under one f2fs instance.

Internal block management is very simple, but we will modify block allocation
and background GC policy to boost IO speed by exploiting them according to
each device speed.

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c          |  55 ++++++++++++++++---
 fs/f2fs/f2fs.h          |  29 ++++++++--
 fs/f2fs/segment.c       | 119 +++++++++++++++++++++++++++++------------
 fs/f2fs/super.c         | 138 ++++++++++++++++++++++++++++++++++++++----------
 include/linux/f2fs_fs.h |  10 +++-
 5 files changed, 277 insertions(+), 74 deletions(-)

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 47ded0c..e2be24e 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -88,6 +88,46 @@ static void f2fs_write_end_io(struct bio *bio)
 }
 
 /*
+ * Return the device holding blk_addr; if bio is set, aim it at that device.
+ */
+struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi,
+				block_t blk_addr, struct bio *bio)
+{
+	struct block_device *bdev = sbi->sb->s_bdev;
+	int i;
+
+	for (i = 0; i < sbi->s_ndevs; i++) {
+		if (FDEV(i).start_blk <= blk_addr &&
+					FDEV(i).end_blk >= blk_addr) {
+			blk_addr -= FDEV(i).start_blk;
+			bdev = FDEV(i).bdev;
+			break;
+		}
+	}
+	if (bio) {
+		bio->bi_bdev = bdev;
+		bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
+	}
+	return bdev;
+}
+
+int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr)
+{
+	int i;
+
+	for (i = 0; i < sbi->s_ndevs; i++)
+		if (FDEV(i).start_blk <= blkaddr && FDEV(i).end_blk >= blkaddr)
+			return i;
+	return 0;
+}
+
+static bool __same_bdev(struct f2fs_sb_info *sbi,
+				block_t blk_addr, struct bio *bio)
+{
+	return f2fs_target_device(sbi, blk_addr, NULL) == bio->bi_bdev;
+}
+
+/*
  * Low-level block read/write IO operations.
  */
 static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
@@ -97,8 +137,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
 
 	bio = f2fs_bio_alloc(npages);
 
-	bio->bi_bdev = sbi->sb->s_bdev;
-	bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
+	f2fs_target_device(sbi, blk_addr, bio);
 	bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
 	bio->bi_private = is_read ? NULL : sbi;
 
@@ -273,7 +312,8 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio)
 	down_write(&io->io_rwsem);
 
 	if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 ||
-	    (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags)))
+	    (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags) ||
+			!__same_bdev(sbi, fio->new_blkaddr, io->bio)))
 		__submit_merged_bio(io);
 alloc_new:
 	if (io->bio == NULL) {
@@ -965,7 +1005,6 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr,
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct fscrypt_ctx *ctx = NULL;
-	struct block_device *bdev = sbi->sb->s_bdev;
 	struct bio *bio;
 
 	if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
@@ -983,8 +1022,7 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr,
 			fscrypt_release_ctx(ctx);
 		return ERR_PTR(-ENOMEM);
 	}
-	bio->bi_bdev = bdev;
-	bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blkaddr);
+	f2fs_target_device(sbi, blkaddr, bio);
 	bio->bi_end_io = f2fs_read_end_io;
 	bio->bi_private = ctx;
 
@@ -1079,7 +1117,8 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
 		 * This page will go to BIO.  Do we need to send this
 		 * BIO off first?
 		 */
-		if (bio && (last_block_in_bio != block_nr - 1)) {
+		if (bio && (last_block_in_bio != block_nr - 1 ||
+			!__same_bdev(F2FS_I_SB(inode), block_nr, bio))) {
 submit_and_realloc:
 			__submit_bio(F2FS_I_SB(inode), bio, DATA);
 			bio = NULL;
@@ -1738,6 +1777,8 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 		return 0;
 	if (test_opt(F2FS_I_SB(inode), LFS))
 		return 0;
+	if (F2FS_I_SB(inode)->s_ndevs)
+		return 0;
 
 	trace_f2fs_direct_IO_enter(inode, offset, count, rw);
 
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 9650514..1737c45 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -730,6 +730,20 @@ struct f2fs_bio_info {
 	struct rw_semaphore io_rwsem;	/* blocking op for bio */
 };
 
+#define FDEV(i)				(sbi->devs[i])
+#define RDEV(i)				(raw_super->devs[i])
+struct f2fs_dev_info {
+	struct block_device *bdev;
+	char path[MAX_PATH_LEN];
+	unsigned int total_segments;
+	block_t start_blk;
+	block_t end_blk;
+#ifdef CONFIG_BLK_DEV_ZONED
+	unsigned int nr_blkz;			/* Total number of zones */
+	u8 *blkz_type;				/* Array of zones type */
+#endif
+};
+
 enum inode_type {
 	DIR_INODE,			/* for dirty dir inode */
 	FILE_INODE,			/* for dirty regular/symlink inode */
@@ -778,10 +792,8 @@ struct f2fs_sb_info {
 #endif
 
 #ifdef CONFIG_BLK_DEV_ZONED
-	unsigned int nr_blkz;			/* Total number of zones */
 	unsigned int blocks_per_blkz;		/* F2FS blocks per zone */
 	unsigned int log_blocks_per_blkz;	/* log2 F2FS blocks per zone */
-	u8 *blkz_type;				/* Array of zones type */
 #endif
 
 	/* for node-related operations */
@@ -897,6 +909,8 @@ struct f2fs_sb_info {
 
 	/* For shrinker support */
 	struct list_head s_list;
+	int s_ndevs;				/* number of devices */
+	struct f2fs_dev_info *devs;		/* for device list */
 	struct mutex umount_mutex;
 	unsigned int shrinker_run_no;
 
@@ -2159,6 +2173,9 @@ void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *,
 void f2fs_flush_merged_bios(struct f2fs_sb_info *);
 int f2fs_submit_page_bio(struct f2fs_io_info *);
 void f2fs_submit_page_mbio(struct f2fs_io_info *);
+struct block_device *f2fs_target_device(struct f2fs_sb_info *,
+				block_t, struct bio *);
+int f2fs_target_device_index(struct f2fs_sb_info *, block_t);
 void set_data_blkaddr(struct dnode_of_data *);
 void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t);
 int reserve_new_blocks(struct dnode_of_data *, blkcnt_t);
@@ -2446,11 +2463,15 @@ static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb)
 
 #ifdef CONFIG_BLK_DEV_ZONED
 static inline int get_blkz_type(struct f2fs_sb_info *sbi,
-				block_t blkaddr)
+			struct block_device *bdev, block_t blkaddr)
 {
 	unsigned int zno = blkaddr >> sbi->log_blocks_per_blkz;
+	int i;
 
-	return sbi->blkz_type[zno];
+	for (i = 0; i < sbi->s_ndevs; i++)
+		if (FDEV(i).bdev == bdev)
+			return FDEV(i).blkz_type[zno];
+	return -EINVAL;
 }
 #endif
 
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 7fb7dd3..ef727d1 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -403,6 +403,33 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
 	}
 }
 
+static int __submit_flush_wait(struct block_device *bdev)
+{
+	struct bio *bio = f2fs_bio_alloc(0);
+	int ret;
+
+	bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
+	bio->bi_bdev = bdev;
+	ret = submit_bio_wait(bio);
+	bio_put(bio);
+	return ret;
+}
+
+static int submit_flush_wait(struct f2fs_sb_info *sbi)
+{
+	int ret = __submit_flush_wait(sbi->sb->s_bdev);
+	int i;
+
+	if (sbi->s_ndevs && !ret) {
+		for (i = 1; i < sbi->s_ndevs; i++) {
+			ret = __submit_flush_wait(FDEV(i).bdev);
+			if (ret)
+				break;
+		}
+	}
+	return ret;
+}
+
 static int issue_flush_thread(void *data)
 {
 	struct f2fs_sb_info *sbi = data;
@@ -413,25 +440,18 @@ static int issue_flush_thread(void *data)
 		return 0;
 
 	if (!llist_empty(&fcc->issue_list)) {
-		struct bio *bio;
 		struct flush_cmd *cmd, *next;
 		int ret;
 
-		bio = f2fs_bio_alloc(0);
-
 		fcc->dispatch_list = llist_del_all(&fcc->issue_list);
 		fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list);
 
-		bio->bi_bdev = sbi->sb->s_bdev;
-		bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
-		ret = submit_bio_wait(bio);
-
+		ret = submit_flush_wait(sbi);
 		llist_for_each_entry_safe(cmd, next,
 					  fcc->dispatch_list, llnode) {
 			cmd->ret = ret;
 			complete(&cmd->wait);
 		}
-		bio_put(bio);
 		fcc->dispatch_list = NULL;
 	}
 
@@ -452,15 +472,11 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
 		return 0;
 
 	if (!test_opt(sbi, FLUSH_MERGE) || !atomic_read(&fcc->submit_flush)) {
-		struct bio *bio = f2fs_bio_alloc(0);
 		int ret;
 
 		atomic_inc(&fcc->submit_flush);
-		bio->bi_bdev = sbi->sb->s_bdev;
-		bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
-		ret = submit_bio_wait(bio);
+		ret = submit_flush_wait(sbi);
 		atomic_dec(&fcc->submit_flush);
-		bio_put(bio);
 		return ret;
 	}
 
@@ -637,14 +653,18 @@ static void f2fs_submit_bio_wait_endio(struct bio *bio)
 
 /* this function is copied from blkdev_issue_discard from block/blk-lib.c */
 static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi,
-				block_t blkstart, block_t blklen)
+		struct block_device *bdev, block_t blkstart, block_t blklen)
 {
-	struct block_device *bdev = sbi->sb->s_bdev;
 	struct bio *bio = NULL;
 	int err;
 
 	trace_f2fs_issue_discard(sbi->sb, blkstart, blklen);
 
+	if (sbi->s_ndevs) {
+		int devi = f2fs_target_device_index(sbi, blkstart);
+
+		blkstart -= FDEV(devi).start_blk;
+	}
 	err = __blkdev_issue_discard(bdev,
 				SECTOR_FROM_BLOCK(blkstart),
 				SECTOR_FROM_BLOCK(blklen),
@@ -662,18 +682,24 @@ static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi,
 }
 
 #ifdef CONFIG_BLK_DEV_ZONED
-static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
-					block_t blkstart, block_t blklen)
+static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
+		struct block_device *bdev, block_t blkstart, block_t blklen)
 {
-	sector_t sector = SECTOR_FROM_BLOCK(blkstart);
 	sector_t nr_sects = SECTOR_FROM_BLOCK(blklen);
-	struct block_device *bdev = sbi->sb->s_bdev;
+	sector_t sector;
+	int devi = 0;
 
-	if (nr_sects != bdev_zone_size(bdev)) {
+	if (sbi->s_ndevs) {
+		devi = f2fs_target_device_index(sbi, blkstart);
+		blkstart -= FDEV(devi).start_blk;
+	}
+	sector = SECTOR_FROM_BLOCK(blkstart);
+
+	if (sector % bdev_zone_size(bdev) || nr_sects != bdev_zone_size(bdev)) {
 		f2fs_msg(sbi->sb, KERN_INFO,
-			 "Unaligned discard attempted (sector %llu + %llu)",
-			 (unsigned long long)sector,
-			 (unsigned long long)nr_sects);
+			"(%d) %s: Unaligned discard attempted (block %x + %x)",
+			devi, sbi->s_ndevs ? FDEV(devi).path: "",
+			blkstart, blklen);
 		return -EIO;
 	}
 
@@ -682,14 +708,12 @@ static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
 	 * use regular discard if the drive supports it. For sequential
 	 * zones, reset the zone write pointer.
 	 */
-	switch (get_blkz_type(sbi, blkstart)) {
+	switch (get_blkz_type(sbi, bdev, blkstart)) {
 
 	case BLK_ZONE_TYPE_CONVENTIONAL:
 		if (!blk_queue_discard(bdev_get_queue(bdev)))
 			return 0;
-		return __f2fs_issue_discard_async(sbi, blkstart,
-						  blklen);
-
+		return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen);
 	case BLK_ZONE_TYPE_SEQWRITE_REQ:
 	case BLK_ZONE_TYPE_SEQWRITE_PREF:
 		trace_f2fs_issue_reset_zone(sbi->sb, blkstart);
@@ -702,14 +726,45 @@ static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
 }
 #endif
 
+static int __issue_discard_async(struct f2fs_sb_info *sbi,
+		struct block_device *bdev, block_t blkstart, block_t blklen)
+{
+#ifdef CONFIG_BLK_DEV_ZONED
+	if (f2fs_sb_mounted_blkzoned(sbi->sb) &&
+				bdev_zoned_model(bdev) != BLK_ZONED_NONE)
+		return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen);
+#endif
+	return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen);
+}
+
 static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
 				block_t blkstart, block_t blklen)
 {
+	sector_t start = blkstart, len = 0;
+	struct block_device *bdev;
 	struct seg_entry *se;
 	unsigned int offset;
 	block_t i;
+	int err = 0;
+
+	bdev = f2fs_target_device(sbi, blkstart, NULL);
+
+	for (i = blkstart; i < blkstart + blklen; i++, len++) {
+		if (i != start) {
+			struct block_device *bdev2 =
+				f2fs_target_device(sbi, i, NULL);
+
+			if (bdev2 != bdev) {
+				err = __issue_discard_async(sbi, bdev,
+						start, len);
+				if (err)
+					return err;
+				bdev = bdev2;
+				start = i;
+				len = 0;
+			}
+		}
 
-	for (i = blkstart; i < blkstart + blklen; i++) {
 		se = get_seg_entry(sbi, GET_SEGNO(sbi, i));
 		offset = GET_BLKOFF_FROM_SEG0(sbi, i);
 
@@ -717,11 +772,9 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
 			sbi->discard_blks--;
 	}
 
-#ifdef CONFIG_BLK_DEV_ZONED
-	if (f2fs_sb_mounted_blkzoned(sbi->sb))
-		return f2fs_issue_discard_zone(sbi, blkstart, blklen);
-#endif
-	return __f2fs_issue_discard_async(sbi, blkstart, blklen);
+	if (len)
+		err = __issue_discard_async(sbi, bdev, start, len);
+	return err;
 }
 
 static void __add_discard_entry(struct f2fs_sb_info *sbi,
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index eca9aea..4ccbb86 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -713,6 +713,19 @@ static void destroy_percpu_info(struct f2fs_sb_info *sbi)
 	percpu_counter_destroy(&sbi->total_valid_inode_count);
 }
 
+static void destroy_device_list(struct f2fs_sb_info *sbi)
+{
+	int i;
+
+	for (i = 0; i < sbi->s_ndevs; i++) {
+		blkdev_put(FDEV(i).bdev, FMODE_EXCL);
+#ifdef CONFIG_BLK_DEV_ZONED
+		kfree(FDEV(i).blkz_type);
+#endif
+	}
+	kfree(sbi->devs);
+}
+
 static void f2fs_put_super(struct super_block *sb)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(sb);
@@ -773,6 +786,8 @@ static void f2fs_put_super(struct super_block *sb)
 		crypto_free_shash(sbi->s_chksum_driver);
 	kfree(sbi->raw_super);
 
+	destroy_device_list(sbi);
+
 	destroy_percpu_info(sbi);
 	kfree(sbi);
 }
@@ -1516,9 +1531,9 @@ static int init_percpu_info(struct f2fs_sb_info *sbi)
 }
 
 #ifdef CONFIG_BLK_DEV_ZONED
-static int init_blkz_info(struct f2fs_sb_info *sbi)
+static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
 {
-	struct block_device *bdev = sbi->sb->s_bdev;
+	struct block_device *bdev = FDEV(devi).bdev;
 	sector_t nr_sectors = bdev->bd_part->nr_sects;
 	sector_t sector = 0;
 	struct blk_zone *zones;
@@ -1529,15 +1544,21 @@ static int init_blkz_info(struct f2fs_sb_info *sbi)
 	if (!f2fs_sb_mounted_blkzoned(sbi->sb))
 		return 0;
 
+	if (sbi->blocks_per_blkz && sbi->blocks_per_blkz !=
+				SECTOR_TO_BLOCK(bdev_zone_size(bdev)))
+		return -EINVAL;
 	sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_size(bdev));
+	if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz !=
+				__ilog2_u32(sbi->blocks_per_blkz))
+		return -EINVAL;
 	sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz);
-	sbi->nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >>
-		sbi->log_blocks_per_blkz;
+	FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >>
+					sbi->log_blocks_per_blkz;
 	if (nr_sectors & (bdev_zone_size(bdev) - 1))
-		sbi->nr_blkz++;
+		FDEV(devi).nr_blkz++;
 
-	sbi->blkz_type = kmalloc(sbi->nr_blkz, GFP_KERNEL);
-	if (!sbi->blkz_type)
+	FDEV(devi).blkz_type = kmalloc(FDEV(devi).nr_blkz, GFP_KERNEL);
+	if (!FDEV(devi).blkz_type)
 		return -ENOMEM;
 
 #define F2FS_REPORT_NR_ZONES   4096
@@ -1562,7 +1583,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi)
 		}
 
 		for (i = 0; i < nr_zones; i++) {
-			sbi->blkz_type[n] = zones[i].type;
+			FDEV(devi).blkz_type[n] = zones[i].type;
 			sector += zones[i].len;
 			n++;
 		}
@@ -1666,6 +1687,77 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover)
 	return err;
 }
 
+static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
+	int i;
+
+	for (i = 0; i < MAX_DEVICES; i++) {
+		if (!RDEV(i).path[0])
+			return 0;
+
+		if (i == 0) {
+			sbi->devs = kzalloc(sizeof(struct f2fs_dev_info) *
+						MAX_DEVICES, GFP_KERNEL);
+			if (!sbi->devs)
+				return -ENOMEM;
+		}
+
+		memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN);
+		FDEV(i).total_segments = le32_to_cpu(RDEV(i).total_segments);
+		if (i == 0) {
+			FDEV(i).start_blk = 0;
+			FDEV(i).end_blk = FDEV(i).start_blk +
+				(FDEV(i).total_segments <<
+				sbi->log_blocks_per_seg) - 1 +
+				le32_to_cpu(raw_super->segment0_blkaddr);
+		} else {
+			FDEV(i).start_blk = FDEV(i - 1).end_blk + 1;
+			FDEV(i).end_blk = FDEV(i).start_blk +
+				(FDEV(i).total_segments <<
+				sbi->log_blocks_per_seg) - 1;
+		}
+
+		FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path,
+					sbi->sb->s_mode, sbi->sb->s_type);
+		if (IS_ERR(FDEV(i).bdev))
+			return PTR_ERR(FDEV(i).bdev);
+
+		/* to release errored devices */
+		sbi->s_ndevs = i + 1;
+
+#ifdef CONFIG_BLK_DEV_ZONED
+		if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM &&
+				!f2fs_sb_mounted_blkzoned(sbi->sb)) {
+			f2fs_msg(sbi->sb, KERN_ERR,
+				"Zoned block device feature not enabled\n");
+			return -EINVAL;
+		}
+		if (bdev_zoned_model(FDEV(i).bdev) != BLK_ZONED_NONE) {
+			if (init_blkz_info(sbi, i)) {
+				f2fs_msg(sbi->sb, KERN_ERR,
+					"Failed to initialize F2FS blkzone information");
+				return -EINVAL;
+			}
+			f2fs_msg(sbi->sb, KERN_INFO,
+				"Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)",
+				i, FDEV(i).path,
+				FDEV(i).total_segments,
+				FDEV(i).start_blk, FDEV(i).end_blk,
+				bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HA ?
+				"Host-aware" : "Host-managed");
+			continue;
+		}
+#endif
+		f2fs_msg(sbi->sb, KERN_INFO,
+			"Mount Device [%2d]: %20s, %8u, %8x - %8x",
+				i, FDEV(i).path,
+				FDEV(i).total_segments,
+				FDEV(i).start_blk, FDEV(i).end_blk);
+	}
+	return 0;
+}
+
 static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct f2fs_sb_info *sbi;
@@ -1724,15 +1816,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
 			 "Zoned block device support is not enabled\n");
 		goto free_sb_buf;
 	}
-#else
-	if (bdev_zoned_model(sb->s_bdev) == BLK_ZONED_HM &&
-	    !f2fs_sb_mounted_blkzoned(sb)) {
-		f2fs_msg(sb, KERN_ERR,
-			 "Zoned block device feature not enabled\n");
-		goto free_sb_buf;
-	}
 #endif
-
 	default_options(sbi);
 	/* parse mount options */
 	options = kstrdup((const char *)data, GFP_KERNEL);
@@ -1802,6 +1886,13 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
 		goto free_meta_inode;
 	}
 
+	/* Initialize device list */
+	err = f2fs_scan_devices(sbi);
+	if (err) {
+		f2fs_msg(sb, KERN_ERR, "Failed to find devices");
+		goto free_devices;
+	}
+
 	sbi->total_valid_node_count =
 				le32_to_cpu(sbi->ckpt->valid_node_count);
 	percpu_counter_set(&sbi->total_valid_inode_count,
@@ -1820,15 +1911,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
 
 	init_ino_entry_info(sbi);
 
-#ifdef CONFIG_BLK_DEV_ZONED
-	err = init_blkz_info(sbi);
-	if (err) {
-		f2fs_msg(sb, KERN_ERR,
-			"Failed to initialize F2FS blkzone information");
-		goto free_blkz;
-	}
-#endif
-
 	/* setup f2fs internal modules */
 	err = build_segment_manager(sbi);
 	if (err) {
@@ -2007,10 +2089,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
 	destroy_node_manager(sbi);
 free_sm:
 	destroy_segment_manager(sbi);
-#ifdef CONFIG_BLK_DEV_ZONED
-free_blkz:
-	kfree(sbi->blkz_type);
-#endif
+free_devices:
+	destroy_device_list(sbi);
 	kfree(sbi->ckpt);
 free_meta_inode:
 	make_bad_inode(sbi->meta_inode);
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index 422630b..cea41a1 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -52,10 +52,17 @@
 
 #define VERSION_LEN	256
 #define MAX_VOLUME_NAME		512
+#define MAX_PATH_LEN		64
+#define MAX_DEVICES		8
 
 /*
  * For superblock
  */
+struct f2fs_device {
+	__u8 path[MAX_PATH_LEN];
+	__le32 total_segments;
+} __packed;
+
 struct f2fs_super_block {
 	__le32 magic;			/* Magic Number */
 	__le16 major_ver;		/* Major Version */
@@ -94,7 +101,8 @@ struct f2fs_super_block {
 	__le32 feature;			/* defined features */
 	__u8 encryption_level;		/* versioning level for encryption */
 	__u8 encrypt_pw_salt[16];	/* Salt used for string2key algorithm */
-	__u8 reserved[871];		/* valid reserved region */
+	struct f2fs_device devs[MAX_DEVICES];	/* device list */
+	__u8 reserved[327];		/* valid reserved region */
 } __packed;
 
 /*
-- 
2.8.3

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] f2fs: support multiple devices
  2016-11-09 20:56 [PATCH] f2fs: support multiple devices Jaegeuk Kim
@ 2016-11-09 22:57 ` Andreas Dilger
  2016-11-09 23:05   ` Darrick J. Wong
                     ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Andreas Dilger @ 2016-11-09 22:57 UTC (permalink / raw)
  To: Jaegeuk Kim
  Cc: LKML, Lustre Development, linux-fsdevel, linux-f2fs-devel, linux-btrfs

[-- Attachment #1: Type: text/plain, Size: 23244 bytes --]

On Nov 9, 2016, at 1:56 PM, Jaegeuk Kim <jaegeuk@kernel.org> wrote:
> 
> This patch implements multiple devices support for f2fs.
> Given multiple devices by mkfs.f2fs, f2fs shows them entirely as one big
> volume under one f2fs instance.
> 
> Internal block management is very simple, but we will modify block
> allocation and background GC policy to boost IO speed by exploiting them
> according to each device speed.

How will you integrate this into FIEMAP?  Now that a file can be split
across multiple devices, FIEMAP will return ambiguous block numbers for
that file.  I've been meaning to merge the FIEMAP handling in Lustre to
support multiple devices in a single filesystem, so that this can be
detected in userspace.

struct ll_fiemap_extent {
        __u64 fe_logical;  /* logical offset in bytes for the start of
                            * the extent from the beginning of the file
                            */
        __u64 fe_physical; /* physical offset in bytes for the start
                            * of the extent from the beginning of the disk
                            */
        __u64 fe_length;   /* length in bytes for this extent */
        __u64 fe_reserved64[2];
        __u32 fe_flags;    /* FIEMAP_EXTENT_* flags for this extent */
        __u32 fe_device;   /* device number for this extent */
        __u32 fe_reserved[2];
};

This adds the 32-bit "fe_device" field, which would optionally be filled
in by the filesystem (zero otherwise).  It would return the kernel device
number (i.e. st_dev), or, for a network filesystem (with FIEMAP_EXTENT_NET
set), it could just return an integer device number, since the kernel device
number is meaningless (and may conflict) on a remote system.

Since AFAIK Btrfs also has multiple device support there are an increasing
number of places where this would be useful.

Cheers, Andreas

> 
> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> ---
> fs/f2fs/data.c          |  55 ++++++++++++++++---
> fs/f2fs/f2fs.h          |  29 ++++++++--
> fs/f2fs/segment.c       | 119 +++++++++++++++++++++++++++++------------
> fs/f2fs/super.c         | 138 ++++++++++++++++++++++++++++++++++++++----------
> include/linux/f2fs_fs.h |  10 +++-
> 5 files changed, 277 insertions(+), 74 deletions(-)
> 
> diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
> index 47ded0c..e2be24e 100644
> --- a/fs/f2fs/data.c
> +++ b/fs/f2fs/data.c
> @@ -88,6 +88,46 @@ static void f2fs_write_end_io(struct bio *bio)
> }
> 
> /*
> + * Return the device holding blk_addr; if bio is set, aim it at that device.
> + */
> +struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi,
> +				block_t blk_addr, struct bio *bio)
> +{
> +	struct block_device *bdev = sbi->sb->s_bdev;
> +	int i;
> +
> +	for (i = 0; i < sbi->s_ndevs; i++) {
> +		if (FDEV(i).start_blk <= blk_addr &&
> +					FDEV(i).end_blk >= blk_addr) {
> +			blk_addr -= FDEV(i).start_blk;
> +			bdev = FDEV(i).bdev;
> +			break;
> +		}
> +	}
> +	if (bio) {
> +		bio->bi_bdev = bdev;
> +		bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
> +	}
> +	return bdev;
> +}
> +
> +int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr)
> +{
> +	int i;
> +
> +	for (i = 0; i < sbi->s_ndevs; i++)
> +		if (FDEV(i).start_blk <= blkaddr && FDEV(i).end_blk >= blkaddr)
> +			return i;
> +	return 0;
> +}
> +
> +static bool __same_bdev(struct f2fs_sb_info *sbi,
> +				block_t blk_addr, struct bio *bio)
> +{
> +	return f2fs_target_device(sbi, blk_addr, NULL) == bio->bi_bdev;
> +}
> +
> +/*
>  * Low-level block read/write IO operations.
>  */
> static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
> @@ -97,8 +137,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
> 
> 	bio = f2fs_bio_alloc(npages);
> 
> -	bio->bi_bdev = sbi->sb->s_bdev;
> -	bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
> +	f2fs_target_device(sbi, blk_addr, bio);
> 	bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
> 	bio->bi_private = is_read ? NULL : sbi;
> 
> @@ -273,7 +312,8 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio)
> 	down_write(&io->io_rwsem);
> 
> 	if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 ||
> -	    (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags)))
> +	    (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags) ||
> +			!__same_bdev(sbi, fio->new_blkaddr, io->bio)))
> 		__submit_merged_bio(io);
> alloc_new:
> 	if (io->bio == NULL) {
> @@ -965,7 +1005,6 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr,
> {
> 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
> 	struct fscrypt_ctx *ctx = NULL;
> -	struct block_device *bdev = sbi->sb->s_bdev;
> 	struct bio *bio;
> 
> 	if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
> @@ -983,8 +1022,7 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr,
> 			fscrypt_release_ctx(ctx);
> 		return ERR_PTR(-ENOMEM);
> 	}
> -	bio->bi_bdev = bdev;
> -	bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blkaddr);
> +	f2fs_target_device(sbi, blkaddr, bio);
> 	bio->bi_end_io = f2fs_read_end_io;
> 	bio->bi_private = ctx;
> 
> @@ -1079,7 +1117,8 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
> 		 * This page will go to BIO.  Do we need to send this
> 		 * BIO off first?
> 		 */
> -		if (bio && (last_block_in_bio != block_nr - 1)) {
> +		if (bio && (last_block_in_bio != block_nr - 1 ||
> +			!__same_bdev(F2FS_I_SB(inode), block_nr, bio))) {
> submit_and_realloc:
> 			__submit_bio(F2FS_I_SB(inode), bio, DATA);
> 			bio = NULL;
> @@ -1738,6 +1777,8 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
> 		return 0;
> 	if (test_opt(F2FS_I_SB(inode), LFS))
> 		return 0;
> +	if (F2FS_I_SB(inode)->s_ndevs)
> +		return 0;
> 
> 	trace_f2fs_direct_IO_enter(inode, offset, count, rw);
> 
> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> index 9650514..1737c45 100644
> --- a/fs/f2fs/f2fs.h
> +++ b/fs/f2fs/f2fs.h
> @@ -730,6 +730,20 @@ struct f2fs_bio_info {
> 	struct rw_semaphore io_rwsem;	/* blocking op for bio */
> };
> 
> +#define FDEV(i)				(sbi->devs[i])
> +#define RDEV(i)				(raw_super->devs[i])
> +struct f2fs_dev_info {
> +	struct block_device *bdev;
> +	char path[MAX_PATH_LEN];
> +	unsigned int total_segments;
> +	block_t start_blk;
> +	block_t end_blk;
> +#ifdef CONFIG_BLK_DEV_ZONED
> +	unsigned int nr_blkz;			/* Total number of zones */
> +	u8 *blkz_type;				/* Array of zones type */
> +#endif
> +};
> +
> enum inode_type {
> 	DIR_INODE,			/* for dirty dir inode */
> 	FILE_INODE,			/* for dirty regular/symlink inode */
> @@ -778,10 +792,8 @@ struct f2fs_sb_info {
> #endif
> 
> #ifdef CONFIG_BLK_DEV_ZONED
> -	unsigned int nr_blkz;			/* Total number of zones */
> 	unsigned int blocks_per_blkz;		/* F2FS blocks per zone */
> 	unsigned int log_blocks_per_blkz;	/* log2 F2FS blocks per zone */
> -	u8 *blkz_type;				/* Array of zones type */
> #endif
> 
> 	/* for node-related operations */
> @@ -897,6 +909,8 @@ struct f2fs_sb_info {
> 
> 	/* For shrinker support */
> 	struct list_head s_list;
> +	int s_ndevs;				/* number of devices */
> +	struct f2fs_dev_info *devs;		/* for device list */
> 	struct mutex umount_mutex;
> 	unsigned int shrinker_run_no;
> 
> @@ -2159,6 +2173,9 @@ void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *,
> void f2fs_flush_merged_bios(struct f2fs_sb_info *);
> int f2fs_submit_page_bio(struct f2fs_io_info *);
> void f2fs_submit_page_mbio(struct f2fs_io_info *);
> +struct block_device *f2fs_target_device(struct f2fs_sb_info *,
> +				block_t, struct bio *);
> +int f2fs_target_device_index(struct f2fs_sb_info *, block_t);
> void set_data_blkaddr(struct dnode_of_data *);
> void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t);
> int reserve_new_blocks(struct dnode_of_data *, blkcnt_t);
> @@ -2446,11 +2463,15 @@ static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb)
> 
> #ifdef CONFIG_BLK_DEV_ZONED
> static inline int get_blkz_type(struct f2fs_sb_info *sbi,
> -				block_t blkaddr)
> +			struct block_device *bdev, block_t blkaddr)
> {
> 	unsigned int zno = blkaddr >> sbi->log_blocks_per_blkz;
> +	int i;
> 
> -	return sbi->blkz_type[zno];
> +	for (i = 0; i < sbi->s_ndevs; i++)
> +		if (FDEV(i).bdev == bdev)
> +			return FDEV(i).blkz_type[zno];
> +	return -EINVAL;
> }
> #endif
> 
> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
> index 7fb7dd3..ef727d1 100644
> --- a/fs/f2fs/segment.c
> +++ b/fs/f2fs/segment.c
> @@ -403,6 +403,33 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
> 	}
> }
> 
> +static int __submit_flush_wait(struct block_device *bdev)
> +{
> +	struct bio *bio = f2fs_bio_alloc(0);
> +	int ret;
> +
> +	bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
> +	bio->bi_bdev = bdev;
> +	ret = submit_bio_wait(bio);
> +	bio_put(bio);
> +	return ret;
> +}
> +
> +static int submit_flush_wait(struct f2fs_sb_info *sbi)
> +{
> +	int ret = __submit_flush_wait(sbi->sb->s_bdev);
> +	int i;
> +
> +	if (sbi->s_ndevs && !ret) {
> +		for (i = 1; i < sbi->s_ndevs; i++) {
> +			ret = __submit_flush_wait(FDEV(i).bdev);
> +			if (ret)
> +				break;
> +		}
> +	}
> +	return ret;
> +}
> +
> static int issue_flush_thread(void *data)
> {
> 	struct f2fs_sb_info *sbi = data;
> @@ -413,25 +440,18 @@ static int issue_flush_thread(void *data)
> 		return 0;
> 
> 	if (!llist_empty(&fcc->issue_list)) {
> -		struct bio *bio;
> 		struct flush_cmd *cmd, *next;
> 		int ret;
> 
> -		bio = f2fs_bio_alloc(0);
> -
> 		fcc->dispatch_list = llist_del_all(&fcc->issue_list);
> 		fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list);
> 
> -		bio->bi_bdev = sbi->sb->s_bdev;
> -		bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
> -		ret = submit_bio_wait(bio);
> -
> +		ret = submit_flush_wait(sbi);
> 		llist_for_each_entry_safe(cmd, next,
> 					  fcc->dispatch_list, llnode) {
> 			cmd->ret = ret;
> 			complete(&cmd->wait);
> 		}
> -		bio_put(bio);
> 		fcc->dispatch_list = NULL;
> 	}
> 
> @@ -452,15 +472,11 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
> 		return 0;
> 
> 	if (!test_opt(sbi, FLUSH_MERGE) || !atomic_read(&fcc->submit_flush)) {
> -		struct bio *bio = f2fs_bio_alloc(0);
> 		int ret;
> 
> 		atomic_inc(&fcc->submit_flush);
> -		bio->bi_bdev = sbi->sb->s_bdev;
> -		bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
> -		ret = submit_bio_wait(bio);
> +		ret = submit_flush_wait(sbi);
> 		atomic_dec(&fcc->submit_flush);
> -		bio_put(bio);
> 		return ret;
> 	}
> 
> @@ -637,14 +653,18 @@ static void f2fs_submit_bio_wait_endio(struct bio *bio)
> 
> /* this function is copied from blkdev_issue_discard from block/blk-lib.c */
> static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi,
> -				block_t blkstart, block_t blklen)
> +		struct block_device *bdev, block_t blkstart, block_t blklen)
> {
> -	struct block_device *bdev = sbi->sb->s_bdev;
> 	struct bio *bio = NULL;
> 	int err;
> 
> 	trace_f2fs_issue_discard(sbi->sb, blkstart, blklen);
> 
> +	if (sbi->s_ndevs) {
> +		int devi = f2fs_target_device_index(sbi, blkstart);
> +
> +		blkstart -= FDEV(devi).start_blk;
> +	}
> 	err = __blkdev_issue_discard(bdev,
> 				SECTOR_FROM_BLOCK(blkstart),
> 				SECTOR_FROM_BLOCK(blklen),
> @@ -662,18 +682,24 @@ static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi,
> }
> 
> #ifdef CONFIG_BLK_DEV_ZONED
> -static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
> -					block_t blkstart, block_t blklen)
> +static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
> +		struct block_device *bdev, block_t blkstart, block_t blklen)
> {
> -	sector_t sector = SECTOR_FROM_BLOCK(blkstart);
> 	sector_t nr_sects = SECTOR_FROM_BLOCK(blklen);
> -	struct block_device *bdev = sbi->sb->s_bdev;
> +	sector_t sector;
> +	int devi = 0;
> 
> -	if (nr_sects != bdev_zone_size(bdev)) {
> +	if (sbi->s_ndevs) {
> +		devi = f2fs_target_device_index(sbi, blkstart);
> +		blkstart -= FDEV(devi).start_blk;
> +	}
> +	sector = SECTOR_FROM_BLOCK(blkstart);
> +
> +	if (sector % bdev_zone_size(bdev) || nr_sects != bdev_zone_size(bdev)) {
> 		f2fs_msg(sbi->sb, KERN_INFO,
> -			 "Unaligned discard attempted (sector %llu + %llu)",
> -			 (unsigned long long)sector,
> -			 (unsigned long long)nr_sects);
> +			"(%d) %s: Unaligned discard attempted (block %x + %x)",
> +			devi, sbi->s_ndevs ? FDEV(devi).path: "",
> +			blkstart, blklen);
> 		return -EIO;
> 	}
> 
> @@ -682,14 +708,12 @@ static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
> 	 * use regular discard if the drive supports it. For sequential
> 	 * zones, reset the zone write pointer.
> 	 */
> -	switch (get_blkz_type(sbi, blkstart)) {
> +	switch (get_blkz_type(sbi, bdev, blkstart)) {
> 
> 	case BLK_ZONE_TYPE_CONVENTIONAL:
> 		if (!blk_queue_discard(bdev_get_queue(bdev)))
> 			return 0;
> -		return __f2fs_issue_discard_async(sbi, blkstart,
> -						  blklen);
> -
> +		return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen);
> 	case BLK_ZONE_TYPE_SEQWRITE_REQ:
> 	case BLK_ZONE_TYPE_SEQWRITE_PREF:
> 		trace_f2fs_issue_reset_zone(sbi->sb, blkstart);
> @@ -702,14 +726,45 @@ static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
> }
> #endif
> 
> +static int __issue_discard_async(struct f2fs_sb_info *sbi,
> +		struct block_device *bdev, block_t blkstart, block_t blklen)
> +{
> +#ifdef CONFIG_BLK_DEV_ZONED
> +	if (f2fs_sb_mounted_blkzoned(sbi->sb) &&
> +				bdev_zoned_model(bdev) != BLK_ZONED_NONE)
> +		return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen);
> +#endif
> +	return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen);
> +}
> +
> static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
> 				block_t blkstart, block_t blklen)
> {
> +	sector_t start = blkstart, len = 0;
> +	struct block_device *bdev;
> 	struct seg_entry *se;
> 	unsigned int offset;
> 	block_t i;
> +	int err = 0;
> +
> +	bdev = f2fs_target_device(sbi, blkstart, NULL);
> +
> +	for (i = blkstart; i < blkstart + blklen; i++, len++) {
> +		if (i != start) {
> +			struct block_device *bdev2 =
> +				f2fs_target_device(sbi, i, NULL);
> +
> +			if (bdev2 != bdev) {
> +				err = __issue_discard_async(sbi, bdev,
> +						start, len);
> +				if (err)
> +					return err;
> +				bdev = bdev2;
> +				start = i;
> +				len = 0;
> +			}
> +		}
> 
> -	for (i = blkstart; i < blkstart + blklen; i++) {
> 		se = get_seg_entry(sbi, GET_SEGNO(sbi, i));
> 		offset = GET_BLKOFF_FROM_SEG0(sbi, i);
> 
> @@ -717,11 +772,9 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
> 			sbi->discard_blks--;
> 	}
> 
> -#ifdef CONFIG_BLK_DEV_ZONED
> -	if (f2fs_sb_mounted_blkzoned(sbi->sb))
> -		return f2fs_issue_discard_zone(sbi, blkstart, blklen);
> -#endif
> -	return __f2fs_issue_discard_async(sbi, blkstart, blklen);
> +	if (len)
> +		err = __issue_discard_async(sbi, bdev, start, len);
> +	return err;
> }
> 
> static void __add_discard_entry(struct f2fs_sb_info *sbi,
> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
> index eca9aea..4ccbb86 100644
> --- a/fs/f2fs/super.c
> +++ b/fs/f2fs/super.c
> @@ -713,6 +713,19 @@ static void destroy_percpu_info(struct f2fs_sb_info *sbi)
> 	percpu_counter_destroy(&sbi->total_valid_inode_count);
> }
> 
> +static void destroy_device_list(struct f2fs_sb_info *sbi)
> +{
> +	int i;
> +
> +	for (i = 0; i < sbi->s_ndevs; i++) {
> +		blkdev_put(FDEV(i).bdev, FMODE_EXCL);
> +#ifdef CONFIG_BLK_DEV_ZONED
> +		kfree(FDEV(i).blkz_type);
> +#endif
> +	}
> +	kfree(sbi->devs);
> +}
> +
> static void f2fs_put_super(struct super_block *sb)
> {
> 	struct f2fs_sb_info *sbi = F2FS_SB(sb);
> @@ -773,6 +786,8 @@ static void f2fs_put_super(struct super_block *sb)
> 		crypto_free_shash(sbi->s_chksum_driver);
> 	kfree(sbi->raw_super);
> 
> +	destroy_device_list(sbi);
> +
> 	destroy_percpu_info(sbi);
> 	kfree(sbi);
> }
> @@ -1516,9 +1531,9 @@ static int init_percpu_info(struct f2fs_sb_info *sbi)
> }
> 
> #ifdef CONFIG_BLK_DEV_ZONED
> -static int init_blkz_info(struct f2fs_sb_info *sbi)
> +static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
> {
> -	struct block_device *bdev = sbi->sb->s_bdev;
> +	struct block_device *bdev = FDEV(devi).bdev;
> 	sector_t nr_sectors = bdev->bd_part->nr_sects;
> 	sector_t sector = 0;
> 	struct blk_zone *zones;
> @@ -1529,15 +1544,21 @@ static int init_blkz_info(struct f2fs_sb_info *sbi)
> 	if (!f2fs_sb_mounted_blkzoned(sbi->sb))
> 		return 0;
> 
> +	if (sbi->blocks_per_blkz && sbi->blocks_per_blkz !=
> +				SECTOR_TO_BLOCK(bdev_zone_size(bdev)))
> +		return -EINVAL;
> 	sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_size(bdev));
> +	if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz !=
> +				__ilog2_u32(sbi->blocks_per_blkz))
> +		return -EINVAL;
> 	sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz);
> -	sbi->nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >>
> -		sbi->log_blocks_per_blkz;
> +	FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >>
> +					sbi->log_blocks_per_blkz;
> 	if (nr_sectors & (bdev_zone_size(bdev) - 1))
> -		sbi->nr_blkz++;
> +		FDEV(devi).nr_blkz++;
> 
> -	sbi->blkz_type = kmalloc(sbi->nr_blkz, GFP_KERNEL);
> -	if (!sbi->blkz_type)
> +	FDEV(devi).blkz_type = kmalloc(FDEV(devi).nr_blkz, GFP_KERNEL);
> +	if (!FDEV(devi).blkz_type)
> 		return -ENOMEM;
> 
> #define F2FS_REPORT_NR_ZONES   4096
> @@ -1562,7 +1583,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi)
> 		}
> 
> 		for (i = 0; i < nr_zones; i++) {
> -			sbi->blkz_type[n] = zones[i].type;
> +			FDEV(devi).blkz_type[n] = zones[i].type;
> 			sector += zones[i].len;
> 			n++;
> 		}
> @@ -1666,6 +1687,77 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover)
> 	return err;
> }
> 
> +static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
> +{
> +	struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
> +	int i;
> +
> +	for (i = 0; i < MAX_DEVICES; i++) {
> +		if (!RDEV(i).path[0])
> +			return 0;
> +
> +		if (i == 0) {
> +			sbi->devs = kzalloc(sizeof(struct f2fs_dev_info) *
> +						MAX_DEVICES, GFP_KERNEL);
> +			if (!sbi->devs)
> +				return -ENOMEM;
> +		}
> +
> +		memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN);
> +		FDEV(i).total_segments = le32_to_cpu(RDEV(i).total_segments);
> +		if (i == 0) {
> +			FDEV(i).start_blk = 0;
> +			FDEV(i).end_blk = FDEV(i).start_blk +
> +				(FDEV(i).total_segments <<
> +				sbi->log_blocks_per_seg) - 1 +
> +				le32_to_cpu(raw_super->segment0_blkaddr);
> +		} else {
> +			FDEV(i).start_blk = FDEV(i - 1).end_blk + 1;
> +			FDEV(i).end_blk = FDEV(i).start_blk +
> +				(FDEV(i).total_segments <<
> +				sbi->log_blocks_per_seg) - 1;
> +		}
> +
> +		FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path,
> +					sbi->sb->s_mode, sbi->sb->s_type);
> +		if (IS_ERR(FDEV(i).bdev))
> +			return PTR_ERR(FDEV(i).bdev);
> +
> +		/* to release errored devices */
> +		sbi->s_ndevs = i + 1;
> +
> +#ifdef CONFIG_BLK_DEV_ZONED
> +		if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM &&
> +				!f2fs_sb_mounted_blkzoned(sbi->sb)) {
> +			f2fs_msg(sbi->sb, KERN_ERR,
> +				"Zoned block device feature not enabled\n");
> +			return -EINVAL;
> +		}
> +		if (bdev_zoned_model(FDEV(i).bdev) != BLK_ZONED_NONE) {
> +			if (init_blkz_info(sbi, i)) {
> +				f2fs_msg(sbi->sb, KERN_ERR,
> +					"Failed to initialize F2FS blkzone information");
> +				return -EINVAL;
> +			}
> +			f2fs_msg(sbi->sb, KERN_INFO,
> +				"Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)",
> +				i, FDEV(i).path,
> +				FDEV(i).total_segments,
> +				FDEV(i).start_blk, FDEV(i).end_blk,
> +				bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HA ?
> +				"Host-aware" : "Host-managed");
> +			continue;
> +		}
> +#endif
> +		f2fs_msg(sbi->sb, KERN_INFO,
> +			"Mount Device [%2d]: %20s, %8u, %8x - %8x",
> +				i, FDEV(i).path,
> +				FDEV(i).total_segments,
> +				FDEV(i).start_blk, FDEV(i).end_blk);
> +	}
> +	return 0;
> +}
> +
> static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
> {
> 	struct f2fs_sb_info *sbi;
> @@ -1724,15 +1816,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
> 			 "Zoned block device support is not enabled\n");
> 		goto free_sb_buf;
> 	}
> -#else
> -	if (bdev_zoned_model(sb->s_bdev) == BLK_ZONED_HM &&
> -	    !f2fs_sb_mounted_blkzoned(sb)) {
> -		f2fs_msg(sb, KERN_ERR,
> -			 "Zoned block device feature not enabled\n");
> -		goto free_sb_buf;
> -	}
> #endif
> -
> 	default_options(sbi);
> 	/* parse mount options */
> 	options = kstrdup((const char *)data, GFP_KERNEL);
> @@ -1802,6 +1886,13 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
> 		goto free_meta_inode;
> 	}
> 
> +	/* Initialize device list */
> +	err = f2fs_scan_devices(sbi);
> +	if (err) {
> +		f2fs_msg(sb, KERN_ERR, "Failed to find devices");
> +		goto free_devices;
> +	}
> +
> 	sbi->total_valid_node_count =
> 				le32_to_cpu(sbi->ckpt->valid_node_count);
> 	percpu_counter_set(&sbi->total_valid_inode_count,
> @@ -1820,15 +1911,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
> 
> 	init_ino_entry_info(sbi);
> 
> -#ifdef CONFIG_BLK_DEV_ZONED
> -	err = init_blkz_info(sbi);
> -	if (err) {
> -		f2fs_msg(sb, KERN_ERR,
> -			"Failed to initialize F2FS blkzone information");
> -		goto free_blkz;
> -	}
> -#endif
> -
> 	/* setup f2fs internal modules */
> 	err = build_segment_manager(sbi);
> 	if (err) {
> @@ -2007,10 +2089,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
> 	destroy_node_manager(sbi);
> free_sm:
> 	destroy_segment_manager(sbi);
> -#ifdef CONFIG_BLK_DEV_ZONED
> -free_blkz:
> -	kfree(sbi->blkz_type);
> -#endif
> +free_devices:
> +	destroy_device_list(sbi);
> 	kfree(sbi->ckpt);
> free_meta_inode:
> 	make_bad_inode(sbi->meta_inode);
> diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
> index 422630b..cea41a1 100644
> --- a/include/linux/f2fs_fs.h
> +++ b/include/linux/f2fs_fs.h
> @@ -52,10 +52,17 @@
> 
> #define VERSION_LEN	256
> #define MAX_VOLUME_NAME		512
> +#define MAX_PATH_LEN		64
> +#define MAX_DEVICES		8
> 
> /*
>  * For superblock
>  */
> +struct f2fs_device {
> +	__u8 path[MAX_PATH_LEN];
> +	__le32 total_segments;
> +} __packed;
> +
> struct f2fs_super_block {
> 	__le32 magic;			/* Magic Number */
> 	__le16 major_ver;		/* Major Version */
> @@ -94,7 +101,8 @@ struct f2fs_super_block {
> 	__le32 feature;			/* defined features */
> 	__u8 encryption_level;		/* versioning level for encryption */
> 	__u8 encrypt_pw_salt[16];	/* Salt used for string2key algorithm */
> -	__u8 reserved[871];		/* valid reserved region */
> +	struct f2fs_device devs[MAX_DEVICES];	/* device list */
> +	__u8 reserved[327];		/* valid reserved region */
> } __packed;
> 
> /*
> --
> 2.8.3

Cheers, Andreas

[-- Attachment #2: Message signed with OpenPGP using GPGMail --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] f2fs: support multiple devices
  2016-11-09 22:57 ` Andreas Dilger
@ 2016-11-09 23:05   ` Darrick J. Wong
  2016-11-10  1:12   ` Jaegeuk Kim
  2016-11-10  2:29   ` Qu Wenruo
  2 siblings, 0 replies; 7+ messages in thread
From: Darrick J. Wong @ 2016-11-09 23:05 UTC (permalink / raw)
  To: Andreas Dilger
  Cc: Jaegeuk Kim, LKML, Lustre Development, linux-fsdevel,
	linux-f2fs-devel, linux-btrfs

On Wed, Nov 09, 2016 at 03:57:53PM -0700, Andreas Dilger wrote:
> On Nov 9, 2016, at 1:56 PM, Jaegeuk Kim <jaegeuk@kernel.org> wrote:
> > 
> > This patch implements multiple devices support for f2fs.
> > Given multiple devices by mkfs.f2fs, f2fs shows them entirely as one big
> > volume under one f2fs instance.
> > 
> > Internal block management is very simple, but we will modify block
> > allocation and background GC policy to boost IO speed by exploiting them
> > according to each device speed.
> 
> How will you integrate this into FIEMAP?  Now that a file can be split
> across multiple devices, FIEMAP will return ambiguous block numbers for
> that file.  I've been meaning to merge the FIEMAP handling in
> Lustre to support multiple devices in a single filesystem, so that this
> can be detected in userspace.
> 
> struct ll_fiemap_extent {
>         __u64 fe_logical;  /* logical offset in bytes for the start of
>                             * the extent from the beginning of the file
>                             */
>         __u64 fe_physical; /* physical offset in bytes for the start
>                             * of the extent from the beginning of the disk
>                             */
>         __u64 fe_length;   /* length in bytes for this extent */
>         __u64 fe_reserved64[2];
>         __u32 fe_flags;    /* FIEMAP_EXTENT_* flags for this extent */
>         __u32 fe_device;   /* device number for this extent */
>         __u32 fe_reserved[2];
> };
> 
> This adds the 32-bit "fe_device" field, which would optionally be filled
> in by the filesystem (zero otherwise).  It would return the kernel device
> number (i.e. st_dev), or for network filesystem (with FIEMAP_EXTENT_NET
> set) this could just return an integer device number since the device
> number is meaningless (and may conflict) on a remote system.

Same field size and similar meaning as the fmr_device field in
GETFSMAP, at least for blockdev based filesystems.  Intriguing.... ;)

For GETFSMAP the number in fmr_device is either a device-unique 32-bit
cookie or the implementer can set a magic FMH_OF_DEV_T flags bit marking
it as an actual dev_t.

What does the fe_device number mean for network filesystems?  Magic unique
number?

> Since AFAIK Btrfs also has multiple device support there are an increasing
> number of places where this would be useful.

Ditto XFS.

--D

> 
> Cheers, Andreas
> 
> > 
> > Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> > ---
> > fs/f2fs/data.c          |  55 ++++++++++++++++---
> > fs/f2fs/f2fs.h          |  29 ++++++++--
> > fs/f2fs/segment.c       | 119 +++++++++++++++++++++++++++++------------
> > fs/f2fs/super.c         | 138 ++++++++++++++++++++++++++++++++++++++----------
> > include/linux/f2fs_fs.h |  10 +++-
> > 5 files changed, 277 insertions(+), 74 deletions(-)
> > 
> > diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
> > index 47ded0c..e2be24e 100644
> > --- a/fs/f2fs/data.c
> > +++ b/fs/f2fs/data.c
> > @@ -88,6 +88,46 @@ static void f2fs_write_end_io(struct bio *bio)
> > }
> > 
> > /*
> > + * Return true, if pre_bio's bdev is same as its target device.
> > + */
> > +struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi,
> > +				block_t blk_addr, struct bio *bio)
> > +{
> > +	struct block_device *bdev = sbi->sb->s_bdev;
> > +	int i;
> > +
> > +	for (i = 0; i < sbi->s_ndevs; i++) {
> > +		if (FDEV(i).start_blk <= blk_addr &&
> > +					FDEV(i).end_blk >= blk_addr) {
> > +			blk_addr -= FDEV(i).start_blk;
> > +			bdev = FDEV(i).bdev;
> > +			break;
> > +		}
> > +	}
> > +	if (bio) {
> > +		bio->bi_bdev = bdev;
> > +		bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
> > +	}
> > +	return bdev;
> > +}
> > +
> > +int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr)
> > +{
> > +	int i;
> > +
> > +	for (i = 0; i < sbi->s_ndevs; i++)
> > +		if (FDEV(i).start_blk <= blkaddr && FDEV(i).end_blk >= blkaddr)
> > +			return i;
> > +	return 0;
> > +}
> > +
> > +static bool __same_bdev(struct f2fs_sb_info *sbi,
> > +				block_t blk_addr, struct bio *bio)
> > +{
> > +	return f2fs_target_device(sbi, blk_addr, NULL) == bio->bi_bdev;
> > +}
> > +
> > +/*
> >  * Low-level block read/write IO operations.
> >  */
> > static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
> > @@ -97,8 +137,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
> > 
> > 	bio = f2fs_bio_alloc(npages);
> > 
> > -	bio->bi_bdev = sbi->sb->s_bdev;
> > -	bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
> > +	f2fs_target_device(sbi, blk_addr, bio);
> > 	bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
> > 	bio->bi_private = is_read ? NULL : sbi;
> > 
> > @@ -273,7 +312,8 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio)
> > 	down_write(&io->io_rwsem);
> > 
> > 	if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 ||
> > -	    (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags)))
> > +	    (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags) ||
> > +			!__same_bdev(sbi, fio->new_blkaddr, io->bio)))
> > 		__submit_merged_bio(io);
> > alloc_new:
> > 	if (io->bio == NULL) {
> > @@ -965,7 +1005,6 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr,
> > {
> > 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
> > 	struct fscrypt_ctx *ctx = NULL;
> > -	struct block_device *bdev = sbi->sb->s_bdev;
> > 	struct bio *bio;
> > 
> > 	if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
> > @@ -983,8 +1022,7 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr,
> > 			fscrypt_release_ctx(ctx);
> > 		return ERR_PTR(-ENOMEM);
> > 	}
> > -	bio->bi_bdev = bdev;
> > -	bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blkaddr);
> > +	f2fs_target_device(sbi, blkaddr, bio);
> > 	bio->bi_end_io = f2fs_read_end_io;
> > 	bio->bi_private = ctx;
> > 
> > @@ -1079,7 +1117,8 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
> > 		 * This page will go to BIO.  Do we need to send this
> > 		 * BIO off first?
> > 		 */
> > -		if (bio && (last_block_in_bio != block_nr - 1)) {
> > +		if (bio && (last_block_in_bio != block_nr - 1 ||
> > +			!__same_bdev(F2FS_I_SB(inode), block_nr, bio))) {
> > submit_and_realloc:
> > 			__submit_bio(F2FS_I_SB(inode), bio, DATA);
> > 			bio = NULL;
> > @@ -1738,6 +1777,8 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
> > 		return 0;
> > 	if (test_opt(F2FS_I_SB(inode), LFS))
> > 		return 0;
> > +	if (F2FS_I_SB(inode)->s_ndevs)
> > +		return 0;
> > 
> > 	trace_f2fs_direct_IO_enter(inode, offset, count, rw);
> > 
> > diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> > index 9650514..1737c45 100644
> > --- a/fs/f2fs/f2fs.h
> > +++ b/fs/f2fs/f2fs.h
> > @@ -730,6 +730,20 @@ struct f2fs_bio_info {
> > 	struct rw_semaphore io_rwsem;	/* blocking op for bio */
> > };
> > 
> > +#define FDEV(i)				(sbi->devs[i])
> > +#define RDEV(i)				(raw_super->devs[i])
> > +struct f2fs_dev_info {
> > +	struct block_device *bdev;
> > +	char path[MAX_PATH_LEN];
> > +	unsigned int total_segments;
> > +	block_t start_blk;
> > +	block_t end_blk;
> > +#ifdef CONFIG_BLK_DEV_ZONED
> > +	unsigned int nr_blkz;			/* Total number of zones */
> > +	u8 *blkz_type;				/* Array of zones type */
> > +#endif
> > +};
> > +
> > enum inode_type {
> > 	DIR_INODE,			/* for dirty dir inode */
> > 	FILE_INODE,			/* for dirty regular/symlink inode */
> > @@ -778,10 +792,8 @@ struct f2fs_sb_info {
> > #endif
> > 
> > #ifdef CONFIG_BLK_DEV_ZONED
> > -	unsigned int nr_blkz;			/* Total number of zones */
> > 	unsigned int blocks_per_blkz;		/* F2FS blocks per zone */
> > 	unsigned int log_blocks_per_blkz;	/* log2 F2FS blocks per zone */
> > -	u8 *blkz_type;				/* Array of zones type */
> > #endif
> > 
> > 	/* for node-related operations */
> > @@ -897,6 +909,8 @@ struct f2fs_sb_info {
> > 
> > 	/* For shrinker support */
> > 	struct list_head s_list;
> > +	int s_ndevs;				/* number of devices */
> > +	struct f2fs_dev_info *devs;		/* for device list */
> > 	struct mutex umount_mutex;
> > 	unsigned int shrinker_run_no;
> > 
> > @@ -2159,6 +2173,9 @@ void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *,
> > void f2fs_flush_merged_bios(struct f2fs_sb_info *);
> > int f2fs_submit_page_bio(struct f2fs_io_info *);
> > void f2fs_submit_page_mbio(struct f2fs_io_info *);
> > +struct block_device *f2fs_target_device(struct f2fs_sb_info *,
> > +				block_t, struct bio *);
> > +int f2fs_target_device_index(struct f2fs_sb_info *, block_t);
> > void set_data_blkaddr(struct dnode_of_data *);
> > void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t);
> > int reserve_new_blocks(struct dnode_of_data *, blkcnt_t);
> > @@ -2446,11 +2463,15 @@ static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb)
> > 
> > #ifdef CONFIG_BLK_DEV_ZONED
> > static inline int get_blkz_type(struct f2fs_sb_info *sbi,
> > -				block_t blkaddr)
> > +			struct block_device *bdev, block_t blkaddr)
> > {
> > 	unsigned int zno = blkaddr >> sbi->log_blocks_per_blkz;
> > +	int i;
> > 
> > -	return sbi->blkz_type[zno];
> > +	for (i = 0; i < sbi->s_ndevs; i++)
> > +		if (FDEV(i).bdev == bdev)
> > +			return FDEV(i).blkz_type[zno];
> > +	return -EINVAL;
> > }
> > #endif
> > 
> > diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
> > index 7fb7dd3..ef727d1 100644
> > --- a/fs/f2fs/segment.c
> > +++ b/fs/f2fs/segment.c
> > @@ -403,6 +403,33 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
> > 	}
> > }
> > 
> > +static int __submit_flush_wait(struct block_device *bdev)
> > +{
> > +	struct bio *bio = f2fs_bio_alloc(0);
> > +	int ret;
> > +
> > +	bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
> > +	bio->bi_bdev = bdev;
> > +	ret = submit_bio_wait(bio);
> > +	bio_put(bio);
> > +	return ret;
> > +}
> > +
> > +static int submit_flush_wait(struct f2fs_sb_info *sbi)
> > +{
> > +	int ret = __submit_flush_wait(sbi->sb->s_bdev);
> > +	int i;
> > +
> > +	if (sbi->s_ndevs && !ret) {
> > +		for (i = 1; i < sbi->s_ndevs; i++) {
> > +			ret = __submit_flush_wait(FDEV(i).bdev);
> > +			if (ret)
> > +				break;
> > +		}
> > +	}
> > +	return ret;
> > +}
> > +
> > static int issue_flush_thread(void *data)
> > {
> > 	struct f2fs_sb_info *sbi = data;
> > @@ -413,25 +440,18 @@ static int issue_flush_thread(void *data)
> > 		return 0;
> > 
> > 	if (!llist_empty(&fcc->issue_list)) {
> > -		struct bio *bio;
> > 		struct flush_cmd *cmd, *next;
> > 		int ret;
> > 
> > -		bio = f2fs_bio_alloc(0);
> > -
> > 		fcc->dispatch_list = llist_del_all(&fcc->issue_list);
> > 		fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list);
> > 
> > -		bio->bi_bdev = sbi->sb->s_bdev;
> > -		bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
> > -		ret = submit_bio_wait(bio);
> > -
> > +		ret = submit_flush_wait(sbi);
> > 		llist_for_each_entry_safe(cmd, next,
> > 					  fcc->dispatch_list, llnode) {
> > 			cmd->ret = ret;
> > 			complete(&cmd->wait);
> > 		}
> > -		bio_put(bio);
> > 		fcc->dispatch_list = NULL;
> > 	}
> > 
> > @@ -452,15 +472,11 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
> > 		return 0;
> > 
> > 	if (!test_opt(sbi, FLUSH_MERGE) || !atomic_read(&fcc->submit_flush)) {
> > -		struct bio *bio = f2fs_bio_alloc(0);
> > 		int ret;
> > 
> > 		atomic_inc(&fcc->submit_flush);
> > -		bio->bi_bdev = sbi->sb->s_bdev;
> > -		bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
> > -		ret = submit_bio_wait(bio);
> > +		ret = submit_flush_wait(sbi);
> > 		atomic_dec(&fcc->submit_flush);
> > -		bio_put(bio);
> > 		return ret;
> > 	}
> > 
> > @@ -637,14 +653,18 @@ static void f2fs_submit_bio_wait_endio(struct bio *bio)
> > 
> > /* this function is copied from blkdev_issue_discard from block/blk-lib.c */
> > static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi,
> > -				block_t blkstart, block_t blklen)
> > +		struct block_device *bdev, block_t blkstart, block_t blklen)
> > {
> > -	struct block_device *bdev = sbi->sb->s_bdev;
> > 	struct bio *bio = NULL;
> > 	int err;
> > 
> > 	trace_f2fs_issue_discard(sbi->sb, blkstart, blklen);
> > 
> > +	if (sbi->s_ndevs) {
> > +		int devi = f2fs_target_device_index(sbi, blkstart);
> > +
> > +		blkstart -= FDEV(devi).start_blk;
> > +	}
> > 	err = __blkdev_issue_discard(bdev,
> > 				SECTOR_FROM_BLOCK(blkstart),
> > 				SECTOR_FROM_BLOCK(blklen),
> > @@ -662,18 +682,24 @@ static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi,
> > }
> > 
> > #ifdef CONFIG_BLK_DEV_ZONED
> > -static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
> > -					block_t blkstart, block_t blklen)
> > +static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
> > +		struct block_device *bdev, block_t blkstart, block_t blklen)
> > {
> > -	sector_t sector = SECTOR_FROM_BLOCK(blkstart);
> > 	sector_t nr_sects = SECTOR_FROM_BLOCK(blklen);
> > -	struct block_device *bdev = sbi->sb->s_bdev;
> > +	sector_t sector;
> > +	int devi = 0;
> > 
> > -	if (nr_sects != bdev_zone_size(bdev)) {
> > +	if (sbi->s_ndevs) {
> > +		devi = f2fs_target_device_index(sbi, blkstart);
> > +		blkstart -= FDEV(devi).start_blk;
> > +	}
> > +	sector = SECTOR_FROM_BLOCK(blkstart);
> > +
> > +	if (sector % bdev_zone_size(bdev) || nr_sects != bdev_zone_size(bdev)) {
> > 		f2fs_msg(sbi->sb, KERN_INFO,
> > -			 "Unaligned discard attempted (sector %llu + %llu)",
> > -			 (unsigned long long)sector,
> > -			 (unsigned long long)nr_sects);
> > +			"(%d) %s: Unaligned discard attempted (block %x + %x)",
> > +			devi, sbi->s_ndevs ? FDEV(devi).path: "",
> > +			blkstart, blklen);
> > 		return -EIO;
> > 	}
> > 
> > @@ -682,14 +708,12 @@ static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
> > 	 * use regular discard if the drive supports it. For sequential
> > 	 * zones, reset the zone write pointer.
> > 	 */
> > -	switch (get_blkz_type(sbi, blkstart)) {
> > +	switch (get_blkz_type(sbi, bdev, blkstart)) {
> > 
> > 	case BLK_ZONE_TYPE_CONVENTIONAL:
> > 		if (!blk_queue_discard(bdev_get_queue(bdev)))
> > 			return 0;
> > -		return __f2fs_issue_discard_async(sbi, blkstart,
> > -						  blklen);
> > -
> > +		return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen);
> > 	case BLK_ZONE_TYPE_SEQWRITE_REQ:
> > 	case BLK_ZONE_TYPE_SEQWRITE_PREF:
> > 		trace_f2fs_issue_reset_zone(sbi->sb, blkstart);
> > @@ -702,14 +726,45 @@ static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
> > }
> > #endif
> > 
> > +static int __issue_discard_async(struct f2fs_sb_info *sbi,
> > +		struct block_device *bdev, block_t blkstart, block_t blklen)
> > +{
> > +#ifdef CONFIG_BLK_DEV_ZONED
> > +	if (f2fs_sb_mounted_blkzoned(sbi->sb) &&
> > +				bdev_zoned_model(bdev) != BLK_ZONED_NONE)
> > +		return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen);
> > +#endif
> > +	return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen);
> > +}
> > +
> > static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
> > 				block_t blkstart, block_t blklen)
> > {
> > +	sector_t start = blkstart, len = 0;
> > +	struct block_device *bdev;
> > 	struct seg_entry *se;
> > 	unsigned int offset;
> > 	block_t i;
> > +	int err = 0;
> > +
> > +	bdev = f2fs_target_device(sbi, blkstart, NULL);
> > +
> > +	for (i = blkstart; i < blkstart + blklen; i++, len++) {
> > +		if (i != start) {
> > +			struct block_device *bdev2 =
> > +				f2fs_target_device(sbi, i, NULL);
> > +
> > +			if (bdev2 != bdev) {
> > +				err = __issue_discard_async(sbi, bdev,
> > +						start, len);
> > +				if (err)
> > +					return err;
> > +				bdev = bdev2;
> > +				start = i;
> > +				len = 0;
> > +			}
> > +		}
> > 
> > -	for (i = blkstart; i < blkstart + blklen; i++) {
> > 		se = get_seg_entry(sbi, GET_SEGNO(sbi, i));
> > 		offset = GET_BLKOFF_FROM_SEG0(sbi, i);
> > 
> > @@ -717,11 +772,9 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
> > 			sbi->discard_blks--;
> > 	}
> > 
> > -#ifdef CONFIG_BLK_DEV_ZONED
> > -	if (f2fs_sb_mounted_blkzoned(sbi->sb))
> > -		return f2fs_issue_discard_zone(sbi, blkstart, blklen);
> > -#endif
> > -	return __f2fs_issue_discard_async(sbi, blkstart, blklen);
> > +	if (len)
> > +		err = __issue_discard_async(sbi, bdev, start, len);
> > +	return err;
> > }
> > 
> > static void __add_discard_entry(struct f2fs_sb_info *sbi,
> > diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
> > index eca9aea..4ccbb86 100644
> > --- a/fs/f2fs/super.c
> > +++ b/fs/f2fs/super.c
> > @@ -713,6 +713,19 @@ static void destroy_percpu_info(struct f2fs_sb_info *sbi)
> > 	percpu_counter_destroy(&sbi->total_valid_inode_count);
> > }
> > 
> > +static void destroy_device_list(struct f2fs_sb_info *sbi)
> > +{
> > +	int i;
> > +
> > +	for (i = 0; i < sbi->s_ndevs; i++) {
> > +		blkdev_put(FDEV(i).bdev, FMODE_EXCL);
> > +#ifdef CONFIG_BLK_DEV_ZONED
> > +		kfree(FDEV(i).blkz_type);
> > +#endif
> > +	}
> > +	kfree(sbi->devs);
> > +}
> > +
> > static void f2fs_put_super(struct super_block *sb)
> > {
> > 	struct f2fs_sb_info *sbi = F2FS_SB(sb);
> > @@ -773,6 +786,8 @@ static void f2fs_put_super(struct super_block *sb)
> > 		crypto_free_shash(sbi->s_chksum_driver);
> > 	kfree(sbi->raw_super);
> > 
> > +	destroy_device_list(sbi);
> > +
> > 	destroy_percpu_info(sbi);
> > 	kfree(sbi);
> > }
> > @@ -1516,9 +1531,9 @@ static int init_percpu_info(struct f2fs_sb_info *sbi)
> > }
> > 
> > #ifdef CONFIG_BLK_DEV_ZONED
> > -static int init_blkz_info(struct f2fs_sb_info *sbi)
> > +static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
> > {
> > -	struct block_device *bdev = sbi->sb->s_bdev;
> > +	struct block_device *bdev = FDEV(devi).bdev;
> > 	sector_t nr_sectors = bdev->bd_part->nr_sects;
> > 	sector_t sector = 0;
> > 	struct blk_zone *zones;
> > @@ -1529,15 +1544,21 @@ static int init_blkz_info(struct f2fs_sb_info *sbi)
> > 	if (!f2fs_sb_mounted_blkzoned(sbi->sb))
> > 		return 0;
> > 
> > +	if (sbi->blocks_per_blkz && sbi->blocks_per_blkz !=
> > +				SECTOR_TO_BLOCK(bdev_zone_size(bdev)))
> > +		return -EINVAL;
> > 	sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_size(bdev));
> > +	if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz !=
> > +				__ilog2_u32(sbi->blocks_per_blkz))
> > +		return -EINVAL;
> > 	sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz);
> > -	sbi->nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >>
> > -		sbi->log_blocks_per_blkz;
> > +	FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >>
> > +					sbi->log_blocks_per_blkz;
> > 	if (nr_sectors & (bdev_zone_size(bdev) - 1))
> > -		sbi->nr_blkz++;
> > +		FDEV(devi).nr_blkz++;
> > 
> > -	sbi->blkz_type = kmalloc(sbi->nr_blkz, GFP_KERNEL);
> > -	if (!sbi->blkz_type)
> > +	FDEV(devi).blkz_type = kmalloc(FDEV(devi).nr_blkz, GFP_KERNEL);
> > +	if (!FDEV(devi).blkz_type)
> > 		return -ENOMEM;
> > 
> > #define F2FS_REPORT_NR_ZONES   4096
> > @@ -1562,7 +1583,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi)
> > 		}
> > 
> > 		for (i = 0; i < nr_zones; i++) {
> > -			sbi->blkz_type[n] = zones[i].type;
> > +			FDEV(devi).blkz_type[n] = zones[i].type;
> > 			sector += zones[i].len;
> > 			n++;
> > 		}
> > @@ -1666,6 +1687,77 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover)
> > 	return err;
> > }
> > 
> > +static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
> > +{
> > +	struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
> > +	int i;
> > +
> > +	for (i = 0; i < MAX_DEVICES; i++) {
> > +		if (!RDEV(i).path[0])
> > +			return 0;
> > +
> > +		if (i == 0) {
> > +			sbi->devs = kzalloc(sizeof(struct f2fs_dev_info) *
> > +						MAX_DEVICES, GFP_KERNEL);
> > +			if (!sbi->devs)
> > +				return -ENOMEM;
> > +		}
> > +
> > +		memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN);
> > +		FDEV(i).total_segments = le32_to_cpu(RDEV(i).total_segments);
> > +		if (i == 0) {
> > +			FDEV(i).start_blk = 0;
> > +			FDEV(i).end_blk = FDEV(i).start_blk +
> > +				(FDEV(i).total_segments <<
> > +				sbi->log_blocks_per_seg) - 1 +
> > +				le32_to_cpu(raw_super->segment0_blkaddr);
> > +		} else {
> > +			FDEV(i).start_blk = FDEV(i - 1).end_blk + 1;
> > +			FDEV(i).end_blk = FDEV(i).start_blk +
> > +				(FDEV(i).total_segments <<
> > +				sbi->log_blocks_per_seg) - 1;
> > +		}
> > +
> > +		FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path,
> > +					sbi->sb->s_mode, sbi->sb->s_type);
> > +		if (IS_ERR(FDEV(i).bdev))
> > +			return PTR_ERR(FDEV(i).bdev);
> > +
> > +		/* to release errored devices */
> > +		sbi->s_ndevs = i + 1;
> > +
> > +#ifdef CONFIG_BLK_DEV_ZONED
> > +		if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM &&
> > +				!f2fs_sb_mounted_blkzoned(sbi->sb)) {
> > +			f2fs_msg(sbi->sb, KERN_ERR,
> > +				"Zoned block device feature not enabled\n");
> > +			return -EINVAL;
> > +		}
> > +		if (bdev_zoned_model(FDEV(i).bdev) != BLK_ZONED_NONE) {
> > +			if (init_blkz_info(sbi, i)) {
> > +				f2fs_msg(sbi->sb, KERN_ERR,
> > +					"Failed to initialize F2FS blkzone information");
> > +				return -EINVAL;
> > +			}
> > +			f2fs_msg(sbi->sb, KERN_INFO,
> > +				"Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)",
> > +				i, FDEV(i).path,
> > +				FDEV(i).total_segments,
> > +				FDEV(i).start_blk, FDEV(i).end_blk,
> > +				bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HA ?
> > +				"Host-aware" : "Host-managed");
> > +			continue;
> > +		}
> > +#endif
> > +		f2fs_msg(sbi->sb, KERN_INFO,
> > +			"Mount Device [%2d]: %20s, %8u, %8x - %8x",
> > +				i, FDEV(i).path,
> > +				FDEV(i).total_segments,
> > +				FDEV(i).start_blk, FDEV(i).end_blk);
> > +	}
> > +	return 0;
> > +}
> > +
> > static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
> > {
> > 	struct f2fs_sb_info *sbi;
> > @@ -1724,15 +1816,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
> > 			 "Zoned block device support is not enabled\n");
> > 		goto free_sb_buf;
> > 	}
> > -#else
> > -	if (bdev_zoned_model(sb->s_bdev) == BLK_ZONED_HM &&
> > -	    !f2fs_sb_mounted_blkzoned(sb)) {
> > -		f2fs_msg(sb, KERN_ERR,
> > -			 "Zoned block device feature not enabled\n");
> > -		goto free_sb_buf;
> > -	}
> > #endif
> > -
> > 	default_options(sbi);
> > 	/* parse mount options */
> > 	options = kstrdup((const char *)data, GFP_KERNEL);
> > @@ -1802,6 +1886,13 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
> > 		goto free_meta_inode;
> > 	}
> > 
> > +	/* Initialize device list */
> > +	err = f2fs_scan_devices(sbi);
> > +	if (err) {
> > +		f2fs_msg(sb, KERN_ERR, "Failed to find devices");
> > +		goto free_devices;
> > +	}
> > +
> > 	sbi->total_valid_node_count =
> > 				le32_to_cpu(sbi->ckpt->valid_node_count);
> > 	percpu_counter_set(&sbi->total_valid_inode_count,
> > @@ -1820,15 +1911,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
> > 
> > 	init_ino_entry_info(sbi);
> > 
> > -#ifdef CONFIG_BLK_DEV_ZONED
> > -	err = init_blkz_info(sbi);
> > -	if (err) {
> > -		f2fs_msg(sb, KERN_ERR,
> > -			"Failed to initialize F2FS blkzone information");
> > -		goto free_blkz;
> > -	}
> > -#endif
> > -
> > 	/* setup f2fs internal modules */
> > 	err = build_segment_manager(sbi);
> > 	if (err) {
> > @@ -2007,10 +2089,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
> > 	destroy_node_manager(sbi);
> > free_sm:
> > 	destroy_segment_manager(sbi);
> > -#ifdef CONFIG_BLK_DEV_ZONED
> > -free_blkz:
> > -	kfree(sbi->blkz_type);
> > -#endif
> > +free_devices:
> > +	destroy_device_list(sbi);
> > 	kfree(sbi->ckpt);
> > free_meta_inode:
> > 	make_bad_inode(sbi->meta_inode);
> > diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
> > index 422630b..cea41a1 100644
> > --- a/include/linux/f2fs_fs.h
> > +++ b/include/linux/f2fs_fs.h
> > @@ -52,10 +52,17 @@
> > 
> > #define VERSION_LEN	256
> > #define MAX_VOLUME_NAME		512
> > +#define MAX_PATH_LEN		64
> > +#define MAX_DEVICES		8
> > 
> > /*
> >  * For superblock
> >  */
> > +struct f2fs_device {
> > +	__u8 path[MAX_PATH_LEN];
> > +	__le32 total_segments;
> > +} __packed;
> > +
> > struct f2fs_super_block {
> > 	__le32 magic;			/* Magic Number */
> > 	__le16 major_ver;		/* Major Version */
> > @@ -94,7 +101,8 @@ struct f2fs_super_block {
> > 	__le32 feature;			/* defined features */
> > 	__u8 encryption_level;		/* versioning level for encryption */
> > 	__u8 encrypt_pw_salt[16];	/* Salt used for string2key algorithm */
> > -	__u8 reserved[871];		/* valid reserved region */
> > +	struct f2fs_device devs[MAX_DEVICES];	/* device list */
> > +	__u8 reserved[327];		/* valid reserved region */
> > } __packed;
> > 
> > /*
> > --
> > 2.8.3
> 
> Cheers, Andreas

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] f2fs: support multiple devices
  2016-11-09 22:57 ` Andreas Dilger
  2016-11-09 23:05   ` Darrick J. Wong
@ 2016-11-10  1:12   ` Jaegeuk Kim
  2016-11-10  2:29   ` Qu Wenruo
  2 siblings, 0 replies; 7+ messages in thread
From: Jaegeuk Kim @ 2016-11-10  1:12 UTC (permalink / raw)
  To: Andreas Dilger
  Cc: LKML, Lustre Development, linux-fsdevel, linux-f2fs-devel, linux-btrfs

On Wed, Nov 09, 2016 at 03:57:53PM -0700, Andreas Dilger wrote:
> On Nov 9, 2016, at 1:56 PM, Jaegeuk Kim <jaegeuk@kernel.org> wrote:
> > 
> > This patch implements multiple devices support for f2fs.
> > Given multiple devices by mkfs.f2fs, f2fs shows them entirely as one big
> > volume under one f2fs instance.
> > 
> > Internal block management is very simple, but we will modify block
> > allocation and background GC policy to boost IO speed by exploiting them
> > according to each device speed.
> 
> How will you integrate this into FIEMAP, since it is now possible if a
> file is split across multiple devices then it will return ambiguous block
> numbers for a file.  I've been meaning to merge the FIEMAP handling in
> Lustre to support multiple devices in a single filesystem, so that this
> can be detected in userspace.
> 
> struct ll_fiemap_extent {
>         __u64 fe_logical;  /* logical offset in bytes for the start of
>                             * the extent from the beginning of the file
>                             */
>         __u64 fe_physical; /* physical offset in bytes for the start
>                             * of the extent from the beginning of the disk
>                             */
>         __u64 fe_length;   /* length in bytes for this extent */
>         __u64 fe_reserved64[2];
>         __u32 fe_flags;    /* FIEMAP_EXTENT_* flags for this extent */
>         __u32 fe_device;   /* device number for this extent */
>         __u32 fe_reserved[2];
> };
> 
> This adds the 32-bit "fe_device" field, which would optionally be filled
> in by the filesystem (zero otherwise).  It would return the kernel device
> number (i.e. st_dev), or for network filesystem (with FIEMAP_EXTENT_NET
> set) this could just return an integer device number since the device
> number is meaningless (and may conflict) on a remote system.

Thank you for pointing this out. Indeed, I missed this case in this patch.
The fe_device would be good to handle this. Is there a plan to merge that
change? BTW, how about using __u64 given huge_encode_dev()?

> Since AFAIK Btrfs also has multiple device support there are an increasing
> number of places where this would be useful.

That's cool!
Actually, I think this would be very versatile for different types of storage
such as open-channel SSDs and SMR drives; this patch already handles HM-SMR.
In F2FS especially, the background cleaner can later migrate blocks seamlessly
across the devices.

Thanks,

> 
> Cheers, Andreas
> 
> > 
> > Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> > ---
> > fs/f2fs/data.c          |  55 ++++++++++++++++---
> > fs/f2fs/f2fs.h          |  29 ++++++++--
> > fs/f2fs/segment.c       | 119 +++++++++++++++++++++++++++++------------
> > fs/f2fs/super.c         | 138 ++++++++++++++++++++++++++++++++++++++----------
> > include/linux/f2fs_fs.h |  10 +++-
> > 5 files changed, 277 insertions(+), 74 deletions(-)
> > 
> > diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
> > index 47ded0c..e2be24e 100644
> > --- a/fs/f2fs/data.c
> > +++ b/fs/f2fs/data.c
> > @@ -88,6 +88,46 @@ static void f2fs_write_end_io(struct bio *bio)
> > }
> > 
> > /*
> > + * Return true, if pre_bio's bdev is same as its target device.
> > + */
> > +struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi,
> > +				block_t blk_addr, struct bio *bio)
> > +{
> > +	struct block_device *bdev = sbi->sb->s_bdev;
> > +	int i;
> > +
> > +	for (i = 0; i < sbi->s_ndevs; i++) {
> > +		if (FDEV(i).start_blk <= blk_addr &&
> > +					FDEV(i).end_blk >= blk_addr) {
> > +			blk_addr -= FDEV(i).start_blk;
> > +			bdev = FDEV(i).bdev;
> > +			break;
> > +		}
> > +	}
> > +	if (bio) {
> > +		bio->bi_bdev = bdev;
> > +		bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
> > +	}
> > +	return bdev;
> > +}
> > +
> > +int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr)
> > +{
> > +	int i;
> > +
> > +	for (i = 0; i < sbi->s_ndevs; i++)
> > +		if (FDEV(i).start_blk <= blkaddr && FDEV(i).end_blk >= blkaddr)
> > +			return i;
> > +	return 0;
> > +}
> > +
> > +static bool __same_bdev(struct f2fs_sb_info *sbi,
> > +				block_t blk_addr, struct bio *bio)
> > +{
> > +	return f2fs_target_device(sbi, blk_addr, NULL) == bio->bi_bdev;
> > +}
> > +
> > +/*
> >  * Low-level block read/write IO operations.
> >  */
> > static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
> > @@ -97,8 +137,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
> > 
> > 	bio = f2fs_bio_alloc(npages);
> > 
> > -	bio->bi_bdev = sbi->sb->s_bdev;
> > -	bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
> > +	f2fs_target_device(sbi, blk_addr, bio);
> > 	bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
> > 	bio->bi_private = is_read ? NULL : sbi;
> > 
> > @@ -273,7 +312,8 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio)
> > 	down_write(&io->io_rwsem);
> > 
> > 	if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 ||
> > -	    (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags)))
> > +	    (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags) ||
> > +			!__same_bdev(sbi, fio->new_blkaddr, io->bio)))
> > 		__submit_merged_bio(io);
> > alloc_new:
> > 	if (io->bio == NULL) {
> > @@ -965,7 +1005,6 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr,
> > {
> > 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
> > 	struct fscrypt_ctx *ctx = NULL;
> > -	struct block_device *bdev = sbi->sb->s_bdev;
> > 	struct bio *bio;
> > 
> > 	if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
> > @@ -983,8 +1022,7 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr,
> > 			fscrypt_release_ctx(ctx);
> > 		return ERR_PTR(-ENOMEM);
> > 	}
> > -	bio->bi_bdev = bdev;
> > -	bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blkaddr);
> > +	f2fs_target_device(sbi, blkaddr, bio);
> > 	bio->bi_end_io = f2fs_read_end_io;
> > 	bio->bi_private = ctx;
> > 
> > @@ -1079,7 +1117,8 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
> > 		 * This page will go to BIO.  Do we need to send this
> > 		 * BIO off first?
> > 		 */
> > -		if (bio && (last_block_in_bio != block_nr - 1)) {
> > +		if (bio && (last_block_in_bio != block_nr - 1 ||
> > +			!__same_bdev(F2FS_I_SB(inode), block_nr, bio))) {
> > submit_and_realloc:
> > 			__submit_bio(F2FS_I_SB(inode), bio, DATA);
> > 			bio = NULL;
> > @@ -1738,6 +1777,8 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
> > 		return 0;
> > 	if (test_opt(F2FS_I_SB(inode), LFS))
> > 		return 0;
> > +	if (F2FS_I_SB(inode)->s_ndevs)
> > +		return 0;
> > 
> > 	trace_f2fs_direct_IO_enter(inode, offset, count, rw);
> > 
> > diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> > index 9650514..1737c45 100644
> > --- a/fs/f2fs/f2fs.h
> > +++ b/fs/f2fs/f2fs.h
> > @@ -730,6 +730,20 @@ struct f2fs_bio_info {
> > 	struct rw_semaphore io_rwsem;	/* blocking op for bio */
> > };
> > 
> > +#define FDEV(i)				(sbi->devs[i])
> > +#define RDEV(i)				(raw_super->devs[i])
> > +struct f2fs_dev_info {
> > +	struct block_device *bdev;
> > +	char path[MAX_PATH_LEN];
> > +	unsigned int total_segments;
> > +	block_t start_blk;
> > +	block_t end_blk;
> > +#ifdef CONFIG_BLK_DEV_ZONED
> > +	unsigned int nr_blkz;			/* Total number of zones */
> > +	u8 *blkz_type;				/* Array of zones type */
> > +#endif
> > +};
> > +
> > enum inode_type {
> > 	DIR_INODE,			/* for dirty dir inode */
> > 	FILE_INODE,			/* for dirty regular/symlink inode */
> > @@ -778,10 +792,8 @@ struct f2fs_sb_info {
> > #endif
> > 
> > #ifdef CONFIG_BLK_DEV_ZONED
> > -	unsigned int nr_blkz;			/* Total number of zones */
> > 	unsigned int blocks_per_blkz;		/* F2FS blocks per zone */
> > 	unsigned int log_blocks_per_blkz;	/* log2 F2FS blocks per zone */
> > -	u8 *blkz_type;				/* Array of zones type */
> > #endif
> > 
> > 	/* for node-related operations */
> > @@ -897,6 +909,8 @@ struct f2fs_sb_info {
> > 
> > 	/* For shrinker support */
> > 	struct list_head s_list;
> > +	int s_ndevs;				/* number of devices */
> > +	struct f2fs_dev_info *devs;		/* for device list */
> > 	struct mutex umount_mutex;
> > 	unsigned int shrinker_run_no;
> > 
> > @@ -2159,6 +2173,9 @@ void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *,
> > void f2fs_flush_merged_bios(struct f2fs_sb_info *);
> > int f2fs_submit_page_bio(struct f2fs_io_info *);
> > void f2fs_submit_page_mbio(struct f2fs_io_info *);
> > +struct block_device *f2fs_target_device(struct f2fs_sb_info *,
> > +				block_t, struct bio *);
> > +int f2fs_target_device_index(struct f2fs_sb_info *, block_t);
> > void set_data_blkaddr(struct dnode_of_data *);
> > void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t);
> > int reserve_new_blocks(struct dnode_of_data *, blkcnt_t);
> > @@ -2446,11 +2463,15 @@ static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb)
> > 
> > #ifdef CONFIG_BLK_DEV_ZONED
> > static inline int get_blkz_type(struct f2fs_sb_info *sbi,
> > -				block_t blkaddr)
> > +			struct block_device *bdev, block_t blkaddr)
> > {
> > 	unsigned int zno = blkaddr >> sbi->log_blocks_per_blkz;
> > +	int i;
> > 
> > -	return sbi->blkz_type[zno];
> > +	for (i = 0; i < sbi->s_ndevs; i++)
> > +		if (FDEV(i).bdev == bdev)
> > +			return FDEV(i).blkz_type[zno];
> > +	return -EINVAL;
> > }
> > #endif
> > 
> > diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
> > index 7fb7dd3..ef727d1 100644
> > --- a/fs/f2fs/segment.c
> > +++ b/fs/f2fs/segment.c
> > @@ -403,6 +403,33 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
> > 	}
> > }
> > 
> > +static int __submit_flush_wait(struct block_device *bdev)
> > +{
> > +	struct bio *bio = f2fs_bio_alloc(0);
> > +	int ret;
> > +
> > +	bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
> > +	bio->bi_bdev = bdev;
> > +	ret = submit_bio_wait(bio);
> > +	bio_put(bio);
> > +	return ret;
> > +}
> > +
> > +static int submit_flush_wait(struct f2fs_sb_info *sbi)
> > +{
> > +	int ret = __submit_flush_wait(sbi->sb->s_bdev);
> > +	int i;
> > +
> > +	if (sbi->s_ndevs && !ret) {
> > +		for (i = 1; i < sbi->s_ndevs; i++) {
> > +			ret = __submit_flush_wait(FDEV(i).bdev);
> > +			if (ret)
> > +				break;
> > +		}
> > +	}
> > +	return ret;
> > +}
> > +
> > static int issue_flush_thread(void *data)
> > {
> > 	struct f2fs_sb_info *sbi = data;
> > @@ -413,25 +440,18 @@ static int issue_flush_thread(void *data)
> > 		return 0;
> > 
> > 	if (!llist_empty(&fcc->issue_list)) {
> > -		struct bio *bio;
> > 		struct flush_cmd *cmd, *next;
> > 		int ret;
> > 
> > -		bio = f2fs_bio_alloc(0);
> > -
> > 		fcc->dispatch_list = llist_del_all(&fcc->issue_list);
> > 		fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list);
> > 
> > -		bio->bi_bdev = sbi->sb->s_bdev;
> > -		bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
> > -		ret = submit_bio_wait(bio);
> > -
> > +		ret = submit_flush_wait(sbi);
> > 		llist_for_each_entry_safe(cmd, next,
> > 					  fcc->dispatch_list, llnode) {
> > 			cmd->ret = ret;
> > 			complete(&cmd->wait);
> > 		}
> > -		bio_put(bio);
> > 		fcc->dispatch_list = NULL;
> > 	}
> > 
> > @@ -452,15 +472,11 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
> > 		return 0;
> > 
> > 	if (!test_opt(sbi, FLUSH_MERGE) || !atomic_read(&fcc->submit_flush)) {
> > -		struct bio *bio = f2fs_bio_alloc(0);
> > 		int ret;
> > 
> > 		atomic_inc(&fcc->submit_flush);
> > -		bio->bi_bdev = sbi->sb->s_bdev;
> > -		bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
> > -		ret = submit_bio_wait(bio);
> > +		ret = submit_flush_wait(sbi);
> > 		atomic_dec(&fcc->submit_flush);
> > -		bio_put(bio);
> > 		return ret;
> > 	}
> > 
> > @@ -637,14 +653,18 @@ static void f2fs_submit_bio_wait_endio(struct bio *bio)
> > 
> > /* this function is copied from blkdev_issue_discard from block/blk-lib.c */
> > static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi,
> > -				block_t blkstart, block_t blklen)
> > +		struct block_device *bdev, block_t blkstart, block_t blklen)
> > {
> > -	struct block_device *bdev = sbi->sb->s_bdev;
> > 	struct bio *bio = NULL;
> > 	int err;
> > 
> > 	trace_f2fs_issue_discard(sbi->sb, blkstart, blklen);
> > 
> > +	if (sbi->s_ndevs) {
> > +		int devi = f2fs_target_device_index(sbi, blkstart);
> > +
> > +		blkstart -= FDEV(devi).start_blk;
> > +	}
> > 	err = __blkdev_issue_discard(bdev,
> > 				SECTOR_FROM_BLOCK(blkstart),
> > 				SECTOR_FROM_BLOCK(blklen),
> > @@ -662,18 +682,24 @@ static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi,
> > }
> > 
> > #ifdef CONFIG_BLK_DEV_ZONED
> > -static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
> > -					block_t blkstart, block_t blklen)
> > +static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
> > +		struct block_device *bdev, block_t blkstart, block_t blklen)
> > {
> > -	sector_t sector = SECTOR_FROM_BLOCK(blkstart);
> > 	sector_t nr_sects = SECTOR_FROM_BLOCK(blklen);
> > -	struct block_device *bdev = sbi->sb->s_bdev;
> > +	sector_t sector;
> > +	int devi = 0;
> > 
> > -	if (nr_sects != bdev_zone_size(bdev)) {
> > +	if (sbi->s_ndevs) {
> > +		devi = f2fs_target_device_index(sbi, blkstart);
> > +		blkstart -= FDEV(devi).start_blk;
> > +	}
> > +	sector = SECTOR_FROM_BLOCK(blkstart);
> > +
> > +	if (sector % bdev_zone_size(bdev) || nr_sects != bdev_zone_size(bdev)) {
> > 		f2fs_msg(sbi->sb, KERN_INFO,
> > -			 "Unaligned discard attempted (sector %llu + %llu)",
> > -			 (unsigned long long)sector,
> > -			 (unsigned long long)nr_sects);
> > +			"(%d) %s: Unaligned discard attempted (block %x + %x)",
> > +			devi, sbi->s_ndevs ? FDEV(devi).path: "",
> > +			blkstart, blklen);
> > 		return -EIO;
> > 	}
> > 
> > @@ -682,14 +708,12 @@ static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
> > 	 * use regular discard if the drive supports it. For sequential
> > 	 * zones, reset the zone write pointer.
> > 	 */
> > -	switch (get_blkz_type(sbi, blkstart)) {
> > +	switch (get_blkz_type(sbi, bdev, blkstart)) {
> > 
> > 	case BLK_ZONE_TYPE_CONVENTIONAL:
> > 		if (!blk_queue_discard(bdev_get_queue(bdev)))
> > 			return 0;
> > -		return __f2fs_issue_discard_async(sbi, blkstart,
> > -						  blklen);
> > -
> > +		return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen);
> > 	case BLK_ZONE_TYPE_SEQWRITE_REQ:
> > 	case BLK_ZONE_TYPE_SEQWRITE_PREF:
> > 		trace_f2fs_issue_reset_zone(sbi->sb, blkstart);
> > @@ -702,14 +726,45 @@ static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
> > }
> > #endif
> > 
> > +static int __issue_discard_async(struct f2fs_sb_info *sbi,
> > +		struct block_device *bdev, block_t blkstart, block_t blklen)
> > +{
> > +#ifdef CONFIG_BLK_DEV_ZONED
> > +	if (f2fs_sb_mounted_blkzoned(sbi->sb) &&
> > +				bdev_zoned_model(bdev) != BLK_ZONED_NONE)
> > +		return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen);
> > +#endif
> > +	return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen);
> > +}
> > +
> > static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
> > 				block_t blkstart, block_t blklen)
> > {
> > +	sector_t start = blkstart, len = 0;
> > +	struct block_device *bdev;
> > 	struct seg_entry *se;
> > 	unsigned int offset;
> > 	block_t i;
> > +	int err = 0;
> > +
> > +	bdev = f2fs_target_device(sbi, blkstart, NULL);
> > +
> > +	for (i = blkstart; i < blkstart + blklen; i++, len++) {
> > +		if (i != start) {
> > +			struct block_device *bdev2 =
> > +				f2fs_target_device(sbi, i, NULL);
> > +
> > +			if (bdev2 != bdev) {
> > +				err = __issue_discard_async(sbi, bdev,
> > +						start, len);
> > +				if (err)
> > +					return err;
> > +				bdev = bdev2;
> > +				start = i;
> > +				len = 0;
> > +			}
> > +		}
> > 
> > -	for (i = blkstart; i < blkstart + blklen; i++) {
> > 		se = get_seg_entry(sbi, GET_SEGNO(sbi, i));
> > 		offset = GET_BLKOFF_FROM_SEG0(sbi, i);
> > 
> > @@ -717,11 +772,9 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
> > 			sbi->discard_blks--;
> > 	}
> > 
> > -#ifdef CONFIG_BLK_DEV_ZONED
> > -	if (f2fs_sb_mounted_blkzoned(sbi->sb))
> > -		return f2fs_issue_discard_zone(sbi, blkstart, blklen);
> > -#endif
> > -	return __f2fs_issue_discard_async(sbi, blkstart, blklen);
> > +	if (len)
> > +		err = __issue_discard_async(sbi, bdev, start, len);
> > +	return err;
> > }
> > 
> > static void __add_discard_entry(struct f2fs_sb_info *sbi,
> > diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
> > index eca9aea..4ccbb86 100644
> > --- a/fs/f2fs/super.c
> > +++ b/fs/f2fs/super.c
> > @@ -713,6 +713,19 @@ static void destroy_percpu_info(struct f2fs_sb_info *sbi)
> > 	percpu_counter_destroy(&sbi->total_valid_inode_count);
> > }
> > 
> > +static void destroy_device_list(struct f2fs_sb_info *sbi)
> > +{
> > +	int i;
> > +
> > +	for (i = 0; i < sbi->s_ndevs; i++) {
> > +		blkdev_put(FDEV(i).bdev, FMODE_EXCL);
> > +#ifdef CONFIG_BLK_DEV_ZONED
> > +		kfree(FDEV(i).blkz_type);
> > +#endif
> > +	}
> > +	kfree(sbi->devs);
> > +}
> > +
> > static void f2fs_put_super(struct super_block *sb)
> > {
> > 	struct f2fs_sb_info *sbi = F2FS_SB(sb);
> > @@ -773,6 +786,8 @@ static void f2fs_put_super(struct super_block *sb)
> > 		crypto_free_shash(sbi->s_chksum_driver);
> > 	kfree(sbi->raw_super);
> > 
> > +	destroy_device_list(sbi);
> > +
> > 	destroy_percpu_info(sbi);
> > 	kfree(sbi);
> > }
> > @@ -1516,9 +1531,9 @@ static int init_percpu_info(struct f2fs_sb_info *sbi)
> > }
> > 
> > #ifdef CONFIG_BLK_DEV_ZONED
> > -static int init_blkz_info(struct f2fs_sb_info *sbi)
> > +static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
> > {
> > -	struct block_device *bdev = sbi->sb->s_bdev;
> > +	struct block_device *bdev = FDEV(devi).bdev;
> > 	sector_t nr_sectors = bdev->bd_part->nr_sects;
> > 	sector_t sector = 0;
> > 	struct blk_zone *zones;
> > @@ -1529,15 +1544,21 @@ static int init_blkz_info(struct f2fs_sb_info *sbi)
> > 	if (!f2fs_sb_mounted_blkzoned(sbi->sb))
> > 		return 0;
> > 
> > +	if (sbi->blocks_per_blkz && sbi->blocks_per_blkz !=
> > +				SECTOR_TO_BLOCK(bdev_zone_size(bdev)))
> > +		return -EINVAL;
> > 	sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_size(bdev));
> > +	if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz !=
> > +				__ilog2_u32(sbi->blocks_per_blkz))
> > +		return -EINVAL;
> > 	sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz);
> > -	sbi->nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >>
> > -		sbi->log_blocks_per_blkz;
> > +	FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >>
> > +					sbi->log_blocks_per_blkz;
> > 	if (nr_sectors & (bdev_zone_size(bdev) - 1))
> > -		sbi->nr_blkz++;
> > +		FDEV(devi).nr_blkz++;
> > 
> > -	sbi->blkz_type = kmalloc(sbi->nr_blkz, GFP_KERNEL);
> > -	if (!sbi->blkz_type)
> > +	FDEV(devi).blkz_type = kmalloc(FDEV(devi).nr_blkz, GFP_KERNEL);
> > +	if (!FDEV(devi).blkz_type)
> > 		return -ENOMEM;
> > 
> > #define F2FS_REPORT_NR_ZONES   4096
> > @@ -1562,7 +1583,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi)
> > 		}
> > 
> > 		for (i = 0; i < nr_zones; i++) {
> > -			sbi->blkz_type[n] = zones[i].type;
> > +			FDEV(devi).blkz_type[n] = zones[i].type;
> > 			sector += zones[i].len;
> > 			n++;
> > 		}
> > @@ -1666,6 +1687,77 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover)
> > 	return err;
> > }
> > 
> > +static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
> > +{
> > +	struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
> > +	int i;
> > +
> > +	for (i = 0; i < MAX_DEVICES; i++) {
> > +		if (!RDEV(i).path[0])
> > +			return 0;
> > +
> > +		if (i == 0) {
> > +			sbi->devs = kzalloc(sizeof(struct f2fs_dev_info) *
> > +						MAX_DEVICES, GFP_KERNEL);
> > +			if (!sbi->devs)
> > +				return -ENOMEM;
> > +		}
> > +
> > +		memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN);
> > +		FDEV(i).total_segments = le32_to_cpu(RDEV(i).total_segments);
> > +		if (i == 0) {
> > +			FDEV(i).start_blk = 0;
> > +			FDEV(i).end_blk = FDEV(i).start_blk +
> > +				(FDEV(i).total_segments <<
> > +				sbi->log_blocks_per_seg) - 1 +
> > +				le32_to_cpu(raw_super->segment0_blkaddr);
> > +		} else {
> > +			FDEV(i).start_blk = FDEV(i - 1).end_blk + 1;
> > +			FDEV(i).end_blk = FDEV(i).start_blk +
> > +				(FDEV(i).total_segments <<
> > +				sbi->log_blocks_per_seg) - 1;
> > +		}
> > +
> > +		FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path,
> > +					sbi->sb->s_mode, sbi->sb->s_type);
> > +		if (IS_ERR(FDEV(i).bdev))
> > +			return PTR_ERR(FDEV(i).bdev);
> > +
> > +		/* to release errored devices */
> > +		sbi->s_ndevs = i + 1;
> > +
> > +#ifdef CONFIG_BLK_DEV_ZONED
> > +		if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM &&
> > +				!f2fs_sb_mounted_blkzoned(sbi->sb)) {
> > +			f2fs_msg(sbi->sb, KERN_ERR,
> > +				"Zoned block device feature not enabled\n");
> > +			return -EINVAL;
> > +		}
> > +		if (bdev_zoned_model(FDEV(i).bdev) != BLK_ZONED_NONE) {
> > +			if (init_blkz_info(sbi, i)) {
> > +				f2fs_msg(sbi->sb, KERN_ERR,
> > +					"Failed to initialize F2FS blkzone information");
> > +				return -EINVAL;
> > +			}
> > +			f2fs_msg(sbi->sb, KERN_INFO,
> > +				"Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)",
> > +				i, FDEV(i).path,
> > +				FDEV(i).total_segments,
> > +				FDEV(i).start_blk, FDEV(i).end_blk,
> > +				bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HA ?
> > +				"Host-aware" : "Host-managed");
> > +			continue;
> > +		}
> > +#endif
> > +		f2fs_msg(sbi->sb, KERN_INFO,
> > +			"Mount Device [%2d]: %20s, %8u, %8x - %8x",
> > +				i, FDEV(i).path,
> > +				FDEV(i).total_segments,
> > +				FDEV(i).start_blk, FDEV(i).end_blk);
> > +	}
> > +	return 0;
> > +}
> > +
> > static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
> > {
> > 	struct f2fs_sb_info *sbi;
> > @@ -1724,15 +1816,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
> > 			 "Zoned block device support is not enabled\n");
> > 		goto free_sb_buf;
> > 	}
> > -#else
> > -	if (bdev_zoned_model(sb->s_bdev) == BLK_ZONED_HM &&
> > -	    !f2fs_sb_mounted_blkzoned(sb)) {
> > -		f2fs_msg(sb, KERN_ERR,
> > -			 "Zoned block device feature not enabled\n");
> > -		goto free_sb_buf;
> > -	}
> > #endif
> > -
> > 	default_options(sbi);
> > 	/* parse mount options */
> > 	options = kstrdup((const char *)data, GFP_KERNEL);
> > @@ -1802,6 +1886,13 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
> > 		goto free_meta_inode;
> > 	}
> > 
> > +	/* Initialize device list */
> > +	err = f2fs_scan_devices(sbi);
> > +	if (err) {
> > +		f2fs_msg(sb, KERN_ERR, "Failed to find devices");
> > +		goto free_devices;
> > +	}
> > +
> > 	sbi->total_valid_node_count =
> > 				le32_to_cpu(sbi->ckpt->valid_node_count);
> > 	percpu_counter_set(&sbi->total_valid_inode_count,
> > @@ -1820,15 +1911,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
> > 
> > 	init_ino_entry_info(sbi);
> > 
> > -#ifdef CONFIG_BLK_DEV_ZONED
> > -	err = init_blkz_info(sbi);
> > -	if (err) {
> > -		f2fs_msg(sb, KERN_ERR,
> > -			"Failed to initialize F2FS blkzone information");
> > -		goto free_blkz;
> > -	}
> > -#endif
> > -
> > 	/* setup f2fs internal modules */
> > 	err = build_segment_manager(sbi);
> > 	if (err) {
> > @@ -2007,10 +2089,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
> > 	destroy_node_manager(sbi);
> > free_sm:
> > 	destroy_segment_manager(sbi);
> > -#ifdef CONFIG_BLK_DEV_ZONED
> > -free_blkz:
> > -	kfree(sbi->blkz_type);
> > -#endif
> > +free_devices:
> > +	destroy_device_list(sbi);
> > 	kfree(sbi->ckpt);
> > free_meta_inode:
> > 	make_bad_inode(sbi->meta_inode);
> > diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
> > index 422630b..cea41a1 100644
> > --- a/include/linux/f2fs_fs.h
> > +++ b/include/linux/f2fs_fs.h
> > @@ -52,10 +52,17 @@
> > 
> > #define VERSION_LEN	256
> > #define MAX_VOLUME_NAME		512
> > +#define MAX_PATH_LEN		64
> > +#define MAX_DEVICES		8
> > 
> > /*
> >  * For superblock
> >  */
> > +struct f2fs_device {
> > +	__u8 path[MAX_PATH_LEN];
> > +	__le32 total_segments;
> > +} __packed;
> > +
> > struct f2fs_super_block {
> > 	__le32 magic;			/* Magic Number */
> > 	__le16 major_ver;		/* Major Version */
> > @@ -94,7 +101,8 @@ struct f2fs_super_block {
> > 	__le32 feature;			/* defined features */
> > 	__u8 encryption_level;		/* versioning level for encryption */
> > 	__u8 encrypt_pw_salt[16];	/* Salt used for string2key algorithm */
> > -	__u8 reserved[871];		/* valid reserved region */
> > +	struct f2fs_device devs[MAX_DEVICES];	/* device list */
> > +	__u8 reserved[327];		/* valid reserved region */
> > } __packed;
> > 
> > /*
> > --
> > 2.8.3
> 
> Cheers, Andreas

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] f2fs: support multiple devices
  2016-11-09 22:57 ` Andreas Dilger
  2016-11-09 23:05   ` Darrick J. Wong
  2016-11-10  1:12   ` Jaegeuk Kim
@ 2016-11-10  2:29   ` Qu Wenruo
  2016-11-10 12:25     ` Austin S. Hemmelgarn
  2 siblings, 1 reply; 7+ messages in thread
From: Qu Wenruo @ 2016-11-10  2:29 UTC (permalink / raw)
  To: Andreas Dilger, Jaegeuk Kim
  Cc: LKML, Lustre Development, linux-fsdevel, linux-f2fs-devel,
	linux-btrfs, Darrick J. Wong



At 11/10/2016 06:57 AM, Andreas Dilger wrote:
> On Nov 9, 2016, at 1:56 PM, Jaegeuk Kim <jaegeuk@kernel.org> wrote:
>>
>> This patch implements multiple devices support for f2fs.
>> Given multiple devices by mkfs.f2fs, f2fs shows them entirely as one big
>> volume under one f2fs instance.
>>
>> Internal block management is very simple, but we will modify block
>> allocation and background GC policy to boost IO speed by exploiting them
>> accoording to each device speed.
>
> How will you integrate this into FIEMAP, since it is now possible if a
> file is split across multiple devices then it will return ambiguous block
> numbers for a file.  I've been meaning to merge the FIEMAP handling in
> Lustre to support multiple devices in a single filesystem, so that this
> can be detected in userspace.
>
> struct ll_fiemap_extent {
>         __u64 fe_logical;  /* logical offset in bytes for the start of
>                             * the extent from the beginning of the file
>                             */
>         __u64 fe_physical; /* physical offset in bytes for the start
>                             * of the extent from the beginning of the disk
>                             */
>         __u64 fe_length;   /* length in bytes for this extent */
>         __u64 fe_reserved64[2];
>         __u32 fe_flags;    /* FIEMAP_EXTENT_* flags for this extent */
>         __u32 fe_device;   /* device number for this extent */
>         __u32 fe_reserved[2];
> };

Btrfs introduces a new layer for multi-device (even for a single device).

So the fiemap returned by btrfs is never a real device bytenr, but a
logical address in the btrfs logical address space, much like traditional
soft RAID.

>
> This adds the 32-bit "fe_device" field, which would optionally be filled
> in by the filesystem (zero otherwise).  It would return the kernel device
> number (i.e. st_dev), or for network filesystem (with FIEMAP_EXTENT_NET
> set) this could just return an integer device number since the device
> number is meaningless (and may conflict) on a remote system.
>
> Since AFAIK Btrfs also has multiple device support there are an increasing
> number of places where this would be useful.

AFAIK, btrfs multi-device is here due to scrub with its data/meta csum.

Unlike device-mapper based multi-device, btrfs has csums, so it can detect
which mirror is correct.
This makes btrfs scrub a little better than soft RAID.
For example, with RAID1, if two mirrors differ from each other, btrfs can
find the correct one and rewrite it to the other mirror.

Furthermore, btrfs supports snapshots and is faster than
device-mapper based snapshots (LVM).
This makes it a little more worthwhile to implement multi-device support in
btrfs.


But f2fs has no data csum and no snapshots.
I don't really see the point of using so much code to implement it,
especially since we can use mdadm or LVM to implement it.


Not to mention that btrfs multi-device support still has quite a lot of
bugs; for example, scrub can corrupt correct data stripes.

Personally speaking, I am not a fan of btrfs multi-device management,
despite the above advantages, as the complexity is really not worth it.
(So I think XFS with LVM is much better than Btrfs considering the
stability.)

Thanks,
Qu
>
> Cheers, Andreas
>
>>
>> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
>> ---
>> fs/f2fs/data.c          |  55 ++++++++++++++++---
>> fs/f2fs/f2fs.h          |  29 ++++++++--
>> fs/f2fs/segment.c       | 119 +++++++++++++++++++++++++++++------------
>> fs/f2fs/super.c         | 138 ++++++++++++++++++++++++++++++++++++++----------
>> include/linux/f2fs_fs.h |  10 +++-
>> 5 files changed, 277 insertions(+), 74 deletions(-)
>>
>> diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
>> index 47ded0c..e2be24e 100644
>> --- a/fs/f2fs/data.c
>> +++ b/fs/f2fs/data.c
>> @@ -88,6 +88,46 @@ static void f2fs_write_end_io(struct bio *bio)
>> }
>>
>> /*
>> + * Return true, if pre_bio's bdev is same as its target device.
>> + */
>> +struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi,
>> +				block_t blk_addr, struct bio *bio)
>> +{
>> +	struct block_device *bdev = sbi->sb->s_bdev;
>> +	int i;
>> +
>> +	for (i = 0; i < sbi->s_ndevs; i++) {
>> +		if (FDEV(i).start_blk <= blk_addr &&
>> +					FDEV(i).end_blk >= blk_addr) {
>> +			blk_addr -= FDEV(i).start_blk;
>> +			bdev = FDEV(i).bdev;
>> +			break;
>> +		}
>> +	}
>> +	if (bio) {
>> +		bio->bi_bdev = bdev;
>> +		bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
>> +	}
>> +	return bdev;
>> +}
>> +
>> +int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr)
>> +{
>> +	int i;
>> +
>> +	for (i = 0; i < sbi->s_ndevs; i++)
>> +		if (FDEV(i).start_blk <= blkaddr && FDEV(i).end_blk >= blkaddr)
>> +			return i;
>> +	return 0;
>> +}
>> +
>> +static bool __same_bdev(struct f2fs_sb_info *sbi,
>> +				block_t blk_addr, struct bio *bio)
>> +{
>> +	return f2fs_target_device(sbi, blk_addr, NULL) == bio->bi_bdev;
>> +}
>> +
>> +/*
>>  * Low-level block read/write IO operations.
>>  */
>> static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
>> @@ -97,8 +137,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
>>
>> 	bio = f2fs_bio_alloc(npages);
>>
>> -	bio->bi_bdev = sbi->sb->s_bdev;
>> -	bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
>> +	f2fs_target_device(sbi, blk_addr, bio);
>> 	bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
>> 	bio->bi_private = is_read ? NULL : sbi;
>>
>> @@ -273,7 +312,8 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio)
>> 	down_write(&io->io_rwsem);
>>
>> 	if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 ||
>> -	    (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags)))
>> +	    (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags) ||
>> +			!__same_bdev(sbi, fio->new_blkaddr, io->bio)))
>> 		__submit_merged_bio(io);
>> alloc_new:
>> 	if (io->bio == NULL) {
>> @@ -965,7 +1005,6 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr,
>> {
>> 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
>> 	struct fscrypt_ctx *ctx = NULL;
>> -	struct block_device *bdev = sbi->sb->s_bdev;
>> 	struct bio *bio;
>>
>> 	if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
>> @@ -983,8 +1022,7 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr,
>> 			fscrypt_release_ctx(ctx);
>> 		return ERR_PTR(-ENOMEM);
>> 	}
>> -	bio->bi_bdev = bdev;
>> -	bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blkaddr);
>> +	f2fs_target_device(sbi, blkaddr, bio);
>> 	bio->bi_end_io = f2fs_read_end_io;
>> 	bio->bi_private = ctx;
>>
>> @@ -1079,7 +1117,8 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
>> 		 * This page will go to BIO.  Do we need to send this
>> 		 * BIO off first?
>> 		 */
>> -		if (bio && (last_block_in_bio != block_nr - 1)) {
>> +		if (bio && (last_block_in_bio != block_nr - 1 ||
>> +			!__same_bdev(F2FS_I_SB(inode), block_nr, bio))) {
>> submit_and_realloc:
>> 			__submit_bio(F2FS_I_SB(inode), bio, DATA);
>> 			bio = NULL;
>> @@ -1738,6 +1777,8 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
>> 		return 0;
>> 	if (test_opt(F2FS_I_SB(inode), LFS))
>> 		return 0;
>> +	if (F2FS_I_SB(inode)->s_ndevs)
>> +		return 0;
>>
>> 	trace_f2fs_direct_IO_enter(inode, offset, count, rw);
>>
>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
>> index 9650514..1737c45 100644
>> --- a/fs/f2fs/f2fs.h
>> +++ b/fs/f2fs/f2fs.h
>> @@ -730,6 +730,20 @@ struct f2fs_bio_info {
>> 	struct rw_semaphore io_rwsem;	/* blocking op for bio */
>> };
>>
>> +#define FDEV(i)				(sbi->devs[i])
>> +#define RDEV(i)				(raw_super->devs[i])
>> +struct f2fs_dev_info {
>> +	struct block_device *bdev;
>> +	char path[MAX_PATH_LEN];
>> +	unsigned int total_segments;
>> +	block_t start_blk;
>> +	block_t end_blk;
>> +#ifdef CONFIG_BLK_DEV_ZONED
>> +	unsigned int nr_blkz;			/* Total number of zones */
>> +	u8 *blkz_type;				/* Array of zones type */
>> +#endif
>> +};
>> +
>> enum inode_type {
>> 	DIR_INODE,			/* for dirty dir inode */
>> 	FILE_INODE,			/* for dirty regular/symlink inode */
>> @@ -778,10 +792,8 @@ struct f2fs_sb_info {
>> #endif
>>
>> #ifdef CONFIG_BLK_DEV_ZONED
>> -	unsigned int nr_blkz;			/* Total number of zones */
>> 	unsigned int blocks_per_blkz;		/* F2FS blocks per zone */
>> 	unsigned int log_blocks_per_blkz;	/* log2 F2FS blocks per zone */
>> -	u8 *blkz_type;				/* Array of zones type */
>> #endif
>>
>> 	/* for node-related operations */
>> @@ -897,6 +909,8 @@ struct f2fs_sb_info {
>>
>> 	/* For shrinker support */
>> 	struct list_head s_list;
>> +	int s_ndevs;				/* number of devices */
>> +	struct f2fs_dev_info *devs;		/* for device list */
>> 	struct mutex umount_mutex;
>> 	unsigned int shrinker_run_no;
>>
>> @@ -2159,6 +2173,9 @@ void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *,
>> void f2fs_flush_merged_bios(struct f2fs_sb_info *);
>> int f2fs_submit_page_bio(struct f2fs_io_info *);
>> void f2fs_submit_page_mbio(struct f2fs_io_info *);
>> +struct block_device *f2fs_target_device(struct f2fs_sb_info *,
>> +				block_t, struct bio *);
>> +int f2fs_target_device_index(struct f2fs_sb_info *, block_t);
>> void set_data_blkaddr(struct dnode_of_data *);
>> void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t);
>> int reserve_new_blocks(struct dnode_of_data *, blkcnt_t);
>> @@ -2446,11 +2463,15 @@ static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb)
>>
>> #ifdef CONFIG_BLK_DEV_ZONED
>> static inline int get_blkz_type(struct f2fs_sb_info *sbi,
>> -				block_t blkaddr)
>> +			struct block_device *bdev, block_t blkaddr)
>> {
>> 	unsigned int zno = blkaddr >> sbi->log_blocks_per_blkz;
>> +	int i;
>>
>> -	return sbi->blkz_type[zno];
>> +	for (i = 0; i < sbi->s_ndevs; i++)
>> +		if (FDEV(i).bdev == bdev)
>> +			return FDEV(i).blkz_type[zno];
>> +	return -EINVAL;
>> }
>> #endif
>>
>> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
>> index 7fb7dd3..ef727d1 100644
>> --- a/fs/f2fs/segment.c
>> +++ b/fs/f2fs/segment.c
>> @@ -403,6 +403,33 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
>> 	}
>> }
>>
>> +static int __submit_flush_wait(struct block_device *bdev)
>> +{
>> +	struct bio *bio = f2fs_bio_alloc(0);
>> +	int ret;
>> +
>> +	bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
>> +	bio->bi_bdev = bdev;
>> +	ret = submit_bio_wait(bio);
>> +	bio_put(bio);
>> +	return ret;
>> +}
>> +
>> +static int submit_flush_wait(struct f2fs_sb_info *sbi)
>> +{
>> +	int ret = __submit_flush_wait(sbi->sb->s_bdev);
>> +	int i;
>> +
>> +	if (sbi->s_ndevs && !ret) {
>> +		for (i = 1; i < sbi->s_ndevs; i++) {
>> +			ret = __submit_flush_wait(FDEV(i).bdev);
>> +			if (ret)
>> +				break;
>> +		}
>> +	}
>> +	return ret;
>> +}
>> +
>> static int issue_flush_thread(void *data)
>> {
>> 	struct f2fs_sb_info *sbi = data;
>> @@ -413,25 +440,18 @@ static int issue_flush_thread(void *data)
>> 		return 0;
>>
>> 	if (!llist_empty(&fcc->issue_list)) {
>> -		struct bio *bio;
>> 		struct flush_cmd *cmd, *next;
>> 		int ret;
>>
>> -		bio = f2fs_bio_alloc(0);
>> -
>> 		fcc->dispatch_list = llist_del_all(&fcc->issue_list);
>> 		fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list);
>>
>> -		bio->bi_bdev = sbi->sb->s_bdev;
>> -		bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
>> -		ret = submit_bio_wait(bio);
>> -
>> +		ret = submit_flush_wait(sbi);
>> 		llist_for_each_entry_safe(cmd, next,
>> 					  fcc->dispatch_list, llnode) {
>> 			cmd->ret = ret;
>> 			complete(&cmd->wait);
>> 		}
>> -		bio_put(bio);
>> 		fcc->dispatch_list = NULL;
>> 	}
>>
>> @@ -452,15 +472,11 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
>> 		return 0;
>>
>> 	if (!test_opt(sbi, FLUSH_MERGE) || !atomic_read(&fcc->submit_flush)) {
>> -		struct bio *bio = f2fs_bio_alloc(0);
>> 		int ret;
>>
>> 		atomic_inc(&fcc->submit_flush);
>> -		bio->bi_bdev = sbi->sb->s_bdev;
>> -		bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
>> -		ret = submit_bio_wait(bio);
>> +		ret = submit_flush_wait(sbi);
>> 		atomic_dec(&fcc->submit_flush);
>> -		bio_put(bio);
>> 		return ret;
>> 	}
>>
>> @@ -637,14 +653,18 @@ static void f2fs_submit_bio_wait_endio(struct bio *bio)
>>
>> /* this function is copied from blkdev_issue_discard from block/blk-lib.c */
>> static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi,
>> -				block_t blkstart, block_t blklen)
>> +		struct block_device *bdev, block_t blkstart, block_t blklen)
>> {
>> -	struct block_device *bdev = sbi->sb->s_bdev;
>> 	struct bio *bio = NULL;
>> 	int err;
>>
>> 	trace_f2fs_issue_discard(sbi->sb, blkstart, blklen);
>>
>> +	if (sbi->s_ndevs) {
>> +		int devi = f2fs_target_device_index(sbi, blkstart);
>> +
>> +		blkstart -= FDEV(devi).start_blk;
>> +	}
>> 	err = __blkdev_issue_discard(bdev,
>> 				SECTOR_FROM_BLOCK(blkstart),
>> 				SECTOR_FROM_BLOCK(blklen),
>> @@ -662,18 +682,24 @@ static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi,
>> }
>>
>> #ifdef CONFIG_BLK_DEV_ZONED
>> -static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
>> -					block_t blkstart, block_t blklen)
>> +static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
>> +		struct block_device *bdev, block_t blkstart, block_t blklen)
>> {
>> -	sector_t sector = SECTOR_FROM_BLOCK(blkstart);
>> 	sector_t nr_sects = SECTOR_FROM_BLOCK(blklen);
>> -	struct block_device *bdev = sbi->sb->s_bdev;
>> +	sector_t sector;
>> +	int devi = 0;
>>
>> -	if (nr_sects != bdev_zone_size(bdev)) {
>> +	if (sbi->s_ndevs) {
>> +		devi = f2fs_target_device_index(sbi, blkstart);
>> +		blkstart -= FDEV(devi).start_blk;
>> +	}
>> +	sector = SECTOR_FROM_BLOCK(blkstart);
>> +
>> +	if (sector % bdev_zone_size(bdev) || nr_sects != bdev_zone_size(bdev)) {
>> 		f2fs_msg(sbi->sb, KERN_INFO,
>> -			 "Unaligned discard attempted (sector %llu + %llu)",
>> -			 (unsigned long long)sector,
>> -			 (unsigned long long)nr_sects);
>> +			"(%d) %s: Unaligned discard attempted (block %x + %x)",
>> +			devi, sbi->s_ndevs ? FDEV(devi).path: "",
>> +			blkstart, blklen);
>> 		return -EIO;
>> 	}
>>
>> @@ -682,14 +708,12 @@ static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
>> 	 * use regular discard if the drive supports it. For sequential
>> 	 * zones, reset the zone write pointer.
>> 	 */
>> -	switch (get_blkz_type(sbi, blkstart)) {
>> +	switch (get_blkz_type(sbi, bdev, blkstart)) {
>>
>> 	case BLK_ZONE_TYPE_CONVENTIONAL:
>> 		if (!blk_queue_discard(bdev_get_queue(bdev)))
>> 			return 0;
>> -		return __f2fs_issue_discard_async(sbi, blkstart,
>> -						  blklen);
>> -
>> +		return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen);
>> 	case BLK_ZONE_TYPE_SEQWRITE_REQ:
>> 	case BLK_ZONE_TYPE_SEQWRITE_PREF:
>> 		trace_f2fs_issue_reset_zone(sbi->sb, blkstart);
>> @@ -702,14 +726,45 @@ static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
>> }
>> #endif
>>
>> +static int __issue_discard_async(struct f2fs_sb_info *sbi,
>> +		struct block_device *bdev, block_t blkstart, block_t blklen)
>> +{
>> +#ifdef CONFIG_BLK_DEV_ZONED
>> +	if (f2fs_sb_mounted_blkzoned(sbi->sb) &&
>> +				bdev_zoned_model(bdev) != BLK_ZONED_NONE)
>> +		return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen);
>> +#endif
>> +	return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen);
>> +}
>> +
>> static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
>> 				block_t blkstart, block_t blklen)
>> {
>> +	sector_t start = blkstart, len = 0;
>> +	struct block_device *bdev;
>> 	struct seg_entry *se;
>> 	unsigned int offset;
>> 	block_t i;
>> +	int err = 0;
>> +
>> +	bdev = f2fs_target_device(sbi, blkstart, NULL);
>> +
>> +	for (i = blkstart; i < blkstart + blklen; i++, len++) {
>> +		if (i != start) {
>> +			struct block_device *bdev2 =
>> +				f2fs_target_device(sbi, i, NULL);
>> +
>> +			if (bdev2 != bdev) {
>> +				err = __issue_discard_async(sbi, bdev,
>> +						start, len);
>> +				if (err)
>> +					return err;
>> +				bdev = bdev2;
>> +				start = i;
>> +				len = 0;
>> +			}
>> +		}
>>
>> -	for (i = blkstart; i < blkstart + blklen; i++) {
>> 		se = get_seg_entry(sbi, GET_SEGNO(sbi, i));
>> 		offset = GET_BLKOFF_FROM_SEG0(sbi, i);
>>
>> @@ -717,11 +772,9 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
>> 			sbi->discard_blks--;
>> 	}
>>
>> -#ifdef CONFIG_BLK_DEV_ZONED
>> -	if (f2fs_sb_mounted_blkzoned(sbi->sb))
>> -		return f2fs_issue_discard_zone(sbi, blkstart, blklen);
>> -#endif
>> -	return __f2fs_issue_discard_async(sbi, blkstart, blklen);
>> +	if (len)
>> +		err = __issue_discard_async(sbi, bdev, start, len);
>> +	return err;
>> }
>>
>> static void __add_discard_entry(struct f2fs_sb_info *sbi,
>> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
>> index eca9aea..4ccbb86 100644
>> --- a/fs/f2fs/super.c
>> +++ b/fs/f2fs/super.c
>> @@ -713,6 +713,19 @@ static void destroy_percpu_info(struct f2fs_sb_info *sbi)
>> 	percpu_counter_destroy(&sbi->total_valid_inode_count);
>> }
>>
>> +static void destroy_device_list(struct f2fs_sb_info *sbi)
>> +{
>> +	int i;
>> +
>> +	for (i = 0; i < sbi->s_ndevs; i++) {
>> +		blkdev_put(FDEV(i).bdev, FMODE_EXCL);
>> +#ifdef CONFIG_BLK_DEV_ZONED
>> +		kfree(FDEV(i).blkz_type);
>> +#endif
>> +	}
>> +	kfree(sbi->devs);
>> +}
>> +
>> static void f2fs_put_super(struct super_block *sb)
>> {
>> 	struct f2fs_sb_info *sbi = F2FS_SB(sb);
>> @@ -773,6 +786,8 @@ static void f2fs_put_super(struct super_block *sb)
>> 		crypto_free_shash(sbi->s_chksum_driver);
>> 	kfree(sbi->raw_super);
>>
>> +	destroy_device_list(sbi);
>> +
>> 	destroy_percpu_info(sbi);
>> 	kfree(sbi);
>> }
>> @@ -1516,9 +1531,9 @@ static int init_percpu_info(struct f2fs_sb_info *sbi)
>> }
>>
>> #ifdef CONFIG_BLK_DEV_ZONED
>> -static int init_blkz_info(struct f2fs_sb_info *sbi)
>> +static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
>> {
>> -	struct block_device *bdev = sbi->sb->s_bdev;
>> +	struct block_device *bdev = FDEV(devi).bdev;
>> 	sector_t nr_sectors = bdev->bd_part->nr_sects;
>> 	sector_t sector = 0;
>> 	struct blk_zone *zones;
>> @@ -1529,15 +1544,21 @@ static int init_blkz_info(struct f2fs_sb_info *sbi)
>> 	if (!f2fs_sb_mounted_blkzoned(sbi->sb))
>> 		return 0;
>>
>> +	if (sbi->blocks_per_blkz && sbi->blocks_per_blkz !=
>> +				SECTOR_TO_BLOCK(bdev_zone_size(bdev)))
>> +		return -EINVAL;
>> 	sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_size(bdev));
>> +	if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz !=
>> +				__ilog2_u32(sbi->blocks_per_blkz))
>> +		return -EINVAL;
>> 	sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz);
>> -	sbi->nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >>
>> -		sbi->log_blocks_per_blkz;
>> +	FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >>
>> +					sbi->log_blocks_per_blkz;
>> 	if (nr_sectors & (bdev_zone_size(bdev) - 1))
>> -		sbi->nr_blkz++;
>> +		FDEV(devi).nr_blkz++;
>>
>> -	sbi->blkz_type = kmalloc(sbi->nr_blkz, GFP_KERNEL);
>> -	if (!sbi->blkz_type)
>> +	FDEV(devi).blkz_type = kmalloc(FDEV(devi).nr_blkz, GFP_KERNEL);
>> +	if (!FDEV(devi).blkz_type)
>> 		return -ENOMEM;
>>
>> #define F2FS_REPORT_NR_ZONES   4096
>> @@ -1562,7 +1583,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi)
>> 		}
>>
>> 		for (i = 0; i < nr_zones; i++) {
>> -			sbi->blkz_type[n] = zones[i].type;
>> +			FDEV(devi).blkz_type[n] = zones[i].type;
>> 			sector += zones[i].len;
>> 			n++;
>> 		}
>> @@ -1666,6 +1687,77 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover)
>> 	return err;
>> }
>>
>> +static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
>> +{
>> +	struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
>> +	int i;
>> +
>> +	for (i = 0; i < MAX_DEVICES; i++) {
>> +		if (!RDEV(i).path[0])
>> +			return 0;
>> +
>> +		if (i == 0) {
>> +			sbi->devs = kzalloc(sizeof(struct f2fs_dev_info) *
>> +						MAX_DEVICES, GFP_KERNEL);
>> +			if (!sbi->devs)
>> +				return -ENOMEM;
>> +		}
>> +
>> +		memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN);
>> +		FDEV(i).total_segments = le32_to_cpu(RDEV(i).total_segments);
>> +		if (i == 0) {
>> +			FDEV(i).start_blk = 0;
>> +			FDEV(i).end_blk = FDEV(i).start_blk +
>> +				(FDEV(i).total_segments <<
>> +				sbi->log_blocks_per_seg) - 1 +
>> +				le32_to_cpu(raw_super->segment0_blkaddr);
>> +		} else {
>> +			FDEV(i).start_blk = FDEV(i - 1).end_blk + 1;
>> +			FDEV(i).end_blk = FDEV(i).start_blk +
>> +				(FDEV(i).total_segments <<
>> +				sbi->log_blocks_per_seg) - 1;
>> +		}
>> +
>> +		FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path,
>> +					sbi->sb->s_mode, sbi->sb->s_type);
>> +		if (IS_ERR(FDEV(i).bdev))
>> +			return PTR_ERR(FDEV(i).bdev);
>> +
>> +		/* to release errored devices */
>> +		sbi->s_ndevs = i + 1;
>> +
>> +#ifdef CONFIG_BLK_DEV_ZONED
>> +		if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM &&
>> +				!f2fs_sb_mounted_blkzoned(sbi->sb)) {
>> +			f2fs_msg(sbi->sb, KERN_ERR,
>> +				"Zoned block device feature not enabled\n");
>> +			return -EINVAL;
>> +		}
>> +		if (bdev_zoned_model(FDEV(i).bdev) != BLK_ZONED_NONE) {
>> +			if (init_blkz_info(sbi, i)) {
>> +				f2fs_msg(sbi->sb, KERN_ERR,
>> +					"Failed to initialize F2FS blkzone information");
>> +				return -EINVAL;
>> +			}
>> +			f2fs_msg(sbi->sb, KERN_INFO,
>> +				"Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)",
>> +				i, FDEV(i).path,
>> +				FDEV(i).total_segments,
>> +				FDEV(i).start_blk, FDEV(i).end_blk,
>> +				bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HA ?
>> +				"Host-aware" : "Host-managed");
>> +			continue;
>> +		}
>> +#endif
>> +		f2fs_msg(sbi->sb, KERN_INFO,
>> +			"Mount Device [%2d]: %20s, %8u, %8x - %8x",
>> +				i, FDEV(i).path,
>> +				FDEV(i).total_segments,
>> +				FDEV(i).start_blk, FDEV(i).end_blk);
>> +	}
>> +	return 0;
>> +}
>> +
>> static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
>> {
>> 	struct f2fs_sb_info *sbi;
>> @@ -1724,15 +1816,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
>> 			 "Zoned block device support is not enabled\n");
>> 		goto free_sb_buf;
>> 	}
>> -#else
>> -	if (bdev_zoned_model(sb->s_bdev) == BLK_ZONED_HM &&
>> -	    !f2fs_sb_mounted_blkzoned(sb)) {
>> -		f2fs_msg(sb, KERN_ERR,
>> -			 "Zoned block device feature not enabled\n");
>> -		goto free_sb_buf;
>> -	}
>> #endif
>> -
>> 	default_options(sbi);
>> 	/* parse mount options */
>> 	options = kstrdup((const char *)data, GFP_KERNEL);
>> @@ -1802,6 +1886,13 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
>> 		goto free_meta_inode;
>> 	}
>>
>> +	/* Initialize device list */
>> +	err = f2fs_scan_devices(sbi);
>> +	if (err) {
>> +		f2fs_msg(sb, KERN_ERR, "Failed to find devices");
>> +		goto free_devices;
>> +	}
>> +
>> 	sbi->total_valid_node_count =
>> 				le32_to_cpu(sbi->ckpt->valid_node_count);
>> 	percpu_counter_set(&sbi->total_valid_inode_count,
>> @@ -1820,15 +1911,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
>>
>> 	init_ino_entry_info(sbi);
>>
>> -#ifdef CONFIG_BLK_DEV_ZONED
>> -	err = init_blkz_info(sbi);
>> -	if (err) {
>> -		f2fs_msg(sb, KERN_ERR,
>> -			"Failed to initialize F2FS blkzone information");
>> -		goto free_blkz;
>> -	}
>> -#endif
>> -
>> 	/* setup f2fs internal modules */
>> 	err = build_segment_manager(sbi);
>> 	if (err) {
>> @@ -2007,10 +2089,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
>> 	destroy_node_manager(sbi);
>> free_sm:
>> 	destroy_segment_manager(sbi);
>> -#ifdef CONFIG_BLK_DEV_ZONED
>> -free_blkz:
>> -	kfree(sbi->blkz_type);
>> -#endif
>> +free_devices:
>> +	destroy_device_list(sbi);
>> 	kfree(sbi->ckpt);
>> free_meta_inode:
>> 	make_bad_inode(sbi->meta_inode);
>> diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
>> index 422630b..cea41a1 100644
>> --- a/include/linux/f2fs_fs.h
>> +++ b/include/linux/f2fs_fs.h
>> @@ -52,10 +52,17 @@
>>
>> #define VERSION_LEN	256
>> #define MAX_VOLUME_NAME		512
>> +#define MAX_PATH_LEN		64
>> +#define MAX_DEVICES		8
>>
>> /*
>>  * For superblock
>>  */
>> +struct f2fs_device {
>> +	__u8 path[MAX_PATH_LEN];
>> +	__le32 total_segments;
>> +} __packed;
>> +
>> struct f2fs_super_block {
>> 	__le32 magic;			/* Magic Number */
>> 	__le16 major_ver;		/* Major Version */
>> @@ -94,7 +101,8 @@ struct f2fs_super_block {
>> 	__le32 feature;			/* defined features */
>> 	__u8 encryption_level;		/* versioning level for encryption */
>> 	__u8 encrypt_pw_salt[16];	/* Salt used for string2key algorithm */
>> -	__u8 reserved[871];		/* valid reserved region */
>> +	struct f2fs_device devs[MAX_DEVICES];	/* device list */
>> +	__u8 reserved[327];		/* valid reserved region */
>> } __packed;
>>
>> /*
>> --
>> 2.8.3
>
> Cheers, Andreas
>

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] f2fs: support multiple devices
  2016-11-10  2:29   ` Qu Wenruo
@ 2016-11-10 12:25     ` Austin S. Hemmelgarn
  2016-11-10 16:19       ` Anand Jain
  0 siblings, 1 reply; 7+ messages in thread
From: Austin S. Hemmelgarn @ 2016-11-10 12:25 UTC (permalink / raw)
  To: Qu Wenruo, Andreas Dilger, Jaegeuk Kim
  Cc: LKML, Lustre Development, linux-fsdevel, linux-f2fs-devel,
	linux-btrfs, Darrick J. Wong

On 2016-11-09 21:29, Qu Wenruo wrote:
>
>
> At 11/10/2016 06:57 AM, Andreas Dilger wrote:
>> On Nov 9, 2016, at 1:56 PM, Jaegeuk Kim <jaegeuk@kernel.org> wrote:
>>>
>>> This patch implements multiple devices support for f2fs.
>>> Given multiple devices by mkfs.f2fs, f2fs shows them entirely as one big
>>> volume under one f2fs instance.
>>>
>>> Internal block management is very simple, but we will modify block
>>> allocation and background GC policy to boost IO speed by exploiting them
>>> accoording to each device speed.
>>
>> How will you integrate this into FIEMAP, since it is now possible if a
>> file is split across multiple devices then it will return ambiguous block
>> numbers for a file.  I've been meaning to merge the FIEMAP handling in
>> Lustre to support multiple devices in a single filesystem, so that this
>> can be detected in userspace.
>>
>> struct ll_fiemap_extent {
>>         __u64 fe_logical;  /* logical offset in bytes for the start of
>>                             * the extent from the beginning of the file
>>                             */
>>         __u64 fe_physical; /* physical offset in bytes for the start
>>                             * of the extent from the beginning of the
>> disk
>>                             */
>>         __u64 fe_length;   /* length in bytes for this extent */
>>         __u64 fe_reserved64[2];
>>         __u32 fe_flags;    /* FIEMAP_EXTENT_* flags for this extent */
>>         __u32 fe_device;   /* device number for this extent */
>>         __u32 fe_reserved[2];
>> };
>
> Btrfs introduce a new layer for multi-device (even for single device).
>
> So fiemap returned by btrfs is never real device bytenr, but logical
> address in btrfs logical address space.
> Much like traditional soft RAID.
This is a really important point.  BTRFS does a good job of segregating 
the layers here, so the file-level allocator really has very limited 
knowledge of the underlying storage, which in turn means that adding 
this to BTRFS would likely be a pretty invasive change for the FIEMAP 
implementation.
>
>>
>> This adds the 32-bit "fe_device" field, which would optionally be filled
>> in by the filesystem (zero otherwise).  It would return the kernel device
>> number (i.e. st_dev), or for network filesystem (with FIEMAP_EXTENT_NET
>> set) this could just return an integer device number since the device
>> number is meaningless (and may conflict) on a remote system.
>>
>> Since AFAIK Btrfs also has multiple device support there are an
>> increasing
>> number of places where this would be useful.
>
> AFAIK, btrfs multi-device is here due to scrub with its data/meta csum.
It's also here for an attempt at parity with ZFS.
>
> Unlike device-mapper based multi-device, btrfs has csum so it can detect
> which mirror is correct.
> This makes btrfs scrub a little better than soft raid.
> For example, for RAID1 if two mirror differs from each other, btrfs can
> find the correct one and rewrite it into the other mirror.
>
> And further more, btrfs supports snapshot and is faster than
> device-mapper based snapshot(LVM).
> This makes it a little more worthy to implement multi-device support in
> btrfs.
>
>
> But for f2fs, no data csum, no snapshot.
> I don't really see the point to use so many codes to implement it,
> especially we can use mdadm or LVM to implement it.
I'd tend to agree on this, if it weren't for the fact that this looks to 
me like preparation for implementing storage tiering, which neither LVM 
nor MD have a good implementation of.  Whether or not such functionality 
is worthwhile for the embedded systems that F2FS typically targets is 
another story of course.
>
>
> Not to mention btrfs multi-device support still has quite a lot of bugs,
> like scrub can corrupt correct data stripes.
This sounds like you're lumping raid5/6 code in with the general 
multi-device code, which is not a good way of describing things for 
multiple reasons.  Pretty much, if you're using just raid1 mode, without 
compression, on reasonable storage devices, things are rock-solid 
relative to the rest of BTRFS.

Yes, there is a bug with compression and multiple copies of things, but 
that requires a pretty spectacular device failure to manifest, and it 
impacts single device mode too (it happens in dup profiles as well as 
raid1).  As far as the raid5/6 stuff, that shouldn't have been merged in 
the state it was in when it got merged, and should probably just be 
rewritten from the ground up.
>
> Personally speaking, I am not a fan of btrfs multi-device management,
> despite the above advantage.
> As the complexity is really not worthy.
> (So I think XFS with LVM is much better than Btrfs considering the
> stability)

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] f2fs: support multiple devices
  2016-11-10 12:25     ` Austin S. Hemmelgarn
@ 2016-11-10 16:19       ` Anand Jain
  0 siblings, 0 replies; 7+ messages in thread
From: Anand Jain @ 2016-11-10 16:19 UTC (permalink / raw)
  To: Austin S. Hemmelgarn, Qu Wenruo, Andreas Dilger, Jaegeuk Kim
  Cc: LKML, Lustre Development, linux-fsdevel, linux-f2fs-devel,
	linux-btrfs, Darrick J. Wong



(this is deviating from the subject, sorry about that)

>  Pretty much, if you're using just raid1 mode, without
> compression, on reasonable storage devices, things are rock-solid
> relative to the rest of BTRFS.

IMO, the BTRFS volume manager feature is incomplete and there is a critical
RAID1 bug which affects availability, so it's not suitable for
enterprise solutions yet. However, it should be fine in a setup
where a dedicated sysadmin and maintenance downtime are an option.

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2016-11-10 16:16 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-11-09 20:56 [PATCH] f2fs: support multiple devices Jaegeuk Kim
2016-11-09 22:57 ` Andreas Dilger
2016-11-09 23:05   ` Darrick J. Wong
2016-11-10  1:12   ` Jaegeuk Kim
2016-11-10  2:29   ` Qu Wenruo
2016-11-10 12:25     ` Austin S. Hemmelgarn
2016-11-10 16:19       ` Anand Jain

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).