linux-ext4.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 01/11] ext4: add handling for extended mount options
@ 2019-07-22  4:00 Harshad Shirwadkar
  2019-07-22  4:00 ` [PATCH 02/11] jbd2: add fast commit fields to journal_s structure Harshad Shirwadkar
                   ` (10 more replies)
  0 siblings, 11 replies; 25+ messages in thread
From: Harshad Shirwadkar @ 2019-07-22  4:00 UTC (permalink / raw)
  To: linux-ext4; +Cc: Harshad Shirwadkar

We are running out of mount option bits. This patch adds handling for
using s_mount_opt2 and also adds ability to turn on / off the fast
commit feature. In order to use fast commits, new version e2fsprogs
needs to set the fast feature commit flag. This also makes sure that
we have fast commit compatible e2fsprogs before starting to use the
feature. Mount flag "no_fastcommit", introuced in this patch, can be
passed to disable the feature at mount time.

Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
---
 fs/ext4/ext4.h       |  4 ++++
 fs/ext4/super.c      | 27 ++++++++++++++++++++++-----
 include/linux/jbd2.h |  5 ++++-
 3 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bf660aa7a9e0..becbda38b7db 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1146,6 +1146,8 @@ struct ext4_inode_info {
 #define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM	0x00000008 /* User explicitly
 						specified journal checksum */
 
+#define EXT4_MOUNT2_JOURNAL_FAST_COMMIT	0x00000010 /* Journal fast commit */
+
 #define clear_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt &= \
 						~EXT4_MOUNT_##opt
 #define set_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt |= \
@@ -1643,6 +1645,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_COMPAT_RESIZE_INODE	0x0010
 #define EXT4_FEATURE_COMPAT_DIR_INDEX		0x0020
 #define EXT4_FEATURE_COMPAT_SPARSE_SUPER2	0x0200
+#define EXT4_FEATURE_COMPAT_FAST_COMMIT		0x0400
 
 #define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER	0x0001
 #define EXT4_FEATURE_RO_COMPAT_LARGE_FILE	0x0002
@@ -1743,6 +1746,7 @@ EXT4_FEATURE_COMPAT_FUNCS(xattr,		EXT_ATTR)
 EXT4_FEATURE_COMPAT_FUNCS(resize_inode,		RESIZE_INODE)
 EXT4_FEATURE_COMPAT_FUNCS(dir_index,		DIR_INDEX)
 EXT4_FEATURE_COMPAT_FUNCS(sparse_super2,	SPARSE_SUPER2)
+EXT4_FEATURE_COMPAT_FUNCS(fast_commit,		FAST_COMMIT)
 
 EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super,	SPARSE_SUPER)
 EXT4_FEATURE_RO_COMPAT_FUNCS(large_file,	LARGE_FILE)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4079605d437a..e376ac040cce 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1455,6 +1455,7 @@ enum {
 	Opt_dioread_nolock, Opt_dioread_lock,
 	Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
 	Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
+	Opt_no_fastcommit
 };
 
 static const match_table_t tokens = {
@@ -1537,6 +1538,7 @@ static const match_table_t tokens = {
 	{Opt_init_itable, "init_itable=%u"},
 	{Opt_init_itable, "init_itable"},
 	{Opt_noinit_itable, "noinit_itable"},
+	{Opt_no_fastcommit, "no_fastcommit"},
 	{Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
 	{Opt_test_dummy_encryption, "test_dummy_encryption"},
 	{Opt_nombcache, "nombcache"},
@@ -1659,6 +1661,7 @@ static int clear_qf_name(struct super_block *sb, int qtype)
 #define MOPT_NO_EXT3	0x0200
 #define MOPT_EXT4_ONLY	(MOPT_NO_EXT2 | MOPT_NO_EXT3)
 #define MOPT_STRING	0x0400
+#define MOPT_2		0x0800
 
 static const struct mount_opts {
 	int	token;
@@ -1751,6 +1754,8 @@ static const struct mount_opts {
 	{Opt_max_dir_size_kb, 0, MOPT_GTE0},
 	{Opt_test_dummy_encryption, 0, MOPT_GTE0},
 	{Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
+	{Opt_no_fastcommit, EXT4_MOUNT2_JOURNAL_FAST_COMMIT,
+	 MOPT_CLEAR | MOPT_2 | MOPT_EXT4_ONLY},
 	{Opt_err, 0, 0}
 };
 
@@ -1858,8 +1863,9 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 			set_opt2(sb, EXPLICIT_DELALLOC);
 		} else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) {
 			set_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM);
-		} else
+		} else if (m->mount_opt) {
 			return -1;
+		}
 	}
 	if (m->flags & MOPT_CLEAR_ERR)
 		clear_opt(sb, ERRORS_MASK);
@@ -2027,10 +2033,17 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 			WARN_ON(1);
 			return -1;
 		}
-		if (arg != 0)
-			sbi->s_mount_opt |= m->mount_opt;
-		else
-			sbi->s_mount_opt &= ~m->mount_opt;
+		if (m->flags & MOPT_2) {
+			if (arg != 0)
+				sbi->s_mount_opt2 |= m->mount_opt;
+			else
+				sbi->s_mount_opt2 &= ~m->mount_opt;
+		} else {
+			if (arg != 0)
+				sbi->s_mount_opt |= m->mount_opt;
+			else
+				sbi->s_mount_opt &= ~m->mount_opt;
+		}
 	}
 	return 1;
 }
@@ -3733,6 +3746,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
 	set_opt(sb, POSIX_ACL);
 #endif
+	if (ext4_has_feature_fast_commit(sb))
+		set_opt2(sb, JOURNAL_FAST_COMMIT);
+
 	/* don't forget to enable journal_csum when metadata_csum is enabled. */
 	if (ext4_has_metadata_csum(sb))
 		set_opt(sb, JOURNAL_CHECKSUM);
@@ -4334,6 +4350,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM;
 		clear_opt(sb, JOURNAL_CHECKSUM);
 		clear_opt(sb, DATA_FLAGS);
+		clear_opt2(sb, JOURNAL_FAST_COMMIT);
 		sbi->s_journal = NULL;
 		needs_recovery = 0;
 		goto no_journal;
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index df03825ad1a1..b7eed49b8ecd 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -288,6 +288,7 @@ typedef struct journal_superblock_s
 #define JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT	0x00000004
 #define JBD2_FEATURE_INCOMPAT_CSUM_V2		0x00000008
 #define JBD2_FEATURE_INCOMPAT_CSUM_V3		0x00000010
+#define JBD2_FEATURE_INCOMPAT_FAST_COMMIT	0x00000020
 
 /* See "journal feature predicate functions" below */
 
@@ -298,7 +299,8 @@ typedef struct journal_superblock_s
 					JBD2_FEATURE_INCOMPAT_64BIT | \
 					JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | \
 					JBD2_FEATURE_INCOMPAT_CSUM_V2 | \
-					JBD2_FEATURE_INCOMPAT_CSUM_V3)
+					JBD2_FEATURE_INCOMPAT_CSUM_V3 | \
+					JBD2_FEATURE_INCOMPAT_FAST_COMMIT)
 
 #ifdef __KERNEL__
 
@@ -1235,6 +1237,7 @@ JBD2_FEATURE_INCOMPAT_FUNCS(64bit,		64BIT)
 JBD2_FEATURE_INCOMPAT_FUNCS(async_commit,	ASYNC_COMMIT)
 JBD2_FEATURE_INCOMPAT_FUNCS(csum2,		CSUM_V2)
 JBD2_FEATURE_INCOMPAT_FUNCS(csum3,		CSUM_V3)
+JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit,	FAST_COMMIT)
 
 /*
  * Journal flag definitions
-- 
2.22.0.657.g960e92d24f-goog


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 02/11] jbd2: add fast commit fields to journal_s structure
  2019-07-22  4:00 [PATCH 01/11] ext4: add handling for extended mount options Harshad Shirwadkar
@ 2019-07-22  4:00 ` Harshad Shirwadkar
  2019-07-22  4:00 ` [PATCH 03/11] jbd2: fast commit setup and enable Harshad Shirwadkar
                   ` (9 subsequent siblings)
  10 siblings, 0 replies; 25+ messages in thread
From: Harshad Shirwadkar @ 2019-07-22  4:00 UTC (permalink / raw)
  To: linux-ext4; +Cc: Harshad Shirwadkar

For fast commits, JBD2 as of now allocates a default of 128 blocks at
the end of the journalling area. Although JBD2 owns these blocks, it
doesn't control what exactly should be written in these blocks. It
just provides the right abstraction for making these blocks usable by
file systems. This patch adds necessary fields to manage these fast
commit blocks.

Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
---
 include/linux/jbd2.h | 78 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)

diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index b7eed49b8ecd..6133a0cd22da 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -66,6 +66,7 @@ void __jbd2_debug(int level, const char *file, const char *func,
 extern void *jbd2_alloc(size_t size, gfp_t flags);
 extern void jbd2_free(void *ptr, size_t size);
 
+#define JBD2_FAST_COMMIT_BLOCKS 128
 #define JBD2_MIN_JOURNAL_BLOCKS 1024
 
 #ifdef __KERNEL__
@@ -918,6 +919,34 @@ struct journal_s
 	 */
 	unsigned long		j_last;
 
+	/**
+	 * @j_first_fc:
+	 *
+	 * The block number of the first fast commit block in the journal
+	 */
+	unsigned long		j_first_fc;
+
+	/**
+	 * @j_current_fc:
+	 *
+	 * Journal fc block iterator
+	 */
+	unsigned long		j_fc_off;
+
+	/**
+	 * @j_last_fc:
+	 *
+	 * The block number of the last fast commit block in the journal
+	 */
+	unsigned long		j_last_fc;
+
+	/**
+	 * @j_do_full_commit:
+	 *
+	 * Force a full commit. If this flag is set JBD2 won't try fast commits
+	 */
+	bool			j_do_full_commit;
+
 	/**
 	 * @j_dev: Device where we store the journal.
 	 */
@@ -987,6 +1016,15 @@ struct journal_s
 	 */
 	tid_t			j_transaction_sequence;
 
+	/**
+	 * @j_subtid:
+	 *
+	 * One plus the sequence number of the most recently committed fast
+	 * commit. This represents the sub transaction ID for the next fast
+	 * commit.
+	 */
+	tid_t			j_subtid;
+
 	/**
 	 * @j_commit_sequence:
 	 *
@@ -1068,6 +1106,20 @@ struct journal_s
 	 */
 	int			j_wbufsize;
 
+	/**
+	 * @j_fc_wbuf:
+	 *
+	 * Array of bhs for fast commit transactions
+	 */
+	struct buffer_head	**j_fc_wbuf;
+
+	/**
+	 * @j_fc_wbufsize:
+	 *
+	 * Size of @j_fc_wbufsize array.
+	 */
+	int			j_fc_wbufsize;
+
 	/**
 	 * @j_last_sync_writer:
 	 *
@@ -1167,6 +1219,32 @@ struct journal_s
 	 */
 	struct lockdep_map	j_trans_commit_map;
 #endif
+	/**
+	 * @j_fc_commit_callback:
+	 *
+	 * File-system specific function that performs actual fast commit
+	 * operation. Should return 0 if the fast commit was successful, in that
+	 * case, JBD2 will just increment journal->j_subtid and move on. If it
+	 * returns < 0, JBD2 will fall-back to full commit.
+	 */
+	int (*j_fc_commit_callback)(struct journal_s *journal, tid_t tid,
+				    tid_t subtid);
+	/**
+	 * @j_fc_replay_callback:
+	 *
+	 * File-system specific function that performs replay of a fast
+	 * commit. JBD2 calls this function for each fast commit block found in
+	 * the journal.
+	 */
+	int (*j_fc_replay_callback)(struct journal_s *journal,
+				    struct buffer_head *bh);
+	/**
+	 * @j_fc_cleanup_callback:
+	 *
+	 * Clean-up after fast commit or full commit. JBD2 calls this function
+	 * after every commit operation.
+	 */
+	void (*j_fc_cleanup_callback)(struct journal_s *journal);
 };
 
 #define jbd2_might_wait_for_commit(j) \
-- 
2.22.0.657.g960e92d24f-goog


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 03/11] jbd2: fast commit setup and enable
  2019-07-22  4:00 [PATCH 01/11] ext4: add handling for extended mount options Harshad Shirwadkar
  2019-07-22  4:00 ` [PATCH 02/11] jbd2: add fast commit fields to journal_s structure Harshad Shirwadkar
@ 2019-07-22  4:00 ` Harshad Shirwadkar
  2019-07-22  4:00 ` [PATCH 04/11] jbd2: fast-commit commit path changes Harshad Shirwadkar
                   ` (8 subsequent siblings)
  10 siblings, 0 replies; 25+ messages in thread
From: Harshad Shirwadkar @ 2019-07-22  4:00 UTC (permalink / raw)
  To: linux-ext4; +Cc: Harshad Shirwadkar

This patch allows file systems to turn fast commits on and thereby
restrict the normal journalling space to total journal blocks minus
JBD2_FAST_COMMIT_BLOCKS. Fast commits are not actually performed, just
the interface to turn fast commits on is opened.

Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
---
 fs/ext4/super.c      |  3 ++-
 fs/jbd2/journal.c    | 39 ++++++++++++++++++++++++++++++++-------
 fs/ocfs2/journal.c   |  4 ++--
 include/linux/jbd2.h |  2 +-
 4 files changed, 37 insertions(+), 11 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e376ac040cce..81c3ec165822 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4933,7 +4933,8 @@ static int ext4_load_journal(struct super_block *sb,
 		if (save)
 			memcpy(save, ((char *) es) +
 			       EXT4_S_ERR_START, EXT4_S_ERR_LEN);
-		err = jbd2_journal_load(journal);
+		err = jbd2_journal_load(journal,
+					test_opt2(sb, JOURNAL_FAST_COMMIT));
 		if (save)
 			memcpy(((char *) es) + EXT4_S_ERR_START,
 			       save, EXT4_S_ERR_LEN);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 953990eb70a9..59ad709154a3 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1159,12 +1159,15 @@ static journal_t *journal_init_common(struct block_device *bdev,
 	journal->j_blk_offset = start;
 	journal->j_maxlen = len;
 	n = journal->j_blocksize / sizeof(journal_block_tag_t);
-	journal->j_wbufsize = n;
+	journal->j_wbufsize = n - JBD2_FAST_COMMIT_BLOCKS;
 	journal->j_wbuf = kmalloc_array(n, sizeof(struct buffer_head *),
 					GFP_KERNEL);
 	if (!journal->j_wbuf)
 		goto err_cleanup;
 
+	journal->j_fc_wbuf = &journal->j_wbuf[journal->j_wbufsize];
+	journal->j_fc_wbufsize = JBD2_FAST_COMMIT_BLOCKS;
+
 	bh = getblk_unmovable(journal->j_dev, start, journal->j_blocksize);
 	if (!bh) {
 		pr_err("%s: Cannot get buffer for journal superblock\n",
@@ -1297,11 +1300,19 @@ static int journal_reset(journal_t *journal)
 	}
 
 	journal->j_first = first;
-	journal->j_last = last;
 
-	journal->j_head = first;
-	journal->j_tail = first;
-	journal->j_free = last - first;
+	if (jbd2_has_feature_fast_commit(journal)) {
+		journal->j_last_fc = last;
+		journal->j_last = last - JBD2_FAST_COMMIT_BLOCKS;
+		journal->j_first_fc = journal->j_last + 1;
+		journal->j_fc_off = 0;
+	} else {
+		journal->j_last = last;
+	}
+
+	journal->j_head = journal->j_first;
+	journal->j_tail = journal->j_first;
+	journal->j_free = journal->j_last - journal->j_first;
 
 	journal->j_tail_sequence = journal->j_transaction_sequence;
 	journal->j_commit_sequence = journal->j_transaction_sequence - 1;
@@ -1626,9 +1637,17 @@ static int load_superblock(journal_t *journal)
 	journal->j_tail_sequence = be32_to_cpu(sb->s_sequence);
 	journal->j_tail = be32_to_cpu(sb->s_start);
 	journal->j_first = be32_to_cpu(sb->s_first);
-	journal->j_last = be32_to_cpu(sb->s_maxlen);
 	journal->j_errno = be32_to_cpu(sb->s_errno);
 
+	if (jbd2_has_feature_fast_commit(journal)) {
+		journal->j_last_fc = be32_to_cpu(sb->s_maxlen);
+		journal->j_last = journal->j_last_fc - JBD2_FAST_COMMIT_BLOCKS;
+		journal->j_first_fc = journal->j_last + 1;
+		journal->j_fc_off = 0;
+	} else {
+		journal->j_last = be32_to_cpu(sb->s_maxlen);
+	}
+
 	return 0;
 }
 
@@ -1641,7 +1660,7 @@ static int load_superblock(journal_t *journal)
  * a journal, read the journal from disk to initialise the in-memory
  * structures.
  */
-int jbd2_journal_load(journal_t *journal)
+int jbd2_journal_load(journal_t *journal, bool enable_fc)
 {
 	int err;
 	journal_superblock_t *sb;
@@ -1684,6 +1703,12 @@ int jbd2_journal_load(journal_t *journal)
 		return -EFSCORRUPTED;
 	}
 
+	if (enable_fc)
+		jbd2_journal_set_features(journal, 0, 0,
+					  JBD2_FEATURE_INCOMPAT_FAST_COMMIT);
+	else
+		jbd2_journal_clear_features(journal, 0, 0,
+					    JBD2_FEATURE_INCOMPAT_FAST_COMMIT);
 	/* OK, we've finished with the dynamic journal bits:
 	 * reinitialise the dynamic contents of the superblock in memory
 	 * and reset them on disk. */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 930e3d388579..3b4d91b16e8e 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1057,7 +1057,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
 
 	osb = journal->j_osb;
 
-	status = jbd2_journal_load(journal->j_journal);
+	status = jbd2_journal_load(journal->j_journal, false);
 	if (status < 0) {
 		mlog(ML_ERROR, "Failed to load journal!\n");
 		goto done;
@@ -1642,7 +1642,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 		goto done;
 	}
 
-	status = jbd2_journal_load(journal);
+	status = jbd2_journal_load(journal, false);
 	if (status < 0) {
 		mlog_errno(status);
 		if (!igrab(inode))
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 6133a0cd22da..389bc7cd2410 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1475,7 +1475,7 @@ extern int	   jbd2_journal_set_features
 		   (journal_t *, unsigned long, unsigned long, unsigned long);
 extern void	   jbd2_journal_clear_features
 		   (journal_t *, unsigned long, unsigned long, unsigned long);
-extern int	   jbd2_journal_load       (journal_t *journal);
+extern int	   jbd2_journal_load(journal_t *journal, bool enable_fc);
 extern int	   jbd2_journal_destroy    (journal_t *);
 extern int	   jbd2_journal_recover    (journal_t *journal);
 extern int	   jbd2_journal_wipe       (journal_t *, int);
-- 
2.22.0.657.g960e92d24f-goog


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 04/11] jbd2: fast-commit commit path changes
  2019-07-22  4:00 [PATCH 01/11] ext4: add handling for extended mount options Harshad Shirwadkar
  2019-07-22  4:00 ` [PATCH 02/11] jbd2: add fast commit fields to journal_s structure Harshad Shirwadkar
  2019-07-22  4:00 ` [PATCH 03/11] jbd2: fast commit setup and enable Harshad Shirwadkar
@ 2019-07-22  4:00 ` Harshad Shirwadkar
  2019-07-22  4:00 ` [PATCH 05/11] jbd2: fast-commit commit path new APIs Harshad Shirwadkar
                   ` (7 subsequent siblings)
  10 siblings, 0 replies; 25+ messages in thread
From: Harshad Shirwadkar @ 2019-07-22  4:00 UTC (permalink / raw)
  To: linux-ext4; +Cc: Harshad Shirwadkar

This patch adds core fast-commit commit path changes. This patch also
modifies existing JBD2 APIs to allow usage of fast commits. If fast
commits are enabled and journal->j_do_full_commit is not set, the
commit routine tries the file system specific fast commmit first. Only
if it fails, it falls back to the full commit. Commit start and wait
APIs now take an additional argument which indicates if fast commits
are allowed or not.

Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
---
 fs/ext4/super.c       |  2 +-
 fs/jbd2/checkpoint.c  |  2 +-
 fs/jbd2/commit.c      | 41 ++++++++++++++++++++++-
 fs/jbd2/journal.c     | 76 ++++++++++++++++++++++++++++++++++---------
 fs/jbd2/transaction.c |  6 ++--
 fs/ocfs2/alloc.c      |  2 +-
 fs/ocfs2/super.c      |  2 +-
 include/linux/jbd2.h  |  8 +++--
 8 files changed, 113 insertions(+), 26 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 81c3ec165822..6bab59ae81f7 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5148,7 +5148,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
 		    !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
 			needs_barrier = true;
 
-		if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
+		if (jbd2_journal_start_commit(sbi->s_journal, &target, true)) {
 			if (wait)
 				ret = jbd2_log_wait_commit(sbi->s_journal,
 							   target);
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index a1909066bde6..6297978ae3bc 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -277,7 +277,7 @@ int jbd2_log_do_checkpoint(journal_t *journal)
 
 			if (batch_count)
 				__flush_batch(journal, &batch_count);
-			jbd2_log_start_commit(journal, tid);
+			jbd2_log_start_commit(journal, tid, true);
 			/*
 			 * jbd2_journal_commit_transaction() may want
 			 * to take the checkpoint_mutex if JBD2_FLUSHED
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 132fb92098c7..241581c23aa3 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -351,8 +351,12 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
  *
  * The primary function for committing a transaction to the log.  This
  * function is called by the journal thread to begin a complete commit.
+ *
+ * fc is input / output parameter. If fc is non-null and is set to true, this
+ * function tries to perform fast commit. If the fast commit is successfully
+ * performed, *fc is set to true.
  */
-void jbd2_journal_commit_transaction(journal_t *journal)
+void jbd2_journal_commit_transaction(journal_t *journal, bool *fc)
 {
 	struct transaction_stats_s stats;
 	transaction_t *commit_transaction;
@@ -380,6 +384,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	tid_t first_tid;
 	int update_tail;
 	int csum_size = 0;
+	bool full_commit;
 	LIST_HEAD(io_bufs);
 	LIST_HEAD(log_bufs);
 
@@ -413,6 +418,40 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	J_ASSERT(journal->j_running_transaction != NULL);
 	J_ASSERT(journal->j_committing_transaction == NULL);
 
+	read_lock(&journal->j_state_lock);
+	full_commit = journal->j_do_full_commit;
+	read_unlock(&journal->j_state_lock);
+
+	/* Let file-system try its own fast commit */
+	if (jbd2_has_feature_fast_commit(journal)) {
+		if (!full_commit && fc && *fc == true &&
+		    journal->j_fc_commit_callback &&
+		    !journal->j_fc_commit_callback(
+			journal, journal->j_running_transaction->t_tid,
+			journal->j_subtid)) {
+			jbd_debug(3, "fast commit success.\n");
+			if (journal->j_fc_cleanup_callback)
+				journal->j_fc_cleanup_callback(journal);
+			write_lock(&journal->j_state_lock);
+			journal->j_subtid++;
+			if (fc)
+				*fc = true;
+			write_unlock(&journal->j_state_lock);
+			return;
+		}
+		if (journal->j_fc_cleanup_callback)
+			journal->j_fc_cleanup_callback(journal);
+		write_lock(&journal->j_state_lock);
+		journal->j_fc_off = 0;
+		journal->j_subtid = 0;
+		journal->j_do_full_commit = false;
+		write_unlock(&journal->j_state_lock);
+	}
+
+	jbd_debug(3, "fast commit not performed, trying full.\n");
+	if (fc)
+		*fc = false;
+
 	commit_transaction = journal->j_running_transaction;
 
 	trace_jbd2_start_commit(journal, commit_transaction);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 59ad709154a3..6fad7a15ea82 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -160,7 +160,13 @@ static void commit_timeout(struct timer_list *t)
  *
  * 1) COMMIT:  Every so often we need to commit the current state of the
  *    filesystem to disk.  The journal thread is responsible for writing
- *    all of the metadata buffers to disk.
+ *    all of the metadata buffers to disk. If fast commits are allowed,
+ *    journal thread passes the control to the file system and file system
+ *    is then responsible for writing metadata buffers to disk (in whichever
+ *    format it wants). If fast commit succeds, journal thread won't perform
+ *    a normal commit. In case the fast commit fails, journal thread performs
+ *    full commit as normal.
+ *
  *
  * 2) CHECKPOINT: We cannot reuse a used section of the log file until all
  *    of the data in that part of the log has been rewritten elsewhere on
@@ -172,6 +178,7 @@ static int kjournald2(void *arg)
 {
 	journal_t *journal = arg;
 	transaction_t *transaction;
+	bool fc_flag = true, fc_flag_save;
 
 	/*
 	 * Set up an interval timer which can be used to trigger a commit wakeup
@@ -209,9 +216,14 @@ static int kjournald2(void *arg)
 		jbd_debug(1, "OK, requests differ\n");
 		write_unlock(&journal->j_state_lock);
 		del_timer_sync(&journal->j_commit_timer);
-		jbd2_journal_commit_transaction(journal);
+		fc_flag_save = fc_flag;
+		jbd2_journal_commit_transaction(journal, &fc_flag);
 		write_lock(&journal->j_state_lock);
-		goto loop;
+		if (!fc_flag) {
+			/* fast commit not performed */
+			fc_flag = fc_flag_save;
+			goto loop;
+		}
 	}
 
 	wake_up(&journal->j_wait_done_commit);
@@ -235,16 +247,18 @@ static int kjournald2(void *arg)
 
 		prepare_to_wait(&journal->j_wait_commit, &wait,
 				TASK_INTERRUPTIBLE);
-		if (journal->j_commit_sequence != journal->j_commit_request)
+		if (!fc_flag &&
+		    journal->j_commit_sequence != journal->j_commit_request)
 			should_sleep = 0;
 		transaction = journal->j_running_transaction;
 		if (transaction && time_after_eq(jiffies,
-						transaction->t_expires))
+						 transaction->t_expires))
 			should_sleep = 0;
 		if (journal->j_flags & JBD2_UNMOUNT)
 			should_sleep = 0;
 		if (should_sleep) {
 			write_unlock(&journal->j_state_lock);
+			jbd_debug(1, "%s sleeps\n", __func__);
 			schedule();
 			write_lock(&journal->j_state_lock);
 		}
@@ -259,7 +273,10 @@ static int kjournald2(void *arg)
 	transaction = journal->j_running_transaction;
 	if (transaction && time_after_eq(jiffies, transaction->t_expires)) {
 		journal->j_commit_request = transaction->t_tid;
+		fc_flag = false;
 		jbd_debug(1, "woke because of timeout\n");
+	} else {
+		fc_flag = true;
 	}
 	goto loop;
 
@@ -517,11 +534,17 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
 	return 0;
 }
 
-int jbd2_log_start_commit(journal_t *journal, tid_t tid)
+int jbd2_log_start_commit(journal_t *journal, tid_t tid, bool full_commit)
 {
 	int ret;
 
 	write_lock(&journal->j_state_lock);
+	/*
+	 * If someone has already requested a full commit,
+	 * we have to honor it.
+	 */
+	if (!journal->j_do_full_commit)
+		journal->j_do_full_commit = full_commit;
 	ret = __jbd2_log_start_commit(journal, tid);
 	write_unlock(&journal->j_state_lock);
 	return ret;
@@ -556,7 +579,7 @@ static int __jbd2_journal_force_commit(journal_t *journal)
 	tid = transaction->t_tid;
 	read_unlock(&journal->j_state_lock);
 	if (need_to_start)
-		jbd2_log_start_commit(journal, tid);
+		jbd2_log_start_commit(journal, tid, true);
 	ret = jbd2_log_wait_commit(journal, tid);
 	if (!ret)
 		ret = 1;
@@ -603,11 +626,14 @@ int jbd2_journal_force_commit(journal_t *journal)
  * if a transaction is going to be committed (or is currently already
  * committing), and fills its tid in at *ptid
  */
-int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
+int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid, bool full_commit)
 {
 	int ret = 0;
 
 	write_lock(&journal->j_state_lock);
+	if (!journal->j_do_full_commit)
+		journal->j_do_full_commit = full_commit;
+
 	if (journal->j_running_transaction) {
 		tid_t tid = journal->j_running_transaction->t_tid;
 
@@ -675,7 +701,7 @@ EXPORT_SYMBOL(jbd2_trans_will_send_data_barrier);
  * Wait for a specified commit to complete.
  * The caller may not hold the journal lock.
  */
-int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
+int __jbd2_log_wait_commit(journal_t *journal, tid_t tid, tid_t subtid)
 {
 	int err = 0;
 
@@ -702,12 +728,25 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
 	}
 #endif
 	while (tid_gt(tid, journal->j_commit_sequence)) {
-		jbd_debug(1, "JBD2: want %u, j_commit_sequence=%u\n",
-				  tid, journal->j_commit_sequence);
+		if ((!journal->j_do_full_commit) &&
+		    !tid_geq(subtid, journal->j_subtid))
+			break;
+		jbd_debug(1, "JBD2: want full commit %u %s %u, ",
+			  tid, journal->j_do_full_commit ?
+			  "and ignoring fast commit request for " :
+			  "or want fast commit",
+			  journal->j_subtid);
+		jbd_debug(1, "j_commit_sequence=%u, j_subtid=%u\n",
+			  journal->j_commit_sequence, journal->j_subtid);
 		read_unlock(&journal->j_state_lock);
 		wake_up(&journal->j_wait_commit);
-		wait_event(journal->j_wait_done_commit,
-				!tid_gt(tid, journal->j_commit_sequence));
+		if (journal->j_do_full_commit)
+			wait_event(journal->j_wait_done_commit,
+				   !tid_gt(tid, journal->j_commit_sequence));
+		else
+			wait_event(journal->j_wait_done_commit,
+				   !tid_gt(tid, journal->j_commit_sequence) ||
+				   !tid_geq(subtid, journal->j_subtid));
 		read_lock(&journal->j_state_lock);
 	}
 	read_unlock(&journal->j_state_lock);
@@ -717,6 +756,13 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
 	return err;
 }
 
+int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
+{
+	journal->j_do_full_commit = true;
+	return __jbd2_log_wait_commit(journal, tid, 0);
+}
+
+
 /* Return 1 when transaction with given tid has already committed. */
 int jbd2_transaction_committed(journal_t *journal, tid_t tid)
 {
@@ -751,7 +797,7 @@ int jbd2_complete_transaction(journal_t *journal, tid_t tid)
 		if (journal->j_commit_request != tid) {
 			/* transaction not yet started, so request it */
 			read_unlock(&journal->j_state_lock);
-			jbd2_log_start_commit(journal, tid);
+			jbd2_log_start_commit(journal, tid, true);
 			goto wait_commit;
 		}
 	} else if (!(journal->j_committing_transaction &&
@@ -1741,7 +1787,7 @@ int jbd2_journal_destroy(journal_t *journal)
 
 	/* Force a final log commit */
 	if (journal->j_running_transaction)
-		jbd2_journal_commit_transaction(journal);
+		jbd2_journal_commit_transaction(journal, NULL);
 
 	/* Force any old transactions to disk */
 
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 990e7b5062e7..87f6627d78aa 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -154,7 +154,7 @@ static void wait_transaction_locked(journal_t *journal)
 	need_to_start = !tid_geq(journal->j_commit_request, tid);
 	read_unlock(&journal->j_state_lock);
 	if (need_to_start)
-		jbd2_log_start_commit(journal, tid);
+		jbd2_log_start_commit(journal, tid, true);
 	jbd2_might_wait_for_commit(journal);
 	schedule();
 	finish_wait(&journal->j_wait_transaction_locked, &wait);
@@ -708,7 +708,7 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
 	need_to_start = !tid_geq(journal->j_commit_request, tid);
 	read_unlock(&journal->j_state_lock);
 	if (need_to_start)
-		jbd2_log_start_commit(journal, tid);
+		jbd2_log_start_commit(journal, tid, true);
 
 	rwsem_release(&journal->j_trans_commit_map, 1, _THIS_IP_);
 	handle->h_buffer_credits = nblocks;
@@ -1822,7 +1822,7 @@ int jbd2_journal_stop(handle_t *handle)
 		jbd_debug(2, "transaction too old, requesting commit for "
 					"handle %p\n", handle);
 		/* This is non-blocking */
-		jbd2_log_start_commit(journal, transaction->t_tid);
+		jbd2_log_start_commit(journal, transaction->t_tid, true);
 
 		/*
 		 * Special case: JBD2_SYNC synchronous updates require us
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 0c335b51043d..df41c43573b7 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6117,7 +6117,7 @@ int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
 		goto out;
 	}
 
-	if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) {
+	if (jbd2_journal_start_commit(osb->journal->j_journal, &target, true)) {
 		jbd2_log_wait_commit(osb->journal->j_journal, target);
 		ret = 1;
 	}
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 8b2f39506648..60ecc51759ae 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -410,7 +410,7 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait)
 	}
 
 	if (jbd2_journal_start_commit(osb->journal->j_journal,
-				      &target)) {
+				      &target, true)) {
 		if (wait)
 			jbd2_log_wait_commit(osb->journal->j_journal,
 					     target);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 389bc7cd2410..7f832283e4b9 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1363,7 +1363,8 @@ int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block);
 void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block);
 
 /* Commit management */
-extern void jbd2_journal_commit_transaction(journal_t *);
+extern void jbd2_journal_commit_transaction(journal_t *journal,
+					    bool *full_commit);
 
 /* Checkpoint list management */
 void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy);
@@ -1570,9 +1571,10 @@ extern void	jbd2_clear_buffer_revoked_flags(journal_t *journal);
  * transitions on demand.
  */
 
-int jbd2_log_start_commit(journal_t *journal, tid_t tid);
+int jbd2_log_start_commit(journal_t *journal, tid_t tid, bool full_commit);
 int __jbd2_log_start_commit(journal_t *journal, tid_t tid);
-int jbd2_journal_start_commit(journal_t *journal, tid_t *tid);
+int jbd2_journal_start_commit(journal_t *journal, tid_t *tid,
+			      bool full_commit);
 int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
 int jbd2_transaction_committed(journal_t *journal, tid_t tid);
 int jbd2_complete_transaction(journal_t *journal, tid_t tid);
-- 
2.22.0.657.g960e92d24f-goog


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 05/11] jbd2: fast-commit commit path new APIs
  2019-07-22  4:00 [PATCH 01/11] ext4: add handling for extended mount options Harshad Shirwadkar
                   ` (2 preceding siblings ...)
  2019-07-22  4:00 ` [PATCH 04/11] jbd2: fast-commit commit path changes Harshad Shirwadkar
@ 2019-07-22  4:00 ` Harshad Shirwadkar
  2019-07-22 17:45   ` kbuild test robot
  2019-07-22 21:02   ` kbuild test robot
  2019-07-22  4:00 ` [PATCH 06/11] jbd2: fast-commit recovery path changes Harshad Shirwadkar
                   ` (6 subsequent siblings)
  10 siblings, 2 replies; 25+ messages in thread
From: Harshad Shirwadkar @ 2019-07-22  4:00 UTC (permalink / raw)
  To: linux-ext4; +Cc: Harshad Shirwadkar

This patch adds new helper APIs that ext4 needs for fast
commits. These new fast commit APIs are used by subsequent fast commit
patches to implement fast commits. Following new APIs are added:

/*
 * Returns when either a full commit or a fast commit
 * completes
 */
int jbd2_fc_complete_commit(journal_tc *journal, tid_t tid,
			    tid_t tid, tid_t subtid)

/* Send all the data buffers related to an inode */
int journal_submit_inode_data(journal_t *journal,
			      struct jbd2_inode *jinode)

/* Map one fast commit buffer for use by the file system */
int jbd2_map_fc_buf(journal_t *journal, struct buffer_head **bh_out)

/* Submit all mapped fast commit buffers */
jbd2_submit_fc_bufs(journal_t *journal, bh_end_io_t *end_io)

Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
---
 fs/jbd2/commit.c     |  31 +++++++++++++
 fs/jbd2/journal.c    | 107 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/jbd2.h |   6 +++
 3 files changed, 144 insertions(+)

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 241581c23aa3..1713aebcf38d 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -202,6 +202,37 @@ static int journal_submit_inode_data_buffers(struct address_space *mapping,
 	return ret;
 }
 
+int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode)
+{
+	struct address_space *mapping;
+	loff_t dirty_start = jinode->i_dirty_start;
+	loff_t dirty_end = jinode->i_dirty_end;
+	int ret;
+
+	if (!jinode)
+		return 0;
+
+	if (!(jinode->i_flags & JI_WRITE_DATA))
+		return 0;
+
+	dirty_start = jinode->i_dirty_start;
+	dirty_end = jinode->i_dirty_end;
+
+	mapping = jinode->i_vfs_inode->i_mapping;
+	jinode->i_flags |= JI_COMMIT_RUNNING;
+
+	trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
+	ret = journal_submit_inode_data_buffers(mapping, dirty_start,
+						dirty_end);
+
+	jinode->i_flags &= ~JI_COMMIT_RUNNING;
+	/* Protect JI_COMMIT_RUNNING flag */
+	smp_mb();
+	wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
+
+	return ret;
+}
+
 /*
  * Submit all the data buffers of inode associated with the transaction to
  * disk.
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 6fad7a15ea82..cbe6e72a25e6 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -811,6 +811,33 @@ int jbd2_complete_transaction(journal_t *journal, tid_t tid)
 }
 EXPORT_SYMBOL(jbd2_complete_transaction);
 
+int jbd2_fc_complete_commit(journal_t *journal, tid_t tid, tid_t subtid)
+{
+	int	need_to_wait = 1;
+
+	read_lock(&journal->j_state_lock);
+	if (journal->j_running_transaction &&
+	    journal->j_running_transaction->t_tid == tid) {
+		/* Check if fast commit was already done */
+		if (journal->j_subtid > subtid)
+			need_to_wait = 0;
+		if (journal->j_commit_request != tid) {
+			/* transaction not yet started, so request it */
+			read_unlock(&journal->j_state_lock);
+			jbd2_log_start_commit(journal, tid, false);
+			goto wait_commit;
+		}
+	} else if (!(journal->j_committing_transaction &&
+		     journal->j_committing_transaction->t_tid == tid))
+		need_to_wait = 0;
+	read_unlock(&journal->j_state_lock);
+	if (!need_to_wait)
+		return 0;
+wait_commit:
+	return __jbd2_log_wait_commit(journal, tid, subtid);
+}
+EXPORT_SYMBOL(jbd2_complete_transaction);
+
 /*
  * Log buffer allocation routines:
  */
@@ -831,6 +858,86 @@ int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp)
 	return jbd2_journal_bmap(journal, blocknr, retp);
 }
 
+int jbd2_map_fc_buf(journal_t *journal, struct buffer_head **bh_out)
+{
+	unsigned long long pblock;
+	unsigned long blocknr;
+	int ret = 0;
+	struct buffer_head *bh;
+	int fc_off;
+	journal_header_t *jhdr;
+
+	write_lock(&journal->j_state_lock);
+
+	if (journal->j_fc_off + journal->j_first_fc < journal->j_last_fc) {
+		fc_off = journal->j_fc_off;
+		blocknr = journal->j_first_fc + fc_off;
+		journal->j_fc_off++;
+	} else {
+		ret = -EINVAL;
+	}
+	write_unlock(&journal->j_state_lock);
+
+	if (ret)
+		return ret;
+
+	ret = jbd2_journal_bmap(journal, blocknr, &pblock);
+	if (ret)
+		return ret;
+
+	bh = __getblk(journal->j_dev, pblock, journal->j_blocksize);
+	if (!bh)
+		return -ENOMEM;
+
+	lock_buffer(bh);
+	jhdr = (journal_header_t *)bh->b_data;
+	jhdr->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
+	jhdr->h_blocktype = cpu_to_be32(JBD2_FC_BLOCK);
+	jhdr->h_sequence = cpu_to_be32(journal->j_running_transaction->t_tid);
+
+	set_buffer_uptodate(bh);
+	unlock_buffer(bh);
+	journal->j_fc_wbuf[fc_off] = bh;
+
+	*bh_out = bh;
+
+	return 0;
+}
+
+int jbd2_submit_fc_bufs(journal_t *journal, bh_end_io_t *end_io)
+{
+	struct buffer_head *bh;
+	int ret, i, j_fc_off;
+
+	read_lock(&journal->j_state_lock);
+	j_fc_off = journal->j_fc_off;
+	read_unlock(&journal->j_state_lock);
+
+	for (i = 0; i < j_fc_off; i++) {
+		bh = journal->j_fc_wbuf[i];
+
+		if (!bh)
+			return -ENOMEM;
+
+		lock_buffer(bh);
+		clear_buffer_dirty(bh);
+		set_buffer_uptodate(bh);
+		bh->b_end_io = end_io;
+		if (journal->j_flags & JBD2_BARRIER &&
+		    !jbd2_has_feature_async_commit(journal))
+			ret = submit_bh(REQ_OP_WRITE,
+					REQ_SYNC | REQ_PREFLUSH | REQ_FUA,
+					bh);
+		else
+			ret = submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
+		wait_on_buffer(bh);
+		if (ret || !buffer_uptodate(bh))
+			return -EIO;
+	}
+
+	return 0;
+}
+
 /*
  * Conversion of logical to physical block numbers for the journal
  *
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 7f832283e4b9..1bf979b7cdc0 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -124,6 +124,7 @@ typedef struct journal_s	journal_t;	/* Journal control structure */
 #define JBD2_SUPERBLOCK_V1	3
 #define JBD2_SUPERBLOCK_V2	4
 #define JBD2_REVOKE_BLOCK	5
+#define JBD2_FC_BLOCK		6
 
 /*
  * Standard header for all descriptor blocks:
@@ -1580,6 +1581,7 @@ int jbd2_transaction_committed(journal_t *journal, tid_t tid);
 int jbd2_complete_transaction(journal_t *journal, tid_t tid);
 int jbd2_log_do_checkpoint(journal_t *journal);
 int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid);
+int jbd2_fc_complete_commit(journal_t *journal, tid_t tid, tid_t subtid);
 
 void __jbd2_log_wait_for_space(journal_t *journal);
 extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *);
@@ -1730,6 +1732,10 @@ static inline tid_t  jbd2_get_latest_transaction(journal_t *journal)
 	return tid;
 }
 
+int jbd2_map_fc_buf(journal_t *journal, struct buffer_head **bh_out);
+int jbd2_submit_fc_bufs(journal_t *journal, bh_end_io_t *end_io);
+int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode);
+
 #ifdef __KERNEL__
 
 #define buffer_trace_init(bh)	do {} while (0)
-- 
2.22.0.657.g960e92d24f-goog


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 06/11] jbd2: fast-commit recovery path changes
  2019-07-22  4:00 [PATCH 01/11] ext4: add handling for extended mount options Harshad Shirwadkar
                   ` (3 preceding siblings ...)
  2019-07-22  4:00 ` [PATCH 05/11] jbd2: fast-commit commit path new APIs Harshad Shirwadkar
@ 2019-07-22  4:00 ` Harshad Shirwadkar
  2019-07-22  4:00 ` [PATCH 07/11] ext4: add fields that are needed to track changed files Harshad Shirwadkar
                   ` (5 subsequent siblings)
  10 siblings, 0 replies; 25+ messages in thread
From: Harshad Shirwadkar @ 2019-07-22  4:00 UTC (permalink / raw)
  To: linux-ext4; +Cc: Harshad Shirwadkar

This patch adds fast-commit recovery path changes for JBD2. If we find
a fast commit block that is valid in our recovery phase call file
system specific routine to handle that block.

We also clear the fast commit flag in jbd2_mark_journal_empty() which
is called after successful recovery as well successful
checkpointing. This allows JBD2 journal to be compatible with older
versions when there are not fast commit blocks.

Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
---
 fs/jbd2/journal.c  | 12 ++++++++++
 fs/jbd2/recovery.c | 60 +++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 69 insertions(+), 3 deletions(-)

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index cbe6e72a25e6..48214209714e 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1608,6 +1608,7 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
 static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
 {
 	journal_superblock_t *sb = journal->j_superblock;
+	bool had_fast_commit = false;
 
 	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
 	lock_buffer(journal->j_sb_buffer);
@@ -1621,6 +1622,14 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
 
 	sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
 	sb->s_start    = cpu_to_be32(0);
+	if (jbd2_has_feature_fast_commit(journal)) {
+		/*
+		 * When journal is clean, no need to commit fast commit flag and
+		 * make file system incompatible with older kernels.
+		 */
+		jbd2_clear_feature_fast_commit(journal);
+		had_fast_commit = true;
+	}
 
 	jbd2_write_superblock(journal, write_op);
 
@@ -1628,6 +1637,9 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
 	write_lock(&journal->j_state_lock);
 	journal->j_flags |= JBD2_FLUSHED;
 	write_unlock(&journal->j_state_lock);
+
+	if (had_fast_commit)
+		jbd2_set_feature_fast_commit(journal);
 }
 
 
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index a4967b27ffb6..d900f7b1acff 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -225,8 +225,12 @@ static int count_tags(journal_t *journal, struct buffer_head *bh)
 /* Make sure we wrap around the log correctly! */
 #define wrap(journal, var)						\
 do {									\
-	if (var >= (journal)->j_last)					\
-		var -= ((journal)->j_last - (journal)->j_first);	\
+	unsigned long _wrap_last =					\
+		jbd2_has_feature_fast_commit(journal) ?			\
+			(journal)->j_last_fc : (journal)->j_last;	\
+									\
+	if (var >= _wrap_last)						\
+		var -= (_wrap_last - (journal)->j_first);		\
 } while (0)
 
 /**
@@ -413,6 +417,50 @@ static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
 		return tag->t_checksum == cpu_to_be16(csum32);
 }
 
+static int fc_do_one_pass(journal_t *journal,
+			  struct recovery_info *info, enum passtype pass)
+{
+	unsigned int expected_commit_id = info->end_transaction;
+	unsigned long next_fc_block;
+	struct buffer_head *bh;
+	unsigned int seq;
+	journal_header_t *jhdr;
+	int err = 0;
+
+	next_fc_block = journal->j_first_fc;
+
+	while (next_fc_block != journal->j_last_fc) {
+		jbd_debug(3, "Fast commit replay: next block %lld",
+			  next_fc_block);
+		err = jread(&bh, journal, next_fc_block);
+		if (err)
+			break;
+
+		jhdr = (journal_header_t *)bh->b_data;
+		seq = be32_to_cpu(jhdr->h_sequence);
+		if (be32_to_cpu(jhdr->h_magic) != JBD2_MAGIC_NUMBER ||
+		    seq != expected_commit_id) {
+			break;
+		} else {
+			jbd_debug(3, "Processing fast commit blk with seq %d",
+				  seq);
+			if (pass == PASS_REPLAY &&
+			    journal->j_fc_replay_callback) {
+				err = journal->j_fc_replay_callback(journal,
+								    bh);
+				if (err)
+					break;
+			}
+		}
+		next_fc_block++;
+	}
+
+	if (err)
+		jbd_debug(3, "Fast commit replay failed, err = %d\n", err);
+
+	return err;
+}
+
 static int do_one_pass(journal_t *journal,
 			struct recovery_info *info, enum passtype pass)
 {
@@ -470,7 +518,7 @@ static int do_one_pass(journal_t *journal,
 				break;
 
 		jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
-			  next_commit_ID, next_log_block, journal->j_last);
+			  next_commit_ID, next_log_block, journal->j_last_fc);
 
 		/* Skip over each chunk of the transaction looking
 		 * either the next descriptor block or the final commit
@@ -768,6 +816,8 @@ static int do_one_pass(journal_t *journal,
 			if (err)
 				goto failed;
 			continue;
+		case JBD2_FC_BLOCK:
+			continue;
 
 		default:
 			jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
@@ -799,6 +849,10 @@ static int do_one_pass(journal_t *journal,
 				success = -EIO;
 		}
 	}
+
+	if (jbd2_has_feature_fast_commit(journal) && pass == PASS_REPLAY)
+		fc_do_one_pass(journal, info, pass);
+
 	if (block_error && success == 0)
 		success = -EIO;
 	return success;
-- 
2.22.0.657.g960e92d24f-goog


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 07/11] ext4: add fields that are needed to track changed files
  2019-07-22  4:00 [PATCH 01/11] ext4: add handling for extended mount options Harshad Shirwadkar
                   ` (4 preceding siblings ...)
  2019-07-22  4:00 ` [PATCH 06/11] jbd2: fast-commit recovery path changes Harshad Shirwadkar
@ 2019-07-22  4:00 ` Harshad Shirwadkar
  2019-07-22  4:00 ` [PATCH 08/11] ext4: track changed files for fast commit Harshad Shirwadkar
                   ` (4 subsequent siblings)
  10 siblings, 0 replies; 25+ messages in thread
From: Harshad Shirwadkar @ 2019-07-22  4:00 UTC (permalink / raw)
  To: linux-ext4; +Cc: Harshad Shirwadkar

Ext4's fast commit feature tracks changed files and maintains them in
a queue. We also remember for each file the logical block range that
needs to be committed. This patch adds these fields to ext4_inode_info
and ext4_sb_info and also adds initialization calls.

Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
---
 fs/ext4/ext4.h      | 34 ++++++++++++++++++++++++++++++++++
 fs/ext4/ext4_jbd2.c | 13 +++++++++++++
 fs/ext4/ext4_jbd2.h |  2 ++
 fs/ext4/inode.c     |  1 +
 fs/ext4/super.c     |  7 +++++++
 5 files changed, 57 insertions(+)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index becbda38b7db..92dc4432c7ed 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -921,6 +921,27 @@ enum {
 	I_DATA_SEM_QUOTA,
 };
 
+/*
+ * Ext4 fast commit inode specific information
+ */
+struct ext4_fast_commit_inode_info {
+	/* TID / SUB-TID when old_i_size and i_size were recorded */
+	tid_t fc_tid;
+	tid_t fc_subtid;
+
+	/*
+	 * Start of logical block range that needs to be committed in this fast
+	 * commit
+	 */
+	loff_t fc_lblk_start;
+
+	/*
+	 * End of logical block range that needs to be committed in this fast
+	 * commit
+	 */
+	loff_t fc_lblk_end;
+};
+
 
 /*
  * fourth extended file system inode data in memory
@@ -955,6 +976,9 @@ struct ext4_inode_info {
 
 	struct list_head i_orphan;	/* unlinked but open inodes */
 
+	struct list_head i_fc_list;	/* inodes that need fast commit */
+	struct ext4_fast_commit_inode_info i_fc;
+
 	/*
 	 * i_disksize keeps track of what the inode size is ON DISK, not
 	 * in memory.  During truncate, i_size is set to the new size by
@@ -1529,6 +1553,16 @@ struct ext4_sb_info {
 	/* Barrier between changing inodes' journal flags and writepages ops. */
 	struct percpu_rw_semaphore s_journal_flag_rwsem;
 	struct dax_device *s_daxdev;
+
+	/* Ext4 fast commit stuff */
+	bool fc_replay;			/* Fast commit replay in progress */
+	struct list_head s_fc_q;	/* Inodes that need fast commit. */
+	__u32 s_fc_q_cnt;		/* Number of inodes in the fc queue */
+	bool s_fc_eligible;		/*
+					 * Are changes after the last commit
+					 * eligible for fast commit?
+					 */
+	struct mutex s_fc_lock;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 7c70b08d104c..75b6db808837 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -330,3 +330,16 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line,
 		mark_buffer_dirty(bh);
 	return err;
 }
+
+void ext4_init_inode_fc_info(struct inode *inode)
+{
+	handle_t *handle = ext4_journal_current_handle();
+	struct ext4_inode_info *ei = EXT4_I(inode);
+
+	memset(&ei->i_fc, 0, sizeof(ei->i_fc));
+	if (ext4_handle_valid(handle)) {
+		ei->i_fc.fc_tid = handle->h_transaction->t_tid;
+		ei->i_fc.fc_subtid = handle->h_transaction->t_journal->j_subtid;
+	}
+	INIT_LIST_HEAD(&ei->i_fc_list);
+}
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index ef8fcf7d0d3b..2305c1acd415 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -459,4 +459,6 @@ static inline int ext4_should_dioread_nolock(struct inode *inode)
 	return 1;
 }
 
+void ext4_init_inode_fc_info(struct inode *inode);
+
 #endif	/* _EXT4_JBD2_H */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 420fe3deed39..f230a888eddd 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4996,6 +4996,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
 	for (block = 0; block < EXT4_N_BLOCKS; block++)
 		ei->i_data[block] = raw_inode->i_block[block];
 	INIT_LIST_HEAD(&ei->i_orphan);
+	ext4_init_inode_fc_info(&ei->vfs_inode);
 
 	/*
 	 * Set transaction id's of transactions that have to be committed
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 6bab59ae81f7..7b4b35e940a1 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1100,6 +1100,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	ei->i_datasync_tid = 0;
 	atomic_set(&ei->i_unwritten, 0);
 	INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
+	ext4_init_inode_fc_info(&ei->vfs_inode);
 	return &ei->vfs_inode;
 }
 
@@ -1139,6 +1140,7 @@ static void init_once(void *foo)
 	init_rwsem(&ei->i_data_sem);
 	init_rwsem(&ei->i_mmap_sem);
 	inode_init_once(&ei->vfs_inode);
+	ext4_init_inode_fc_info(&ei->vfs_inode);
 }
 
 static int __init init_inodecache(void)
@@ -4301,6 +4303,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
 	mutex_init(&sbi->s_orphan_lock);
 
+	INIT_LIST_HEAD(&sbi->s_fc_q);
+	sbi->s_fc_q_cnt = 0;
+	sbi->s_fc_eligible = true;
+	mutex_init(&sbi->s_fc_lock);
+
 	sb->s_root = NULL;
 
 	needs_recovery = (es->s_last_orphan != 0 ||
-- 
2.22.0.657.g960e92d24f-goog


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 08/11] ext4: track changed files for fast commit
  2019-07-22  4:00 [PATCH 01/11] ext4: add handling for extended mount options Harshad Shirwadkar
                   ` (5 preceding siblings ...)
  2019-07-22  4:00 ` [PATCH 07/11] ext4: add fields that are needed to track changed files Harshad Shirwadkar
@ 2019-07-22  4:00 ` Harshad Shirwadkar
  2019-07-22  4:00 ` [PATCH 09/11] ext4: fast-commit commit range tracking Harshad Shirwadkar
                   ` (3 subsequent siblings)
  10 siblings, 0 replies; 25+ messages in thread
From: Harshad Shirwadkar @ 2019-07-22  4:00 UTC (permalink / raw)
  To: linux-ext4; +Cc: Harshad Shirwadkar

For fast commit, we need to remember all the files that have changed
since last fast commit / full commit. For changes that are fast commit
incompatible, we mark the file system fast commit incompatible. This
patch adds code to either remember files that have changed or to mark
ext4 as fast commit ineligible. We inspect every ext4_mark_inode_dirty
calls and decide whether that particular file change is fast
compatible or not.

Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
---
 fs/ext4/acl.c       |  1 +
 fs/ext4/ext4_jbd2.c | 46 +++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/ext4_jbd2.h | 25 ++++++++++++++++++++++++
 fs/ext4/extents.c   | 17 +++++++++++++++--
 fs/ext4/ialloc.c    |  1 +
 fs/ext4/inline.c    | 12 ++++++++++++
 fs/ext4/inode.c     | 30 +++++++++++++++++++++++++++--
 fs/ext4/ioctl.c     |  3 +++
 fs/ext4/migrate.c   |  1 +
 fs/ext4/namei.c     | 14 +++++++++++++-
 fs/ext4/super.c     | 15 +++++++++++++++
 fs/ext4/xattr.c     |  1 +
 12 files changed, 161 insertions(+), 5 deletions(-)

diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 8c7bbf3e566d..e84be9c315db 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -257,6 +257,7 @@ ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 		inode->i_mode = mode;
 		inode->i_ctime = current_time(inode);
 		ext4_mark_inode_dirty(handle, inode);
+		ext4_fc_enqueue_inode(handle, inode);
 	}
 out_stop:
 	ext4_journal_stop(handle);
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 75b6db808837..b32b42076318 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -343,3 +343,49 @@ void ext4_init_inode_fc_info(struct inode *inode)
 	}
 	INIT_LIST_HEAD(&ei->i_fc_list);
 }
+
+void ext4_fc_enqueue_inode(handle_t *handle, struct inode *inode)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct ext4_inode_info *ei = EXT4_I(inode);
+
+	if (!ext4_should_fast_commit(inode->i_sb))
+		return;
+
+	mutex_lock(&sbi->s_fc_lock);
+	if (!sbi->s_fc_eligible) {
+		mutex_unlock(&sbi->s_fc_lock);
+		return;
+	}
+	if (list_empty(&EXT4_I(inode)->i_fc_list)) {
+		list_add(&EXT4_I(inode)->i_fc_list, &sbi->s_fc_q);
+		sbi->s_fc_q_cnt++;
+	}
+	mutex_unlock(&sbi->s_fc_lock);
+
+	if (!ext4_handle_valid(handle))
+		return;
+
+	if (ei->i_fc.fc_tid == handle->h_transaction->t_tid &&
+	    ei->i_fc.fc_subtid ==
+	    handle->h_transaction->t_journal->j_subtid)
+		return;
+
+	ei->i_fc.fc_lblk_start = i_size_read(inode);
+	ei->i_fc.fc_lblk_end = i_size_read(inode);
+	ei->i_fc.fc_subtid = handle->h_transaction->t_journal->j_subtid;
+	ei->i_fc.fc_tid = handle->h_transaction->t_tid;
+}
+
+void ext4_fc_del(struct inode *inode)
+{
+	if (!ext4_should_fast_commit(inode->i_sb))
+		return;
+
+	if (list_empty(&EXT4_I(inode)->i_fc_list))
+		return;
+
+	mutex_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
+	list_del_init(&EXT4_I(inode)->i_fc_list);
+	mutex_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
+}
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 2305c1acd415..130297255624 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -459,6 +459,31 @@ static inline int ext4_should_dioread_nolock(struct inode *inode)
 	return 1;
 }
 
+static inline int ext4_should_fast_commit(struct super_block *sb)
+{
+	if (!ext4_has_feature_fast_commit(sb))
+		return 0;
+	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
+		return 0;
+	if (test_opt(sb, QUOTA))
+		return 0;
+	return 1;
+}
+
 void ext4_init_inode_fc_info(struct inode *inode);
+extern void ext4_fc_enqueue_inode(handle_t *handle,
+					       struct inode *inode);
+extern void ext4_fc_del(struct inode *inode);
+
+static inline void
+ext4_fc_mark_ineligible(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	mutex_lock(&sbi->s_fc_lock);
+	sbi->s_fc_eligible = false;
+	mutex_unlock(&sbi->s_fc_lock);
+}
+
 
 #endif	/* _EXT4_JBD2_H */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 92266a2da7d6..eb77e306a82b 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -163,6 +163,7 @@ int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
 	} else {
 		/* path points to leaf/index in inode body */
 		err = ext4_mark_inode_dirty(handle, inode);
+		ext4_fc_enqueue_inode(handle, inode);
 	}
 	return err;
 }
@@ -1371,6 +1372,7 @@ static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
 	struct ext4_ext_path *curp;
 	int depth, i, err = 0;
 
+	ext4_fc_mark_ineligible(inode->i_sb);
 repeat:
 	i = depth = ext_depth(inode);
 
@@ -3714,6 +3716,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		err = ext4_zeroout_es(inode, &zero_ex1);
 		if (!err)
 			err = ext4_zeroout_es(inode, &zero_ex2);
+	} else {
+		ext4_fc_mark_ineligible(inode->i_sb);
 	}
 	return err ? err : allocated;
 }
@@ -3856,7 +3860,7 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
 			      struct ext4_ext_path *path,
 			      unsigned int len)
 {
-	int i, depth;
+	int i, ret, depth;
 	struct ext4_extent_header *eh;
 	struct ext4_extent *last_ex;
 
@@ -3898,7 +3902,10 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
 			return 0;
 out:
 	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
-	return ext4_mark_inode_dirty(handle, inode);
+	ret = ext4_mark_inode_dirty(handle, inode);
+	ext4_fc_enqueue_inode(handle, inode);
+
+	return ret;
 }
 
 static int
@@ -4607,6 +4614,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
 				   inode->i_ino, map.m_lblk,
 				   map.m_len, ret);
 			ext4_mark_inode_dirty(handle, inode);
+			ext4_fc_enqueue_inode(handle, inode);
 			ret2 = ext4_journal_stop(handle);
 			break;
 		}
@@ -4624,6 +4632,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
 				ext4_set_inode_flag(inode,
 						    EXT4_INODE_EOFBLOCKS);
 		}
+		ext4_fc_enqueue_inode(handle, inode);
 		ext4_mark_inode_dirty(handle, inode);
 		ext4_update_inode_fsync_trans(handle, inode, 1);
 		ret2 = ext4_journal_stop(handle);
@@ -4786,6 +4795,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
 			ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
 	}
 	ext4_mark_inode_dirty(handle, inode);
+	ext4_fc_enqueue_inode(handle, inode);
 
 	/* Zero out partial block at the edges of the range */
 	ret = ext4_zero_partial_blocks(handle, inode, offset, len);
@@ -4957,6 +4967,7 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
 				     "ext4_ext_map_blocks returned %d",
 				     inode->i_ino, map.m_lblk,
 				     map.m_len, ret);
+		ext4_fc_mark_ineligible(inode->i_sb);
 		ext4_mark_inode_dirty(handle, inode);
 		if (credits)
 			ret2 = ext4_journal_stop(handle);
@@ -5485,6 +5496,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
 	if (IS_SYNC(inode))
 		ext4_handle_sync(handle);
 	inode->i_mtime = inode->i_ctime = current_time(inode);
+	ext4_fc_mark_ineligible(inode->i_sb);
 	ext4_mark_inode_dirty(handle, inode);
 	ext4_update_inode_fsync_trans(handle, inode, 1);
 
@@ -5599,6 +5611,7 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
 	inode->i_size += len;
 	EXT4_I(inode)->i_disksize += len;
 	inode->i_mtime = inode->i_ctime = current_time(inode);
+	ext4_fc_mark_ineligible(inode->i_sb);
 	ret = ext4_mark_inode_dirty(handle, inode);
 	if (ret)
 		goto out_stop;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 764ff4c56233..97a9882a3363 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1175,6 +1175,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 		ei->i_datasync_tid = handle->h_transaction->t_tid;
 	}
 
+	ext4_fc_mark_ineligible(sb);
 	err = ext4_mark_inode_dirty(handle, inode);
 	if (err) {
 		ext4_std_error(sb, err);
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 88cdf3c90bd1..190968996bc6 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -435,6 +435,8 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle,
 	if (error)
 		goto out;
 
+	ext4_fc_mark_ineligible(inode->i_sb);
+
 	memset((void *)ext4_raw_inode(&is.iloc)->i_block,
 		0, EXT4_MIN_INLINE_DATA_SIZE);
 	memset(ei->i_data, 0, EXT4_MIN_INLINE_DATA_SIZE);
@@ -759,6 +761,8 @@ int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
 
 	ext4_write_unlock_xattr(inode, &no_expand);
 	brelse(iloc.bh);
+	ext4_fc_enqueue_inode(ext4_journal_current_handle(),
+					   inode);
 	mark_inode_dirty(inode);
 out:
 	return copied;
@@ -974,6 +978,8 @@ int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
 	 * ordering of page lock and transaction start for journaling
 	 * filesystems.
 	 */
+	ext4_fc_enqueue_inode(ext4_journal_current_handle(),
+					   inode);
 	mark_inode_dirty(inode);
 
 	return copied;
@@ -1165,6 +1171,7 @@ static int ext4_finish_convert_inline_dir(handle_t *handle,
 	if (err)
 		return err;
 	set_buffer_verified(dir_block);
+	ext4_fc_mark_ineligible(inode->i_sb);
 	return ext4_mark_inode_dirty(handle, inode);
 }
 
@@ -1216,6 +1223,8 @@ static int ext4_convert_inline_data_nolock(handle_t *handle,
 		goto out_restore;
 	}
 
+	ext4_fc_mark_ineligible(inode->i_sb);
+
 	data_bh = sb_getblk(inode->i_sb, map.m_pblk);
 	if (!data_bh) {
 		error = -ENOMEM;
@@ -1709,6 +1718,8 @@ int ext4_delete_inline_entry(handle_t *handle,
 	if (err)
 		goto out;
 
+	ext4_fc_enqueue_inode(handle, dir);
+
 	ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size);
 out:
 	ext4_write_unlock_xattr(dir, &no_expand);
@@ -1986,6 +1997,7 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
 
 	if (err == 0) {
 		inode->i_mtime = inode->i_ctime = current_time(inode);
+		ext4_fc_enqueue_inode(handle, inode);
 		err = ext4_mark_inode_dirty(handle, inode);
 		if (IS_SYNC(inode))
 			ext4_handle_sync(handle);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f230a888eddd..379e911b48c4 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -262,6 +262,7 @@ void ext4_evict_inode(struct inode *inode)
 		 * cleaned up.
 		 */
 		ext4_orphan_del(NULL, inode);
+		ext4_fc_del(inode);
 		sb_end_intwrite(inode->i_sb);
 		goto no_delete;
 	}
@@ -279,6 +280,8 @@ void ext4_evict_inode(struct inode *inode)
 	if (ext4_inode_is_fast_symlink(inode))
 		memset(EXT4_I(inode)->i_data, 0, sizeof(EXT4_I(inode)->i_data));
 	inode->i_size = 0;
+	ext4_fc_del(inode);
+	ext4_fc_mark_ineligible(inode->i_sb);
 	err = ext4_mark_inode_dirty(handle, inode);
 	if (err) {
 		ext4_warning(inode->i_sb,
@@ -303,6 +306,7 @@ void ext4_evict_inode(struct inode *inode)
 stop_handle:
 		ext4_journal_stop(handle);
 		ext4_orphan_del(NULL, inode);
+		ext4_fc_del(inode);
 		sb_end_intwrite(inode->i_sb);
 		ext4_xattr_inode_array_free(ea_inode_array);
 		goto no_delete;
@@ -326,6 +330,8 @@ void ext4_evict_inode(struct inode *inode)
 	 * having errors), but we can't free the inode if the mark_dirty
 	 * fails.
 	 */
+	ext4_fc_del(inode);
+	ext4_fc_mark_ineligible(inode->i_sb);
 	if (ext4_mark_inode_dirty(handle, inode))
 		/* If that failed, just do the required in-core inode clear. */
 		ext4_clear_inode(inode);
@@ -1436,8 +1442,10 @@ static int ext4_write_end(struct file *file,
 	 * ordering of page lock and transaction start for journaling
 	 * filesystems.
 	 */
-	if (i_size_changed || inline_data)
+	if (i_size_changed || inline_data) {
 		ext4_mark_inode_dirty(handle, inode);
+		ext4_fc_enqueue_inode(handle, inode);
+	}
 
 	if (pos + len > inode->i_size && ext4_can_truncate(inode))
 		/* if we have allocated more blocks and copied
@@ -1550,6 +1558,7 @@ static int ext4_journalled_write_end(struct file *file,
 		pagecache_isize_extended(inode, old_size, pos);
 
 	if (size_changed || inline_data) {
+		ext4_fc_enqueue_inode(handle, inode);
 		ret2 = ext4_mark_inode_dirty(handle, inode);
 		if (!ret)
 			ret = ret2;
@@ -2077,6 +2086,7 @@ static int __ext4_journalled_writepage(struct page *page,
 
 	if (inline_data) {
 		ret = ext4_mark_inode_dirty(handle, inode);
+		ext4_fc_enqueue_inode(handle, inode);
 	} else {
 		ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
 					     do_journal_get_write_access);
@@ -2604,6 +2614,7 @@ static int mpage_map_and_submit_extent(handle_t *handle,
 			EXT4_I(inode)->i_disksize = disksize;
 		up_write(&EXT4_I(inode)->i_data_sem);
 		err2 = ext4_mark_inode_dirty(handle, inode);
+		ext4_fc_enqueue_inode(handle, inode);
 		if (err2)
 			ext4_error(inode->i_sb,
 				   "Failed to mark inode %lu dirty",
@@ -3205,6 +3216,7 @@ static int ext4_da_write_end(struct file *file,
 			 * bu greater than i_disksize.(hint delalloc)
 			 */
 			ext4_mark_inode_dirty(handle, inode);
+			ext4_fc_enqueue_inode(handle, inode);
 		}
 	}
 
@@ -3614,8 +3626,12 @@ static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
 		ret = PTR_ERR(handle);
 		goto orphan_del;
 	}
-	if (ext4_update_inode_size(inode, offset + written))
+
+	if (ext4_update_inode_size(inode, offset + written)) {
 		ext4_mark_inode_dirty(handle, inode);
+		ext4_fc_enqueue_inode(handle, inode);
+	}
+
 	/*
 	 * We may need to truncate allocated but not written blocks beyond EOF.
 	 */
@@ -3851,6 +3867,7 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
 				 * ignore it.
 				 */
 				ext4_mark_inode_dirty(handle, inode);
+				ext4_fc_enqueue_inode(handle, inode);
 			}
 		}
 		err = ext4_journal_stop(handle);
@@ -4372,6 +4389,8 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 		goto out_dio;
 	}
 
+	ext4_fc_mark_ineligible(inode->i_sb);
+
 	ret = ext4_zero_partial_blocks(handle, inode, offset,
 				       length);
 	if (ret)
@@ -4525,6 +4544,7 @@ int ext4_truncate(struct inode *inode)
 	if (inode->i_size & (inode->i_sb->s_blocksize - 1))
 		ext4_block_truncate_page(handle, mapping, inode->i_size);
 
+	ext4_fc_mark_ineligible(inode->i_sb);
 	/*
 	 * We add the inode to the orphan list, so that if this
 	 * truncate spans multiple transactions, and we crash, we will
@@ -5593,6 +5613,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 		if (attr->ia_valid & ATTR_GID)
 			inode->i_gid = attr->ia_gid;
 		error = ext4_mark_inode_dirty(handle, inode);
+		ext4_fc_enqueue_inode(handle, inode);
 		ext4_journal_stop(handle);
 	}
 
@@ -5653,6 +5674,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 				inode->i_mtime = current_time(inode);
 				inode->i_ctime = inode->i_mtime;
 			}
+			ext4_fc_enqueue_inode(handle, inode);
 			down_write(&EXT4_I(inode)->i_data_sem);
 			EXT4_I(inode)->i_disksize = attr->ia_size;
 			rc = ext4_mark_inode_dirty(handle, inode);
@@ -5697,6 +5719,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 
 	if (!error) {
 		setattr_copy(inode, attr);
+		ext4_fc_enqueue_inode(ext4_journal_current_handle(),
+						   inode);
 		mark_inode_dirty(inode);
 	}
 
@@ -6109,6 +6133,7 @@ void ext4_dirty_inode(struct inode *inode, int flags)
 		goto out;
 
 	ext4_mark_inode_dirty(handle, inode);
+	ext4_fc_enqueue_inode(handle, inode);
 
 	ext4_journal_stop(handle);
 out:
@@ -6194,6 +6219,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
+	ext4_fc_mark_ineligible(inode->i_sb);
 	err = ext4_mark_inode_dirty(handle, inode);
 	ext4_handle_sync(handle);
 	ext4_journal_stop(handle);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 442f7ef873fc..c676fa118414 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -987,6 +987,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		err = mnt_want_write_file(filp);
 		if (err)
 			return err;
+		ext4_fc_mark_ineligible(sb);
 		err = swap_inode_boot_loader(sb, inode);
 		mnt_drop_write_file(filp);
 		return err;
@@ -997,6 +998,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		int err = 0, err2 = 0;
 		ext4_group_t o_group = EXT4_SB(sb)->s_groups_count;
 
+		ext4_fc_mark_ineligible(sb);
+
 		if (copy_from_user(&n_blocks_count, (__u64 __user *)arg,
 				   sizeof(__u64))) {
 			return -EFAULT;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index b1e4d359f73b..b995690d73ce 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -513,6 +513,7 @@ int ext4_ext_migrate(struct inode *inode)
 		 * work to orphan_list_cleanup()
 		 */
 		ext4_orphan_del(NULL, tmp_inode);
+		ext4_fc_del(inode);
 		retval = PTR_ERR(handle);
 		goto out;
 	}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 129029534075..e77ff130c045 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2022,6 +2022,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
 	ext4_update_dx_flag(dir);
 	inode_inc_iversion(dir);
 	ext4_mark_inode_dirty(handle, dir);
+	ext4_fc_mark_ineligible(dir->i_sb);
 	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
 	err = ext4_handle_dirty_dirblock(handle, dir, bh);
 	if (err)
@@ -2140,8 +2141,10 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
 	 * out all the changes we did so far. Otherwise we can end up
 	 * with corrupted filesystem.
 	 */
-	if (retval)
+	if (retval) {
 		ext4_mark_inode_dirty(handle, dir);
+		ext4_fc_mark_ineligible(dir->i_sb);
+	}
 	dx_release(frames);
 	brelse(bh2);
 	return retval;
@@ -2208,6 +2211,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 		ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
 		dx_fallback++;
 		ext4_mark_inode_dirty(handle, dir);
+		ext4_fc_mark_ineligible(dir->i_sb);
 	}
 	blocks = dir->i_size >> sb->s_blocksize_bits;
 	for (block = 0; block < blocks; block++) {
@@ -2553,6 +2557,7 @@ static int ext4_add_nondir(handle_t *handle,
 	int err = ext4_add_entry(handle, dentry, inode);
 	if (!err) {
 		ext4_mark_inode_dirty(handle, inode);
+		ext4_fc_mark_ineligible(inode->i_sb);
 		d_instantiate_new(dentry, inode);
 		return 0;
 	}
@@ -2661,6 +2666,7 @@ static int ext4_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
 		err = ext4_orphan_add(handle, inode);
 		if (err)
 			goto err_unlock_inode;
+		ext4_fc_enqueue_inode(handle, inode);
 		mark_inode_dirty(inode);
 		unlock_new_inode(inode);
 	}
@@ -2773,6 +2779,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	err = ext4_init_new_dir(handle, dir, inode);
 	if (err)
 		goto out_clear_inode;
+	ext4_fc_mark_ineligible(inode->i_sb);
 	err = ext4_mark_inode_dirty(handle, inode);
 	if (!err)
 		err = ext4_add_entry(handle, dentry, inode);
@@ -3114,6 +3121,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
 	inode->i_size = 0;
 	ext4_orphan_add(handle, inode);
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
+	ext4_fc_mark_ineligible(inode->i_sb);
 	ext4_mark_inode_dirty(handle, inode);
 	ext4_dec_count(handle, dir);
 	ext4_update_dx_flag(dir);
@@ -3192,6 +3200,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
 		goto end_unlink;
 	dir->i_ctime = dir->i_mtime = current_time(dir);
 	ext4_update_dx_flag(dir);
+	ext4_fc_mark_ineligible(dir->i_sb);
 	ext4_mark_inode_dirty(handle, dir);
 	drop_nlink(inode);
 	if (!inode->i_nlink)
@@ -3387,6 +3396,7 @@ static int ext4_link(struct dentry *old_dentry,
 
 	err = ext4_add_entry(handle, dentry, inode);
 	if (!err) {
+		ext4_fc_mark_ineligible(inode->i_sb);
 		ext4_mark_inode_dirty(handle, inode);
 		/* this can happen only for tmpfile being
 		 * linked the first time
@@ -3991,6 +4001,8 @@ static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry,
 	if (err)
 		return err;
 
+	ext4_fc_mark_ineligible(old_dir->i_sb);
+
 	if (flags & RENAME_EXCHANGE) {
 		return ext4_cross_rename(old_dir, old_dentry,
 					 new_dir, new_dentry);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7b4b35e940a1..f6e820384ee0 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1129,6 +1129,16 @@ static void ext4_destroy_inode(struct inode *inode)
 				true);
 		dump_stack();
 	}
+	if (!list_empty(&(EXT4_I(inode)->i_fc_list))) {
+#ifdef EXT4FS_DEBUG
+		if (EXT4_SB(inode->i_sb)->s_fc_eligible) {
+			pr_warn("%s: INODE %ld in FC List with FC allowd",
+				__func__, inode->i_ino);
+			dump_stack();
+		}
+#endif
+		ext4_fc_del(inode);
+	}
 }
 
 static void init_once(void *foo)
@@ -1181,6 +1191,7 @@ void ext4_clear_inode(struct inode *inode)
 		EXT4_I(inode)->jinode = NULL;
 	}
 	fscrypt_put_encryption_info(inode);
+	ext4_fc_del(inode);
 }
 
 static struct inode *ext4_nfs_get_inode(struct super_block *sb,
@@ -1325,6 +1336,7 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
 		 * S_DAX may be disabled
 		 */
 		ext4_set_inode_flags(inode);
+		ext4_fc_mark_ineligible(inode->i_sb);
 		res = ext4_mark_inode_dirty(handle, inode);
 		if (res)
 			EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
@@ -5795,6 +5807,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
 		EXT4_I(inode)->i_flags |= EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL;
 		inode_set_flags(inode, S_NOATIME | S_IMMUTABLE,
 				S_NOATIME | S_IMMUTABLE);
+		ext4_fc_mark_ineligible(inode->i_sb);
 		ext4_mark_inode_dirty(handle, inode);
 		ext4_journal_stop(handle);
 	unlock_inode:
@@ -5902,6 +5915,7 @@ static int ext4_quota_off(struct super_block *sb, int type)
 	EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL);
 	inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE);
 	inode->i_mtime = inode->i_ctime = current_time(inode);
+	ext4_fc_mark_ineligible(inode->i_sb);
 	ext4_mark_inode_dirty(handle, inode);
 	ext4_journal_stop(handle);
 out_unlock:
@@ -6008,6 +6022,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
 	if (inode->i_size < off + len) {
 		i_size_write(inode, off + len);
 		EXT4_I(inode)->i_disksize = inode->i_size;
+		ext4_fc_mark_ineligible(inode->i_sb);
 		ext4_mark_inode_dirty(handle, inode);
 	}
 	return len;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 491f9ee4040e..19bc4046658c 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1406,6 +1406,7 @@ static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
 	inode_unlock(ea_inode);
 
 	ext4_mark_inode_dirty(handle, ea_inode);
+	ext4_fc_enqueue_inode(handle, ea_inode);
 
 out:
 	brelse(bh);
-- 
2.22.0.657.g960e92d24f-goog


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 09/11] ext4: fast-commit commit range tracking
  2019-07-22  4:00 [PATCH 01/11] ext4: add handling for extended mount options Harshad Shirwadkar
                   ` (6 preceding siblings ...)
  2019-07-22  4:00 ` [PATCH 08/11] ext4: track changed files for fast commit Harshad Shirwadkar
@ 2019-07-22  4:00 ` Harshad Shirwadkar
  2019-07-22  4:00 ` [PATCH 10/11] ext4: fast-commit commit path changes Harshad Shirwadkar
                   ` (2 subsequent siblings)
  10 siblings, 0 replies; 25+ messages in thread
From: Harshad Shirwadkar @ 2019-07-22  4:00 UTC (permalink / raw)
  To: linux-ext4; +Cc: Harshad Shirwadkar

With this patch, we track logical range of file offsets that need to
be committed using fast commit. This allows us to find file extents
that need to be committed during the commit time.

Signed-Off-By: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
---
 fs/ext4/ext4_jbd2.c | 33 +++++++++++++++++++++++++++++++++
 fs/ext4/ext4_jbd2.h |  2 ++
 fs/ext4/inline.c    |  5 ++++-
 fs/ext4/inode.c     | 18 +++++++++++++++++-
 4 files changed, 56 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index b32b42076318..75754e98cc78 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -389,3 +389,36 @@ void ext4_fc_del(struct inode *inode)
 	list_del_init(&EXT4_I(inode)->i_fc_list);
 	mutex_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 }
+
+void ext4_fc_update_commit_range(handle_t *handle, struct inode *inode,
+				 loff_t start, loff_t end)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+
+	if (!ext4_should_fast_commit(inode->i_sb))
+		return;
+
+	if (!ext4_handle_valid(handle))
+		return;
+
+	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb))
+		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
+
+	if (!EXT4_SB(inode->i_sb)->s_fc_eligible)
+		return;
+
+	if (ei->i_fc.fc_tid == handle->h_transaction->t_tid &&
+	    ei->i_fc.fc_subtid ==
+	    handle->h_transaction->t_journal->j_subtid) {
+		ei->i_fc.fc_lblk_start = ei->i_fc.fc_lblk_start < start ?
+					 ei->i_fc.fc_lblk_start : start;
+		ei->i_fc.fc_lblk_end = ei->i_fc.fc_lblk_end > end ?
+				     ei->i_fc.fc_lblk_end : end;
+		return;
+	}
+
+	ei->i_fc.fc_lblk_start = start;
+	ei->i_fc.fc_lblk_end = end;
+	ei->i_fc.fc_subtid = handle->h_transaction->t_journal->j_subtid;
+	ei->i_fc.fc_tid = handle->h_transaction->t_tid;
+}
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 130297255624..d5958d4a0846 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -485,5 +485,7 @@ ext4_fc_mark_ineligible(struct super_block *sb)
 	mutex_unlock(&sbi->s_fc_lock);
 }
 
+void ext4_fc_update_commit_range(handle_t *handle, struct inode *inode,
+				 loff_t start, loff_t end);
 
 #endif	/* _EXT4_JBD2_H */
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 190968996bc6..de61c15e1b17 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -967,8 +967,11 @@ int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
 	 * But it's important to update i_size while still holding page lock:
 	 * page writeout could otherwise come in and zero beyond i_size.
 	 */
-	if (pos+copied > inode->i_size)
+	if (pos+copied > inode->i_size) {
+		ext4_fc_update_commit_range(ext4_journal_current_handle(),
+					    inode, inode->i_size, pos + copied);
 		i_size_write(inode, pos+copied);
+	}
 	unlock_page(page);
 	put_page(page);
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 379e911b48c4..f79b185c013e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1549,6 +1549,8 @@ static int ext4_journalled_write_end(struct file *file,
 			SetPageUptodate(page);
 	}
 	size_changed = ext4_update_inode_size(inode, pos + copied);
+	ext4_fc_update_commit_range(handle, inode, pos, pos + copied);
+
 	ext4_set_inode_state(inode, EXT4_STATE_JDATA);
 	EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
 	unlock_page(page);
@@ -2610,8 +2612,12 @@ static int mpage_map_and_submit_extent(handle_t *handle,
 		i_size = i_size_read(inode);
 		if (disksize > i_size)
 			disksize = i_size;
-		if (disksize > EXT4_I(inode)->i_disksize)
+		if (disksize > EXT4_I(inode)->i_disksize) {
+			ext4_fc_update_commit_range(handle, inode,
+						    EXT4_I(inode)->i_disksize,
+						    disksize);
 			EXT4_I(inode)->i_disksize = disksize;
+		}
 		up_write(&EXT4_I(inode)->i_data_sem);
 		err2 = ext4_mark_inode_dirty(handle, inode);
 		ext4_fc_enqueue_inode(handle, inode);
@@ -3220,6 +3226,8 @@ static int ext4_da_write_end(struct file *file,
 		}
 	}
 
+	ext4_fc_update_commit_range(handle, inode, pos, pos + copied);
+
 	if (write_mode != CONVERT_INLINE_DATA &&
 	    ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
 	    ext4_has_inline_data(inode))
@@ -3627,6 +3635,7 @@ static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
 		goto orphan_del;
 	}
 
+	ext4_fc_update_commit_range(handle, inode, offset, offset + written);
 	if (ext4_update_inode_size(inode, offset + written)) {
 		ext4_mark_inode_dirty(handle, inode);
 		ext4_fc_enqueue_inode(handle, inode);
@@ -3751,6 +3760,8 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
 		ext4_update_i_disksize(inode, inode->i_size);
 		ext4_journal_stop(handle);
 	}
+	ext4_fc_update_commit_range(journal_current_handle(), inode, offset,
+				    offset + count);
 
 	BUG_ON(iocb->private == NULL);
 
@@ -3869,6 +3880,8 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
 				ext4_mark_inode_dirty(handle, inode);
 				ext4_fc_enqueue_inode(handle, inode);
 			}
+			ext4_fc_update_commit_range(handle, inode, offset,
+						    offset + end);
 		}
 		err = ext4_journal_stop(handle);
 		if (ret == 0)
@@ -5327,6 +5340,9 @@ static int ext4_do_update_inode(handle_t *handle,
 			cpu_to_le16(ei->i_file_acl >> 32);
 	raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
 	if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
+		ext4_fc_update_commit_range(handle, inode,
+					    ext4_isize(inode->i_sb, raw_inode),
+					    ei->i_disksize);
 		ext4_isize_set(raw_inode, ei->i_disksize);
 		need_datasync = 1;
 	}
-- 
2.22.0.657.g960e92d24f-goog


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 10/11] ext4: fast-commit commit path changes
  2019-07-22  4:00 [PATCH 01/11] ext4: add handling for extended mount options Harshad Shirwadkar
                   ` (7 preceding siblings ...)
  2019-07-22  4:00 ` [PATCH 09/11] ext4: fast-commit commit range tracking Harshad Shirwadkar
@ 2019-07-22  4:00 ` Harshad Shirwadkar
  2019-07-22  4:00 ` [PATCH 11/11] ext4: fast-commit recovery " Harshad Shirwadkar
  2019-07-22 18:15 ` [PATCH 01/11] ext4: add handling for extended mount options Andreas Dilger
  10 siblings, 0 replies; 25+ messages in thread
From: Harshad Shirwadkar @ 2019-07-22  4:00 UTC (permalink / raw)
  To: linux-ext4; +Cc: Harshad Shirwadkar

This patch implements the actual commit path for fast commit. Based on
inodes tracked and their respective logical ranges remembered, this
patch adds code to create a fast commit block that stores extents
added to the inode. We use new JBD2 interfaces added in previous
patches in this series. The fast commit blocks that are created have
extents that _should_ be present in the file. It doesn't yet support
removing of extents, making operations such as truncate, delete fast
commit incompatible.

Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
---
 fs/ext4/ext4.h    |  26 ++++++
 fs/ext4/extents.c |   8 +-
 fs/ext4/fsync.c   |   2 +-
 fs/ext4/inode.c   |   5 +-
 fs/ext4/super.c   | 213 +++++++++++++++++++++++++++++++++++++++++++++-
 5 files changed, 246 insertions(+), 8 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 92dc4432c7ed..5d92a2e4f0af 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2276,6 +2276,32 @@ struct mmpd_data {
  */
 #define EXT4_MMP_MAX_CHECK_INTERVAL	300UL
 
+/* Magic of fast commit header */
+#define EXT4_FC_MAGIC			0xE2540090
+
+struct ext4_fc_commit_hdr {
+	__le32 fc_magic;
+	/* JBD2 tid after which this fast commit should be applied */
+	__le32 fc_tid;
+	/* Sub transaction ID */
+	__le32 fc_subtid;
+	/* Length of this partial commit in terms of num blocks */
+	__le32 fc_len;
+	/* Inode number */
+	__le32 fc_ino;
+	/* ext4 inode on disk copy */
+	struct ext4_inode inode;
+	/* Csum(hdr+contents) */
+	__le32 fc_csum;
+};
+
+#define EXT4_FC_TAG_EXT		0x1	/* Extent */
+
+struct ext4_fc_tl {
+	__le16 fc_tag;
+	__le16 fc_len;
+};
+
 /*
  * Function prototypes
  */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index eb77e306a82b..66f7f4fb1612 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4899,10 +4899,10 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	if (ret)
 		goto out;
 
-	if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
-		ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
-						EXT4_I(inode)->i_sync_tid);
-	}
+	if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal)
+		ret = jbd2_fc_complete_commit(
+		    EXT4_SB(inode->i_sb)->s_journal, EXT4_I(inode)->i_sync_tid,
+		    journal_current_handle()->h_journal->j_subtid);
 out:
 	inode_unlock(inode);
 	trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 5508baa11bb6..4f783f9723c5 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -151,7 +151,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	if (journal->j_flags & JBD2_BARRIER &&
 	    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
 		needs_barrier = true;
-	ret = jbd2_complete_transaction(journal, commit_tid);
+	ret = jbd2_fc_complete_commit(journal, commit_tid, journal->j_subtid);
 	if (needs_barrier) {
 	issue_flush:
 		err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f79b185c013e..dd5d39a48363 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5476,8 +5476,9 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
 		if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
 			return 0;
 
-		err = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
-						EXT4_I(inode)->i_sync_tid);
+		err = jbd2_fc_complete_commit(
+		    EXT4_SB(inode->i_sb)->s_journal, EXT4_I(inode)->i_sync_tid,
+		    EXT4_SB(inode->i_sb)->s_journal->j_subtid);
 	} else {
 		struct ext4_iloc iloc;
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f6e820384ee0..a291d41b91de 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -437,6 +437,214 @@ static bool system_going_down(void)
 		|| system_state == SYSTEM_RESTART;
 }
 
+static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
+{
+	struct buffer_head *orig_bh = bh->b_private;
+
+	BUFFER_TRACE(bh, "");
+	if (uptodate) {
+		ext4_debug("%s: Block %lld up-to-date",
+			   __func__, bh->b_blocknr);
+		set_buffer_uptodate(bh);
+	} else {
+		ext4_debug("%s: Block %lld not up-to-date",
+			   __func__, bh->b_blocknr);
+		clear_buffer_uptodate(bh);
+	}
+	if (orig_bh) {
+		clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
+		/* Protect BH_Shadow bit in b_state */
+		smp_mb__after_atomic();
+		wake_up_bit(&orig_bh->b_state, BH_Shadow);
+	}
+	unlock_buffer(bh);
+}
+
+static int ext4_fc_write_inode(journal_t *journal, struct inode *inode,
+			       tid_t tid, tid_t subtid)
+{
+	loff_t old_blk_size, cur_lblk_off, new_blk_size;
+	struct super_block *sb = journal->j_private;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_fc_commit_hdr *fc_hdr;
+	struct ext4_map_blocks map;
+	struct ext4_iloc iloc;
+	struct ext4_fc_tl tl;
+	struct ext4_extent extent;
+	struct buffer_head *bh;
+	__u8 *cur, *end;
+	int ret;
+
+	if (tid != ei->i_fc.fc_tid || subtid != ei->i_fc.fc_subtid) {
+		jbd_debug(3,
+			  "File not modified. Modified %d:%d, expected %d:%d",
+			  ei->i_fc.fc_tid, ei->i_fc.fc_subtid, tid, subtid);
+		return 0;
+	}
+
+	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		return -ECANCELED;
+
+	ret = ext4_get_inode_loc(inode, &iloc);
+	if (ret)
+		return ret;
+
+	ret = jbd2_map_fc_buf(journal, &bh);
+	if (ret)
+		return -ENOMEM;
+
+	end = (__u8 *)bh->b_data + journal->j_blocksize;
+
+	old_blk_size = (ei->i_fc.fc_lblk_start + sb->s_blocksize - 1) >>
+		       inode->i_blkbits;
+	new_blk_size = ei->i_fc.fc_lblk_end >> inode->i_blkbits;
+
+	jbd_debug(3, "Committing as tid = %d, subtid = %d on buffer %lld\n",
+		  tid, subtid, bh->b_blocknr);
+
+	ei->i_fc.fc_lblk_start = ei->i_fc.fc_lblk_end;
+
+	fc_hdr = (struct ext4_fc_commit_hdr *)
+			((__u8 *)bh->b_data + sizeof(journal_header_t));
+	fc_hdr->fc_magic = cpu_to_le32(EXT4_FC_MAGIC);
+	fc_hdr->fc_tid = cpu_to_le32(tid);
+	fc_hdr->fc_subtid = cpu_to_le32(subtid);
+	fc_hdr->fc_len = cpu_to_le32(0x1);
+	fc_hdr->fc_ino = cpu_to_le32(inode->i_ino);
+
+	memcpy(&fc_hdr->inode, ext4_raw_inode(&iloc), EXT4_INODE_SIZE(sb));
+	cur = (__u8 *)(fc_hdr + 1);
+
+	cur_lblk_off = old_blk_size;
+	while (cur_lblk_off <= new_blk_size) {
+		map.m_lblk = cur_lblk_off;
+		map.m_len = new_blk_size - cur_lblk_off + 1;
+		ret = ext4_map_blocks(NULL, inode, &map, 0);
+		if (!ret) {
+			cur_lblk_off += map.m_len;
+			continue;
+		}
+
+		if (map.m_flags & EXT4_MAP_UNWRITTEN)
+			return -ECANCELED;
+		extent.ee_block = cpu_to_le32(map.m_lblk);
+		cur_lblk_off += map.m_len;
+		if (cur + sizeof(struct ext4_extent) +
+		    sizeof(struct ext4_fc_tl) >= end)
+			return -ENOSPC;
+
+		tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_EXT);
+		tl.fc_len = cpu_to_le16(sizeof(struct ext4_extent));
+		extent.ee_len = cpu_to_le16(map.m_len);
+		ext4_ext_store_pblock(&extent, map.m_pblk);
+		if (map.m_flags & EXT4_MAP_UNWRITTEN)
+			ext4_ext_mark_unwritten(&extent);
+		else
+			ext4_ext_mark_initialized(&extent);
+		memcpy(cur, &tl, sizeof(struct ext4_fc_tl));
+		cur += sizeof(struct ext4_fc_tl);
+		memcpy(cur, &extent, sizeof(struct ext4_extent));
+		cur += sizeof(struct ext4_extent);
+	}
+
+	jbd_debug(3, "Created FC block for inode %ld with [%d, %d]",
+		  inode->i_ino, tid, subtid);
+
+	return 1;
+}
+
+static void ext4_journal_fc_cleanup_cb(journal_t *journal)
+{
+	struct super_block *sb = journal->j_private;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_inode_info *iter;
+	struct inode *inode;
+
+	mutex_lock(&sbi->s_fc_lock);
+	while (!list_empty(&sbi->s_fc_q)) {
+		iter = list_first_entry(&sbi->s_fc_q,
+				  struct ext4_inode_info, i_fc_list);
+		list_del_init(&iter->i_fc_list);
+		inode = &iter->vfs_inode;
+	}
+	INIT_LIST_HEAD(&sbi->s_fc_q);
+	sbi->s_fc_q_cnt = 0;
+	mutex_unlock(&sbi->s_fc_lock);
+}
+
+/*
+ * Fast-commit commit callback. There is contention between sbi->s_fc_lock and
+ * i_data_sem. Locking order is - i_data_sem then s_fc_lock
+ */
+static int ext4_journal_fc_commit_cb(journal_t *journal, tid_t tid,
+				     tid_t subtid)
+{
+	struct super_block *sb = journal->j_private;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct list_head *pos, *tmp;
+	struct ext4_inode_info *iter;
+	struct jbd2_inode *jinode;
+	int num_bufs = 0, ret;
+
+	sbi = sbi;
+	mutex_lock(&sbi->s_fc_lock);
+	if (!sbi->s_fc_eligible) {
+		sbi->s_fc_eligible = true;
+		mutex_unlock(&sbi->s_fc_lock);
+		return -ECANCELED;
+	}
+
+	list_for_each_safe(pos, tmp, &sbi->s_fc_q) {
+		struct inode *inode;
+
+		iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
+		inode = &iter->vfs_inode;
+
+		mutex_unlock(&sbi->s_fc_lock);
+		/*
+		 * Release s_fc_lock here since fc_write_inode calls
+		 * ext4_map_blocks which needs i_data_sem.
+		 */
+		ret = ext4_fc_write_inode(journal, inode, tid, subtid);
+		if (ret < 0)
+			return ret;
+		mutex_lock(&sbi->s_fc_lock);
+
+		num_bufs += ret;
+	}
+
+	/* Submit data buffers first */
+	list_for_each(pos, &sbi->s_fc_q) {
+		iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
+		jinode = iter->jinode;
+		ret = jbd2_submit_inode_data(journal, jinode);
+		if (ret) {
+			mutex_unlock(&sbi->s_fc_lock);
+			return ret;
+		}
+	}
+
+	if (num_bufs == 0) {
+		mutex_unlock(&sbi->s_fc_lock);
+		return 0;
+	}
+
+	/*
+	 * Before returning, check if s_fc_eligible was modified since we
+	 * started.
+	 */
+	if (!sbi->s_fc_eligible) {
+		mutex_unlock(&sbi->s_fc_lock);
+		return -ECANCELED;
+	}
+
+	mutex_unlock(&sbi->s_fc_lock);
+
+	jbd_debug(3, "%s: Journal blocks ready for fast commit\n", __func__);
+
+	return jbd2_submit_fc_bufs(journal, ext4_end_buffer_io_sync);
+}
+
 /* Deal with the reporting of failure conditions on a filesystem such as
  * inconsistencies detected or read IO failures.
  *
@@ -4723,7 +4931,10 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
 	journal->j_commit_interval = sbi->s_commit_interval;
 	journal->j_min_batch_time = sbi->s_min_batch_time;
 	journal->j_max_batch_time = sbi->s_max_batch_time;
-
+	if (ext4_should_fast_commit(sb)) {
+		journal->j_fc_commit_callback = ext4_journal_fc_commit_cb;
+		journal->j_fc_cleanup_callback = ext4_journal_fc_cleanup_cb;
+	}
 	write_lock(&journal->j_state_lock);
 	if (test_opt(sb, BARRIER))
 		journal->j_flags |= JBD2_BARRIER;
-- 
2.22.0.657.g960e92d24f-goog


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 11/11] ext4: fast-commit recovery path changes
  2019-07-22  4:00 [PATCH 01/11] ext4: add handling for extended mount options Harshad Shirwadkar
                   ` (8 preceding siblings ...)
  2019-07-22  4:00 ` [PATCH 10/11] ext4: fast-commit commit path changes Harshad Shirwadkar
@ 2019-07-22  4:00 ` Harshad Shirwadkar
  2019-07-22 21:34   ` kbuild test robot
  2019-07-22 18:15 ` [PATCH 01/11] ext4: add handling for extended mount options Andreas Dilger
  10 siblings, 1 reply; 25+ messages in thread
From: Harshad Shirwadkar @ 2019-07-22  4:00 UTC (permalink / raw)
  To: linux-ext4; +Cc: Harshad Shirwadkar

This patch adds core fast-commit recovery path changes. Each fast
commit block stores modified extents for a particular file. Replay
code maps blocks in each such extent to the actual file one-by-one. We
also update corresponding file system metadata to account for newly
mapped blocks. In order to achieve all of these,
ext4_inode_csum_set(), ext4_inode_blocks() which were earlier static
are now made visible.

I updated e2fsprogs to set fast commit feature flag and to ignore fast
commit blocks during e2fsck. After applying all the patches in this
series, following runs of xfstests were performed:

- kvm-xfstest.sh -g log -c 4k
- kvm-xfstests.sh smoke

All the log tests were successful and smoke tests didn't introduce any
additional failures.

Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
---
 fs/ext4/balloc.c  |   7 ++-
 fs/ext4/ext4.h    |   4 ++
 fs/ext4/extents.c |  19 +++++---
 fs/ext4/inode.c   |   8 ++--
 fs/ext4/mballoc.c |  83 ++++++++++++++++++++++++++++++++
 fs/ext4/mballoc.h |   2 +
 fs/ext4/super.c   | 119 ++++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 230 insertions(+), 12 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 0b202e00d93f..75c3025c7089 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -360,7 +360,12 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
 				      struct buffer_head *bh)
 {
 	ext4_fsblk_t	blk;
-	struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
+	struct ext4_group_info *grp;
+
+	if (EXT4_SB(sb)->fc_replay)
+		return 0;
+
+	grp = ext4_get_group_info(sb, block_group);
 
 	if (buffer_verified(bh))
 		return 0;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 5d92a2e4f0af..44a4d16c241c 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2577,6 +2577,10 @@ extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
 extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid);
 
 /* inode.c */
+void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
+			 struct ext4_inode_info *ei);
+blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
+			   struct ext4_inode_info *ei);
 int ext4_inode_is_fast_symlink(struct inode *inode);
 struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
 struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 66f7f4fb1612..59fe596ce97d 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2894,7 +2894,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
 	int depth = ext_depth(inode);
 	struct ext4_ext_path *path = NULL;
 	struct partial_cluster partial;
-	handle_t *handle;
+	handle_t *handle = NULL;
 	int i = 0, err = 0;
 
 	partial.pclu = 0;
@@ -2904,9 +2904,11 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
 	ext_debug("truncate since %u to %u\n", start, end);
 
 	/* probably first extent we're gonna free will be last in block */
-	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1);
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
+	if (!sbi->fc_replay) {
+		handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1);
+		if (IS_ERR(handle))
+			return PTR_ERR(handle);
+	}
 
 again:
 	trace_ext4_ext_remove_space(inode, start, end, depth);
@@ -2926,7 +2928,8 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
 		/* find extent for or closest extent to this block */
 		path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE);
 		if (IS_ERR(path)) {
-			ext4_journal_stop(handle);
+			if (!sbi->fc_replay)
+				ext4_journal_stop(handle);
 			return PTR_ERR(path);
 		}
 		depth = ext_depth(inode);
@@ -3012,7 +3015,8 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
 		path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
 			       GFP_NOFS);
 		if (path == NULL) {
-			ext4_journal_stop(handle);
+			if (!sbi->fc_replay)
+				ext4_journal_stop(handle);
 			return -ENOMEM;
 		}
 		path[0].p_maxdepth = path[0].p_depth = depth;
@@ -3142,7 +3146,8 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
 	path = NULL;
 	if (err == -EAGAIN)
 		goto again;
-	ext4_journal_stop(handle);
+	if (!sbi->fc_replay)
+		ext4_journal_stop(handle);
 
 	return err;
 }
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index dd5d39a48363..21c9b5197c72 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -103,8 +103,8 @@ static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
 	return provided == calculated;
 }
 
-static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
-				struct ext4_inode_info *ei)
+void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
+			 struct ext4_inode_info *ei)
 {
 	__u32 csum;
 
@@ -4801,8 +4801,8 @@ void ext4_set_inode_flags(struct inode *inode)
 			S_ENCRYPTED|S_CASEFOLD);
 }
 
-static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
-				  struct ext4_inode_info *ei)
+blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
+			   struct ext4_inode_info *ei)
 {
 	blkcnt_t i_blocks ;
 	struct inode *inode = &(ei->vfs_inode);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a3e2767bdf2f..70551fa91237 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2915,6 +2915,89 @@ void ext4_exit_mballoc(void)
 }
 
 
+void ext4_mb_mark_used(struct super_block *sb, ext4_fsblk_t block,
+		       int len)
+{
+	struct buffer_head *bitmap_bh = NULL;
+	struct ext4_group_desc *gdp;
+	struct buffer_head *gdp_bh;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	ext4_group_t group;
+	ext4_fsblk_t cluster;
+	ext4_grpblk_t blkoff;
+	int i, clen, err;
+	int already_allocated_count;
+
+	cluster = EXT4_B2C(sbi, block);
+	clen = EXT4_B2C(sbi, len);
+
+	ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
+	bitmap_bh = ext4_read_block_bitmap(sb, group);
+	if (IS_ERR(bitmap_bh)) {
+		err = PTR_ERR(bitmap_bh);
+		bitmap_bh = NULL;
+		goto out_err;
+	}
+
+	err = -EIO;
+	gdp = ext4_get_group_desc(sb, group, &gdp_bh);
+	if (!gdp)
+		goto out_err;
+
+	if (!ext4_data_block_valid(sbi, block, len)) {
+		ext4_error(sb, "Allocating blks %llu-%llu which overlap mdata",
+			   cluster, cluster+clen);
+		/* File system mounted not to panic on error
+		 * Fix the bitmap and return EFSCORRUPTED
+		 * We leak some of the blocks here.
+		 */
+		ext4_lock_group(sb, group);
+		ext4_set_bits(bitmap_bh->b_data, blkoff, clen);
+		ext4_unlock_group(sb, group);
+		err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
+		if (!err)
+			err = -EFSCORRUPTED;
+		goto out_err;
+	}
+
+	ext4_lock_group(sb, group);
+	already_allocated_count = 0;
+	for (i = 0; i < clen; i++)
+		if (mb_test_bit(blkoff + i, bitmap_bh->b_data))
+			already_allocated_count++;
+
+	ext4_set_bits(bitmap_bh->b_data, blkoff, clen);
+	if (ext4_has_group_desc_csum(sb) &&
+	    (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
+		gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
+		ext4_free_group_clusters_set(sb, gdp,
+					     ext4_free_clusters_after_init(sb,
+						group, gdp));
+	}
+	clen = ext4_free_group_clusters(sb, gdp) - clen +
+	       already_allocated_count;
+	ext4_free_group_clusters_set(sb, gdp, clen);
+	ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
+	ext4_group_desc_csum_set(sb, group, gdp);
+
+	ext4_unlock_group(sb, group);
+
+	if (sbi->s_log_groups_per_flex) {
+		ext4_group_t flex_group = ext4_flex_group(sbi, group);
+
+		atomic64_sub(len,
+			     &sbi->s_flex_groups[flex_group].free_clusters);
+	}
+
+	err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
+	if (err)
+		goto out_err;
+	err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
+
+out_err:
+	brelse(bitmap_bh);
+}
+
 /*
  * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps
  * Returns 0 if success or error code
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 88c98f17e3d9..1881710041b6 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -215,4 +215,6 @@ ext4_mballoc_query_range(
 	ext4_mballoc_query_range_fn	formatter,
 	void				*priv);
 
+void ext4_mb_mark_used(struct super_block *sb, ext4_fsblk_t block,
+		       int len);
 #endif
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a291d41b91de..f38ff2089389 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -408,6 +408,118 @@ static int block_device_ejected(struct super_block *sb)
 	return bdi->dev == NULL;
 }
 
+static void ext4_fc_add_block(struct inode *inode, ext4_lblk_t lblk,
+			      ext4_fsblk_t pblk, int unwritten)
+{
+	struct ext4_extent ex;
+	struct ext4_ext_path *path = NULL;
+	struct ext4_map_blocks map;
+	int ret;
+
+	map.m_lblk = lblk;
+	map.m_len = 0x1;
+	ret = ext4_map_blocks(NULL, inode, &map, 0);
+	if (ret > 0) {
+		if (pblk != map.m_pblk)
+			jbd_debug(1, "Bad mapping found while replaying fc\n");
+		return;
+	}
+
+	ex.ee_block = cpu_to_le32(lblk);
+	ext4_ext_store_pblock(&ex, pblk);
+	ex.ee_len = cpu_to_le32(0x1);
+	if (unwritten)
+		ext4_ext_mark_unwritten(&ex);
+
+	path = ext4_find_extent(inode, lblk, NULL, 0);
+	if (path) {
+		down_write(&EXT4_I(inode)->i_data_sem);
+		ret = ext4_ext_insert_extent(NULL, inode, &path, &ex, 0);
+		ext4_mb_mark_used(inode->i_sb, ext4_ext_pblock(&ex), 0x1);
+		up_write((&EXT4_I(inode)->i_data_sem));
+		kfree(path);
+	}
+}
+
+static int ext4_journal_fc_replay_cb(journal_t *journal, struct buffer_head *bh)
+{
+	struct super_block *sb = journal->j_private;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_fc_commit_hdr *fc_hdr;
+	struct ext4_fc_tl *tl;
+	struct ext4_iloc iloc;
+	struct ext4_extent *ex;
+	struct inode *inode;
+	int ret;
+
+	sbi->fc_replay = true;
+	fc_hdr = (struct ext4_fc_commit_hdr *)
+		  ((__u8 *)bh->b_data + sizeof(journal_header_t));
+
+	jbd_debug(3, "%s: Got FC block for inode %d at [%d,%d]", __func__,
+	       le32_to_cpu(fc_hdr->fc_ino), le32_to_cpu(fc_hdr->fc_tid),
+	       le32_to_cpu(fc_hdr->fc_subtid));
+
+	inode = ext4_iget(sb, le32_to_cpu(fc_hdr->fc_ino), EXT4_IGET_NORMAL);
+	if (IS_ERR(inode))
+		return 0;
+
+	ret = ext4_get_inode_loc(inode, &iloc);
+	if (ret)
+		return ret;
+
+	inode_lock(inode);
+	tl = (struct ext4_fc_tl *)(fc_hdr + 1);
+	while (le16_to_cpu(tl->fc_tag) == EXT4_FC_TAG_EXT) {
+		int i;
+
+		ex = (struct ext4_extent *)(tl + 1);
+		tl = (struct ext4_fc_tl *)((__u8 *)tl +
+					   le16_to_cpu(tl->fc_len) +
+					   sizeof(*tl));
+		/*
+		 * We add block by block because part of extent may already have
+		 * been added by a previous fast commit replay.
+		 */
+		for (i = 0; i < ext4_ext_get_actual_len(ex); i++)
+			ext4_fc_add_block(inode, le32_to_cpu(ex->ee_block) + i,
+					  ext4_ext_pblock(ex) + i,
+					  ext4_ext_is_unwritten(ex));
+	}
+
+	/*
+	 * Unless inode contains inline data, copy everything except
+	 * i_blocks. i_blocks would have been set alright by ext4_fc_add_block
+	 * call above.
+	 */
+	if (ext4_has_inline_data(inode)) {
+		memcpy(ext4_raw_inode(&iloc), &fc_hdr->inode,
+		       sizeof(struct ext4_inode));
+	} else {
+		memcpy(ext4_raw_inode(&iloc), &fc_hdr->inode,
+		       offsetof(struct ext4_inode, i_block));
+		memcpy(&ext4_raw_inode(&iloc)->i_generation,
+		       &fc_hdr->inode.i_generation,
+		       sizeof(struct ext4_inode) -
+		       offsetof(struct ext4_inode, i_generation));
+	}
+
+	ext4_reserve_inode_write(NULL, inode, &iloc);
+	inode_unlock(inode);
+	sbi->fc_replay = false;
+
+	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
+	ret = ext4_handle_dirty_metadata(NULL, inode, iloc.bh);
+	iput(inode);
+	if (!ret)
+		ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+
+	brelse(iloc.bh);
+
+	return ret;
+}
+
+
 static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
 {
 	struct super_block		*sb = journal->j_private;
@@ -4935,6 +5047,13 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
 		journal->j_fc_commit_callback = ext4_journal_fc_commit_cb;
 		journal->j_fc_cleanup_callback = ext4_journal_fc_cleanup_cb;
 	}
+
+	/*
+	 * We set replay callback even if fast commit disabled because we may
+	 * could still have fast commit blocks that need to be replayed even if
+	 * fast commit has now been turned off.
+	 */
+	journal->j_fc_replay_callback = ext4_journal_fc_replay_cb;
 	write_lock(&journal->j_state_lock);
 	if (test_opt(sb, BARRIER))
 		journal->j_flags |= JBD2_BARRIER;
-- 
2.22.0.657.g960e92d24f-goog


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH 05/11] jbd2: fast-commit commit path new APIs
  2019-07-22  4:00 ` [PATCH 05/11] jbd2: fast-commit commit path new APIs Harshad Shirwadkar
@ 2019-07-22 17:45   ` kbuild test robot
  2019-07-22 21:02   ` kbuild test robot
  1 sibling, 0 replies; 25+ messages in thread
From: kbuild test robot @ 2019-07-22 17:45 UTC (permalink / raw)
  To: Harshad Shirwadkar; +Cc: kbuild-all, linux-ext4, Harshad Shirwadkar

[-- Attachment #1: Type: text/plain, Size: 8824 bytes --]

Hi Harshad,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[cannot apply to v5.3-rc1 next-20190722]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Harshad-Shirwadkar/ext4-add-handling-for-extended-mount-options/20190723-001855
config: i386-defconfig (attached as .config)
compiler: gcc-7 (Debian 7.4.0-10) 7.4.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>

All error/warnings (new ones prefixed by >>):

   In file included from include/linux/linkage.h:7:0,
                    from include/linux/kernel.h:8,
                    from include/linux/list.h:9,
                    from include/linux/module.h:9,
                    from fs/jbd2/journal.c:22:
>> include/linux/export.h:81:20: error: redefinition of '__kstrtab_jbd2_complete_transaction'
     static const char __kstrtab_##sym[]    \
                       ^
>> include/linux/export.h:120:25: note: in expansion of macro '___EXPORT_SYMBOL'
    #define __EXPORT_SYMBOL ___EXPORT_SYMBOL
                            ^~~~~~~~~~~~~~~~
>> include/linux/export.h:124:2: note: in expansion of macro '__EXPORT_SYMBOL'
     __EXPORT_SYMBOL(sym, "")
     ^~~~~~~~~~~~~~~
>> fs/jbd2/journal.c:839:1: note: in expansion of macro 'EXPORT_SYMBOL'
    EXPORT_SYMBOL(jbd2_complete_transaction);
    ^~~~~~~~~~~~~
   include/linux/export.h:81:20: note: previous definition of '__kstrtab_jbd2_complete_transaction' was here
     static const char __kstrtab_##sym[]    \
                       ^
>> include/linux/export.h:120:25: note: in expansion of macro '___EXPORT_SYMBOL'
    #define __EXPORT_SYMBOL ___EXPORT_SYMBOL
                            ^~~~~~~~~~~~~~~~
>> include/linux/export.h:124:2: note: in expansion of macro '__EXPORT_SYMBOL'
     __EXPORT_SYMBOL(sym, "")
     ^~~~~~~~~~~~~~~
   fs/jbd2/journal.c:812:1: note: in expansion of macro 'EXPORT_SYMBOL'
    EXPORT_SYMBOL(jbd2_complete_transaction);
    ^~~~~~~~~~~~~

vim +/__kstrtab_jbd2_complete_transaction +81 include/linux/export.h

7290d58095712a8 Ard Biesheuvel  2018-08-21   76  
f50169324df4ad9 Paul Gortmaker  2011-05-23   77  /* For every exported symbol, place a struct in the __ksymtab section */
f235541699bcf14 Nicolas Pitre   2016-01-22   78  #define ___EXPORT_SYMBOL(sym, sec)					\
f50169324df4ad9 Paul Gortmaker  2011-05-23   79  	extern typeof(sym) sym;						\
f50169324df4ad9 Paul Gortmaker  2011-05-23   80  	__CRC_SYMBOL(sym, sec)						\
f50169324df4ad9 Paul Gortmaker  2011-05-23  @81  	static const char __kstrtab_##sym[]				\
7290d58095712a8 Ard Biesheuvel  2018-08-21   82  	__attribute__((section("__ksymtab_strings"), used, aligned(1)))	\
94e58e0ac31284f Masahiro Yamada 2018-05-09   83  	= #sym;								\
7290d58095712a8 Ard Biesheuvel  2018-08-21   84  	__KSYMTAB_ENTRY(sym, sec)
f50169324df4ad9 Paul Gortmaker  2011-05-23   85  
f922c4abdf76485 Ard Biesheuvel  2018-08-21   86  #if defined(__DISABLE_EXPORTS)
f922c4abdf76485 Ard Biesheuvel  2018-08-21   87  
f922c4abdf76485 Ard Biesheuvel  2018-08-21   88  /*
f922c4abdf76485 Ard Biesheuvel  2018-08-21   89   * Allow symbol exports to be disabled completely so that C code may
f922c4abdf76485 Ard Biesheuvel  2018-08-21   90   * be reused in other execution contexts such as the UEFI stub or the
f922c4abdf76485 Ard Biesheuvel  2018-08-21   91   * decompressor.
f922c4abdf76485 Ard Biesheuvel  2018-08-21   92   */
f922c4abdf76485 Ard Biesheuvel  2018-08-21   93  #define __EXPORT_SYMBOL(sym, sec)
f922c4abdf76485 Ard Biesheuvel  2018-08-21   94  
bbda5ec671d3fe6 Masahiro Yamada 2018-11-30   95  #elif defined(CONFIG_TRIM_UNUSED_KSYMS)
bbda5ec671d3fe6 Masahiro Yamada 2018-11-30   96  
bbda5ec671d3fe6 Masahiro Yamada 2018-11-30   97  #include <generated/autoksyms.h>
c1a95fda2a40ae8 Nicolas Pitre   2016-01-22   98  
c1a95fda2a40ae8 Nicolas Pitre   2016-01-22   99  /*
c1a95fda2a40ae8 Nicolas Pitre   2016-01-22  100   * For fine grained build dependencies, we want to tell the build system
c1a95fda2a40ae8 Nicolas Pitre   2016-01-22  101   * about each possible exported symbol even if they're not actually exported.
bbda5ec671d3fe6 Masahiro Yamada 2018-11-30  102   * We use a symbol pattern __ksym_marker_<symbol> that the build system filters
bbda5ec671d3fe6 Masahiro Yamada 2018-11-30  103   * from the $(NM) output (see scripts/gen_ksymdeps.sh). These symbols are
bbda5ec671d3fe6 Masahiro Yamada 2018-11-30  104   * discarded in the final link stage.
c1a95fda2a40ae8 Nicolas Pitre   2016-01-22  105   */
bbda5ec671d3fe6 Masahiro Yamada 2018-11-30  106  #define __ksym_marker(sym)	\
bbda5ec671d3fe6 Masahiro Yamada 2018-11-30  107  	static int __ksym_marker_##sym[0] __section(".discard.ksym") __used
f235541699bcf14 Nicolas Pitre   2016-01-22  108  
f235541699bcf14 Nicolas Pitre   2016-01-22  109  #define __EXPORT_SYMBOL(sym, sec)				\
bbda5ec671d3fe6 Masahiro Yamada 2018-11-30  110  	__ksym_marker(sym);					\
6023d2369ba7b82 Masahiro Yamada 2016-06-14  111  	__cond_export_sym(sym, sec, __is_defined(__KSYM_##sym))
f235541699bcf14 Nicolas Pitre   2016-01-22  112  #define __cond_export_sym(sym, sec, conf)			\
f235541699bcf14 Nicolas Pitre   2016-01-22  113  	___cond_export_sym(sym, sec, conf)
f235541699bcf14 Nicolas Pitre   2016-01-22  114  #define ___cond_export_sym(sym, sec, enabled)			\
f235541699bcf14 Nicolas Pitre   2016-01-22  115  	__cond_export_sym_##enabled(sym, sec)
f235541699bcf14 Nicolas Pitre   2016-01-22  116  #define __cond_export_sym_1(sym, sec) ___EXPORT_SYMBOL(sym, sec)
f235541699bcf14 Nicolas Pitre   2016-01-22  117  #define __cond_export_sym_0(sym, sec) /* nothing */
f235541699bcf14 Nicolas Pitre   2016-01-22  118  
f235541699bcf14 Nicolas Pitre   2016-01-22  119  #else
f235541699bcf14 Nicolas Pitre   2016-01-22 @120  #define __EXPORT_SYMBOL ___EXPORT_SYMBOL
f235541699bcf14 Nicolas Pitre   2016-01-22  121  #endif
f235541699bcf14 Nicolas Pitre   2016-01-22  122  
f50169324df4ad9 Paul Gortmaker  2011-05-23  123  #define EXPORT_SYMBOL(sym)					\
f50169324df4ad9 Paul Gortmaker  2011-05-23 @124  	__EXPORT_SYMBOL(sym, "")
f50169324df4ad9 Paul Gortmaker  2011-05-23  125  
f50169324df4ad9 Paul Gortmaker  2011-05-23  126  #define EXPORT_SYMBOL_GPL(sym)					\
f50169324df4ad9 Paul Gortmaker  2011-05-23  127  	__EXPORT_SYMBOL(sym, "_gpl")
f50169324df4ad9 Paul Gortmaker  2011-05-23  128  
f50169324df4ad9 Paul Gortmaker  2011-05-23  129  #define EXPORT_SYMBOL_GPL_FUTURE(sym)				\
f50169324df4ad9 Paul Gortmaker  2011-05-23  130  	__EXPORT_SYMBOL(sym, "_gpl_future")
f50169324df4ad9 Paul Gortmaker  2011-05-23  131  
f50169324df4ad9 Paul Gortmaker  2011-05-23  132  #ifdef CONFIG_UNUSED_SYMBOLS
f50169324df4ad9 Paul Gortmaker  2011-05-23  133  #define EXPORT_UNUSED_SYMBOL(sym) __EXPORT_SYMBOL(sym, "_unused")
f50169324df4ad9 Paul Gortmaker  2011-05-23  134  #define EXPORT_UNUSED_SYMBOL_GPL(sym) __EXPORT_SYMBOL(sym, "_unused_gpl")
f50169324df4ad9 Paul Gortmaker  2011-05-23  135  #else
f50169324df4ad9 Paul Gortmaker  2011-05-23  136  #define EXPORT_UNUSED_SYMBOL(sym)
f50169324df4ad9 Paul Gortmaker  2011-05-23  137  #define EXPORT_UNUSED_SYMBOL_GPL(sym)
f50169324df4ad9 Paul Gortmaker  2011-05-23  138  #endif
f50169324df4ad9 Paul Gortmaker  2011-05-23  139  
f50169324df4ad9 Paul Gortmaker  2011-05-23  140  #endif	/* __GENKSYMS__ */
f50169324df4ad9 Paul Gortmaker  2011-05-23  141  
f50169324df4ad9 Paul Gortmaker  2011-05-23  142  #else /* !CONFIG_MODULES... */
f50169324df4ad9 Paul Gortmaker  2011-05-23  143  
f50169324df4ad9 Paul Gortmaker  2011-05-23  144  #define EXPORT_SYMBOL(sym)
f50169324df4ad9 Paul Gortmaker  2011-05-23  145  #define EXPORT_SYMBOL_GPL(sym)
f50169324df4ad9 Paul Gortmaker  2011-05-23  146  #define EXPORT_SYMBOL_GPL_FUTURE(sym)
f50169324df4ad9 Paul Gortmaker  2011-05-23  147  #define EXPORT_UNUSED_SYMBOL(sym)
f50169324df4ad9 Paul Gortmaker  2011-05-23  148  #define EXPORT_UNUSED_SYMBOL_GPL(sym)
f50169324df4ad9 Paul Gortmaker  2011-05-23  149  
f50169324df4ad9 Paul Gortmaker  2011-05-23  150  #endif /* CONFIG_MODULES */
b92021b09df70c1 Rusty Russell   2013-03-15  151  #endif /* !__ASSEMBLY__ */
f50169324df4ad9 Paul Gortmaker  2011-05-23  152  
f50169324df4ad9 Paul Gortmaker  2011-05-23  153  #endif /* _LINUX_EXPORT_H */

:::::: The code at line 81 was first introduced by commit
:::::: f50169324df4ad942e544386d136216c8617636a module.h: split out the EXPORT_SYMBOL into export.h

:::::: TO: Paul Gortmaker <paul.gortmaker@windriver.com>
:::::: CC: Paul Gortmaker <paul.gortmaker@windriver.com>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 28082 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 01/11] ext4: add handling for extended mount options
  2019-07-22  4:00 [PATCH 01/11] ext4: add handling for extended mount options Harshad Shirwadkar
                   ` (9 preceding siblings ...)
  2019-07-22  4:00 ` [PATCH 11/11] ext4: fast-commit recovery " Harshad Shirwadkar
@ 2019-07-22 18:15 ` Andreas Dilger
  2019-07-22 21:02   ` Theodore Y. Ts'o
  10 siblings, 1 reply; 25+ messages in thread
From: Andreas Dilger @ 2019-07-22 18:15 UTC (permalink / raw)
  To: Harshad Shirwadkar; +Cc: Ext4 Developers List

[-- Attachment #1: Type: text/plain, Size: 6859 bytes --]

Unless I missed it, this patch series needs a 00/11 email that describes
*what* "fast commit" is, and why we want it.  This should include some
benchmark results, since (I'd assume) that the "fast" part of the feature
name implies a performance improvement?

Cheers, Andreas

> On Jul 21, 2019, at 10:00 PM, Harshad Shirwadkar <harshadshirwadkar@gmail.com> wrote:
> 
> We are running out of mount option bits. This patch adds handling for
> using s_mount_opt2 and also adds ability to turn on / off the fast
> commit feature. In order to use fast commits, new version e2fsprogs
> needs to set the fast feature commit flag. This also makes sure that
> we have fast commit compatible e2fsprogs before starting to use the
> feature. Mount flag "no_fastcommit", introuced in this patch, can be
> passed to disable the feature at mount time.
> 
> Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
> ---
> fs/ext4/ext4.h       |  4 ++++
> fs/ext4/super.c      | 27 ++++++++++++++++++++++-----
> include/linux/jbd2.h |  5 ++++-
> 3 files changed, 30 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index bf660aa7a9e0..becbda38b7db 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1146,6 +1146,8 @@ struct ext4_inode_info {
> #define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM	0x00000008 /* User explicitly
> 						specified journal checksum */
> 
> +#define EXT4_MOUNT2_JOURNAL_FAST_COMMIT	0x00000010 /* Journal fast commit */
> +
> #define clear_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt &= \
> 						~EXT4_MOUNT_##opt
> #define set_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt |= \
> @@ -1643,6 +1645,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
> #define EXT4_FEATURE_COMPAT_RESIZE_INODE	0x0010
> #define EXT4_FEATURE_COMPAT_DIR_INDEX		0x0020
> #define EXT4_FEATURE_COMPAT_SPARSE_SUPER2	0x0200
> +#define EXT4_FEATURE_COMPAT_FAST_COMMIT		0x0400
> 
> #define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER	0x0001
> #define EXT4_FEATURE_RO_COMPAT_LARGE_FILE	0x0002
> @@ -1743,6 +1746,7 @@ EXT4_FEATURE_COMPAT_FUNCS(xattr,		EXT_ATTR)
> EXT4_FEATURE_COMPAT_FUNCS(resize_inode,		RESIZE_INODE)
> EXT4_FEATURE_COMPAT_FUNCS(dir_index,		DIR_INDEX)
> EXT4_FEATURE_COMPAT_FUNCS(sparse_super2,	SPARSE_SUPER2)
> +EXT4_FEATURE_COMPAT_FUNCS(fast_commit,		FAST_COMMIT)
> 
> EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super,	SPARSE_SUPER)
> EXT4_FEATURE_RO_COMPAT_FUNCS(large_file,	LARGE_FILE)
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 4079605d437a..e376ac040cce 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -1455,6 +1455,7 @@ enum {
> 	Opt_dioread_nolock, Opt_dioread_lock,
> 	Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
> 	Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
> +	Opt_no_fastcommit
> };
> 
> static const match_table_t tokens = {
> @@ -1537,6 +1538,7 @@ static const match_table_t tokens = {
> 	{Opt_init_itable, "init_itable=%u"},
> 	{Opt_init_itable, "init_itable"},
> 	{Opt_noinit_itable, "noinit_itable"},
> +	{Opt_no_fastcommit, "no_fastcommit"},
> 	{Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
> 	{Opt_test_dummy_encryption, "test_dummy_encryption"},
> 	{Opt_nombcache, "nombcache"},
> @@ -1659,6 +1661,7 @@ static int clear_qf_name(struct super_block *sb, int qtype)
> #define MOPT_NO_EXT3	0x0200
> #define MOPT_EXT4_ONLY	(MOPT_NO_EXT2 | MOPT_NO_EXT3)
> #define MOPT_STRING	0x0400
> +#define MOPT_2		0x0800
> 
> static const struct mount_opts {
> 	int	token;
> @@ -1751,6 +1754,8 @@ static const struct mount_opts {
> 	{Opt_max_dir_size_kb, 0, MOPT_GTE0},
> 	{Opt_test_dummy_encryption, 0, MOPT_GTE0},
> 	{Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
> +	{Opt_no_fastcommit, EXT4_MOUNT2_JOURNAL_FAST_COMMIT,
> +	 MOPT_CLEAR | MOPT_2 | MOPT_EXT4_ONLY},
> 	{Opt_err, 0, 0}
> };
> 
> @@ -1858,8 +1863,9 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
> 			set_opt2(sb, EXPLICIT_DELALLOC);
> 		} else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) {
> 			set_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM);
> -		} else
> +		} else if (m->mount_opt) {
> 			return -1;
> +		}
> 	}
> 	if (m->flags & MOPT_CLEAR_ERR)
> 		clear_opt(sb, ERRORS_MASK);
> @@ -2027,10 +2033,17 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
> 			WARN_ON(1);
> 			return -1;
> 		}
> -		if (arg != 0)
> -			sbi->s_mount_opt |= m->mount_opt;
> -		else
> -			sbi->s_mount_opt &= ~m->mount_opt;
> +		if (m->flags & MOPT_2) {
> +			if (arg != 0)
> +				sbi->s_mount_opt2 |= m->mount_opt;
> +			else
> +				sbi->s_mount_opt2 &= ~m->mount_opt;
> +		} else {
> +			if (arg != 0)
> +				sbi->s_mount_opt |= m->mount_opt;
> +			else
> +				sbi->s_mount_opt &= ~m->mount_opt;
> +		}
> 	}
> 	return 1;
> }
> @@ -3733,6 +3746,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
> #ifdef CONFIG_EXT4_FS_POSIX_ACL
> 	set_opt(sb, POSIX_ACL);
> #endif
> +	if (ext4_has_feature_fast_commit(sb))
> +		set_opt2(sb, JOURNAL_FAST_COMMIT);
> +
> 	/* don't forget to enable journal_csum when metadata_csum is enabled. */
> 	if (ext4_has_metadata_csum(sb))
> 		set_opt(sb, JOURNAL_CHECKSUM);
> @@ -4334,6 +4350,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
> 		sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM;
> 		clear_opt(sb, JOURNAL_CHECKSUM);
> 		clear_opt(sb, DATA_FLAGS);
> +		clear_opt2(sb, JOURNAL_FAST_COMMIT);
> 		sbi->s_journal = NULL;
> 		needs_recovery = 0;
> 		goto no_journal;
> diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
> index df03825ad1a1..b7eed49b8ecd 100644
> --- a/include/linux/jbd2.h
> +++ b/include/linux/jbd2.h
> @@ -288,6 +288,7 @@ typedef struct journal_superblock_s
> #define JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT	0x00000004
> #define JBD2_FEATURE_INCOMPAT_CSUM_V2		0x00000008
> #define JBD2_FEATURE_INCOMPAT_CSUM_V3		0x00000010
> +#define JBD2_FEATURE_INCOMPAT_FAST_COMMIT	0x00000020
> 
> /* See "journal feature predicate functions" below */
> 
> @@ -298,7 +299,8 @@ typedef struct journal_superblock_s
> 					JBD2_FEATURE_INCOMPAT_64BIT | \
> 					JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | \
> 					JBD2_FEATURE_INCOMPAT_CSUM_V2 | \
> -					JBD2_FEATURE_INCOMPAT_CSUM_V3)
> +					JBD2_FEATURE_INCOMPAT_CSUM_V3 | \
> +					JBD2_FEATURE_INCOMPAT_FAST_COMMIT)
> 
> #ifdef __KERNEL__
> 
> @@ -1235,6 +1237,7 @@ JBD2_FEATURE_INCOMPAT_FUNCS(64bit,		64BIT)
> JBD2_FEATURE_INCOMPAT_FUNCS(async_commit,	ASYNC_COMMIT)
> JBD2_FEATURE_INCOMPAT_FUNCS(csum2,		CSUM_V2)
> JBD2_FEATURE_INCOMPAT_FUNCS(csum3,		CSUM_V3)
> +JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit,	FAST_COMMIT)
> 
> /*
>  * Journal flag definitions
> --
> 2.22.0.657.g960e92d24f-goog
> 


Cheers, Andreas






[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 873 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 01/11] ext4: add handling for extended mount options
  2019-07-22 18:15 ` [PATCH 01/11] ext4: add handling for extended mount options Andreas Dilger
@ 2019-07-22 21:02   ` Theodore Y. Ts'o
  2019-07-22 21:36     ` harshad shirwadkar
  2019-07-23 21:59     ` Andreas Dilger
  0 siblings, 2 replies; 25+ messages in thread
From: Theodore Y. Ts'o @ 2019-07-22 21:02 UTC (permalink / raw)
  To: Andreas Dilger; +Cc: Harshad Shirwadkar, Ext4 Developers List

On Mon, Jul 22, 2019 at 12:15:11PM -0600, Andreas Dilger wrote:
> Unless I missed it, this patch series needs a 00/11 email that describes
> *what* "fast commit" is, and why we want it.  This should include some
> benchmark results, since (I'd assume) that the "fast" part of the feature
> name implies a performance improvement?

For background, it's a simplified version of the scheme proposed by
Park and Shin, in their paper, "iJournaling: Fine-Grained Journaling
for Improving the Latency of Fsync System Call"[1]

[1] https://www.usenix.org/conference/atc17/technical-sessions/presentation/park

I agree we should have a cover letter for this patch series.  Also, we
should add documentation to Documentation/filesystems/journaling.rst
about this feature; what it does, how it works, its basic on-disk
format changes, etc.

The fs/jbd2 layer isn't as well documented as the fs/ext4 code, and
bringing Documentation/filesystems/journaling.rst to the same level as
Documentation/filesystems/ext4/* isn't a fair/reasonable request.  On
the other hand, documenting what is being added by this patch series
is something that I think we should do.

   	     	    	     	    - Ted

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 05/11] jbd2: fast-commit commit path new APIs
  2019-07-22  4:00 ` [PATCH 05/11] jbd2: fast-commit commit path new APIs Harshad Shirwadkar
  2019-07-22 17:45   ` kbuild test robot
@ 2019-07-22 21:02   ` kbuild test robot
  1 sibling, 0 replies; 25+ messages in thread
From: kbuild test robot @ 2019-07-22 21:02 UTC (permalink / raw)
  To: Harshad Shirwadkar; +Cc: kbuild-all, linux-ext4, Harshad Shirwadkar

[-- Attachment #1: Type: text/plain, Size: 10503 bytes --]

Hi Harshad,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on linus/master]
[cannot apply to v5.3-rc1 next-20190722]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Harshad-Shirwadkar/ext4-add-handling-for-extended-mount-options/20190723-001855
config: mips-allmodconfig (attached as .config)
compiler: mips-linux-gcc (GCC) 7.4.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        GCC_VERSION=7.4.0 make.cross ARCH=mips 

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   In file included from include/linux/linkage.h:7:0,
                    from include/linux/kernel.h:8,
                    from include/linux/list.h:9,
                    from include/linux/module.h:9,
                    from fs/jbd2/journal.c:22:
   include/linux/export.h:81:20: error: redefinition of '__kstrtab_jbd2_complete_transaction'
     static const char __kstrtab_##sym[]    \
                       ^
   include/linux/export.h:120:25: note: in expansion of macro '___EXPORT_SYMBOL'
    #define __EXPORT_SYMBOL ___EXPORT_SYMBOL
                            ^~~~~~~~~~~~~~~~
   include/linux/export.h:124:2: note: in expansion of macro '__EXPORT_SYMBOL'
     __EXPORT_SYMBOL(sym, "")
     ^~~~~~~~~~~~~~~
   fs/jbd2/journal.c:839:1: note: in expansion of macro 'EXPORT_SYMBOL'
    EXPORT_SYMBOL(jbd2_complete_transaction);
    ^~~~~~~~~~~~~
   include/linux/export.h:81:20: note: previous definition of '__kstrtab_jbd2_complete_transaction' was here
     static const char __kstrtab_##sym[]    \
                       ^
   include/linux/export.h:120:25: note: in expansion of macro '___EXPORT_SYMBOL'
    #define __EXPORT_SYMBOL ___EXPORT_SYMBOL
                            ^~~~~~~~~~~~~~~~
   include/linux/export.h:124:2: note: in expansion of macro '__EXPORT_SYMBOL'
     __EXPORT_SYMBOL(sym, "")
     ^~~~~~~~~~~~~~~
   fs/jbd2/journal.c:812:1: note: in expansion of macro 'EXPORT_SYMBOL'
    EXPORT_SYMBOL(jbd2_complete_transaction);
    ^~~~~~~~~~~~~
   include/linux/export.h:67:36: error: redefinition of '__ksymtab_jbd2_complete_transaction'
     static const struct kernel_symbol __ksymtab_##sym  \
                                       ^
>> include/linux/export.h:84:2: note: in expansion of macro '__KSYMTAB_ENTRY'
     __KSYMTAB_ENTRY(sym, sec)
     ^~~~~~~~~~~~~~~
   include/linux/export.h:120:25: note: in expansion of macro '___EXPORT_SYMBOL'
    #define __EXPORT_SYMBOL ___EXPORT_SYMBOL
                            ^~~~~~~~~~~~~~~~
   include/linux/export.h:124:2: note: in expansion of macro '__EXPORT_SYMBOL'
     __EXPORT_SYMBOL(sym, "")
     ^~~~~~~~~~~~~~~
   fs/jbd2/journal.c:839:1: note: in expansion of macro 'EXPORT_SYMBOL'
    EXPORT_SYMBOL(jbd2_complete_transaction);
    ^~~~~~~~~~~~~
   include/linux/export.h:67:36: note: previous definition of '__ksymtab_jbd2_complete_transaction' was here
     static const struct kernel_symbol __ksymtab_##sym  \
                                       ^
>> include/linux/export.h:84:2: note: in expansion of macro '__KSYMTAB_ENTRY'
     __KSYMTAB_ENTRY(sym, sec)
     ^~~~~~~~~~~~~~~
   include/linux/export.h:120:25: note: in expansion of macro '___EXPORT_SYMBOL'
    #define __EXPORT_SYMBOL ___EXPORT_SYMBOL
                            ^~~~~~~~~~~~~~~~
   include/linux/export.h:124:2: note: in expansion of macro '__EXPORT_SYMBOL'
     __EXPORT_SYMBOL(sym, "")
     ^~~~~~~~~~~~~~~
   fs/jbd2/journal.c:812:1: note: in expansion of macro 'EXPORT_SYMBOL'
    EXPORT_SYMBOL(jbd2_complete_transaction);
    ^~~~~~~~~~~~~

vim +/__KSYMTAB_ENTRY +84 include/linux/export.h

7290d58095712a8 Ard Biesheuvel  2018-08-21   76  
f50169324df4ad9 Paul Gortmaker  2011-05-23   77  /* For every exported symbol, place a struct in the __ksymtab section */
f235541699bcf14 Nicolas Pitre   2016-01-22   78  #define ___EXPORT_SYMBOL(sym, sec)					\
f50169324df4ad9 Paul Gortmaker  2011-05-23   79  	extern typeof(sym) sym;						\
f50169324df4ad9 Paul Gortmaker  2011-05-23   80  	__CRC_SYMBOL(sym, sec)						\
f50169324df4ad9 Paul Gortmaker  2011-05-23   81  	static const char __kstrtab_##sym[]				\
7290d58095712a8 Ard Biesheuvel  2018-08-21   82  	__attribute__((section("__ksymtab_strings"), used, aligned(1)))	\
94e58e0ac31284f Masahiro Yamada 2018-05-09   83  	= #sym;								\
7290d58095712a8 Ard Biesheuvel  2018-08-21  @84  	__KSYMTAB_ENTRY(sym, sec)
f50169324df4ad9 Paul Gortmaker  2011-05-23   85  
f922c4abdf76485 Ard Biesheuvel  2018-08-21   86  #if defined(__DISABLE_EXPORTS)
f922c4abdf76485 Ard Biesheuvel  2018-08-21   87  
f922c4abdf76485 Ard Biesheuvel  2018-08-21   88  /*
f922c4abdf76485 Ard Biesheuvel  2018-08-21   89   * Allow symbol exports to be disabled completely so that C code may
f922c4abdf76485 Ard Biesheuvel  2018-08-21   90   * be reused in other execution contexts such as the UEFI stub or the
f922c4abdf76485 Ard Biesheuvel  2018-08-21   91   * decompressor.
f922c4abdf76485 Ard Biesheuvel  2018-08-21   92   */
f922c4abdf76485 Ard Biesheuvel  2018-08-21   93  #define __EXPORT_SYMBOL(sym, sec)
f922c4abdf76485 Ard Biesheuvel  2018-08-21   94  
bbda5ec671d3fe6 Masahiro Yamada 2018-11-30   95  #elif defined(CONFIG_TRIM_UNUSED_KSYMS)
bbda5ec671d3fe6 Masahiro Yamada 2018-11-30   96  
bbda5ec671d3fe6 Masahiro Yamada 2018-11-30   97  #include <generated/autoksyms.h>
c1a95fda2a40ae8 Nicolas Pitre   2016-01-22   98  
c1a95fda2a40ae8 Nicolas Pitre   2016-01-22   99  /*
c1a95fda2a40ae8 Nicolas Pitre   2016-01-22  100   * For fine grained build dependencies, we want to tell the build system
c1a95fda2a40ae8 Nicolas Pitre   2016-01-22  101   * about each possible exported symbol even if they're not actually exported.
bbda5ec671d3fe6 Masahiro Yamada 2018-11-30  102   * We use a symbol pattern __ksym_marker_<symbol> that the build system filters
bbda5ec671d3fe6 Masahiro Yamada 2018-11-30  103   * from the $(NM) output (see scripts/gen_ksymdeps.sh). These symbols are
bbda5ec671d3fe6 Masahiro Yamada 2018-11-30  104   * discarded in the final link stage.
c1a95fda2a40ae8 Nicolas Pitre   2016-01-22  105   */
bbda5ec671d3fe6 Masahiro Yamada 2018-11-30  106  #define __ksym_marker(sym)	\
bbda5ec671d3fe6 Masahiro Yamada 2018-11-30  107  	static int __ksym_marker_##sym[0] __section(".discard.ksym") __used
f235541699bcf14 Nicolas Pitre   2016-01-22  108  
f235541699bcf14 Nicolas Pitre   2016-01-22  109  #define __EXPORT_SYMBOL(sym, sec)				\
bbda5ec671d3fe6 Masahiro Yamada 2018-11-30  110  	__ksym_marker(sym);					\
6023d2369ba7b82 Masahiro Yamada 2016-06-14  111  	__cond_export_sym(sym, sec, __is_defined(__KSYM_##sym))
f235541699bcf14 Nicolas Pitre   2016-01-22  112  #define __cond_export_sym(sym, sec, conf)			\
f235541699bcf14 Nicolas Pitre   2016-01-22  113  	___cond_export_sym(sym, sec, conf)
f235541699bcf14 Nicolas Pitre   2016-01-22  114  #define ___cond_export_sym(sym, sec, enabled)			\
f235541699bcf14 Nicolas Pitre   2016-01-22  115  	__cond_export_sym_##enabled(sym, sec)
f235541699bcf14 Nicolas Pitre   2016-01-22  116  #define __cond_export_sym_1(sym, sec) ___EXPORT_SYMBOL(sym, sec)
f235541699bcf14 Nicolas Pitre   2016-01-22  117  #define __cond_export_sym_0(sym, sec) /* nothing */
f235541699bcf14 Nicolas Pitre   2016-01-22  118  
f235541699bcf14 Nicolas Pitre   2016-01-22  119  #else
f235541699bcf14 Nicolas Pitre   2016-01-22  120  #define __EXPORT_SYMBOL ___EXPORT_SYMBOL
f235541699bcf14 Nicolas Pitre   2016-01-22  121  #endif
f235541699bcf14 Nicolas Pitre   2016-01-22  122  
f50169324df4ad9 Paul Gortmaker  2011-05-23  123  #define EXPORT_SYMBOL(sym)					\
f50169324df4ad9 Paul Gortmaker  2011-05-23 @124  	__EXPORT_SYMBOL(sym, "")
f50169324df4ad9 Paul Gortmaker  2011-05-23  125  
f50169324df4ad9 Paul Gortmaker  2011-05-23  126  #define EXPORT_SYMBOL_GPL(sym)					\
f50169324df4ad9 Paul Gortmaker  2011-05-23  127  	__EXPORT_SYMBOL(sym, "_gpl")
f50169324df4ad9 Paul Gortmaker  2011-05-23  128  
f50169324df4ad9 Paul Gortmaker  2011-05-23  129  #define EXPORT_SYMBOL_GPL_FUTURE(sym)				\
f50169324df4ad9 Paul Gortmaker  2011-05-23  130  	__EXPORT_SYMBOL(sym, "_gpl_future")
f50169324df4ad9 Paul Gortmaker  2011-05-23  131  
f50169324df4ad9 Paul Gortmaker  2011-05-23  132  #ifdef CONFIG_UNUSED_SYMBOLS
f50169324df4ad9 Paul Gortmaker  2011-05-23  133  #define EXPORT_UNUSED_SYMBOL(sym) __EXPORT_SYMBOL(sym, "_unused")
f50169324df4ad9 Paul Gortmaker  2011-05-23  134  #define EXPORT_UNUSED_SYMBOL_GPL(sym) __EXPORT_SYMBOL(sym, "_unused_gpl")
f50169324df4ad9 Paul Gortmaker  2011-05-23  135  #else
f50169324df4ad9 Paul Gortmaker  2011-05-23  136  #define EXPORT_UNUSED_SYMBOL(sym)
f50169324df4ad9 Paul Gortmaker  2011-05-23  137  #define EXPORT_UNUSED_SYMBOL_GPL(sym)
f50169324df4ad9 Paul Gortmaker  2011-05-23  138  #endif
f50169324df4ad9 Paul Gortmaker  2011-05-23  139  
f50169324df4ad9 Paul Gortmaker  2011-05-23  140  #endif	/* __GENKSYMS__ */
f50169324df4ad9 Paul Gortmaker  2011-05-23  141  
f50169324df4ad9 Paul Gortmaker  2011-05-23  142  #else /* !CONFIG_MODULES... */
f50169324df4ad9 Paul Gortmaker  2011-05-23  143  
f50169324df4ad9 Paul Gortmaker  2011-05-23  144  #define EXPORT_SYMBOL(sym)
f50169324df4ad9 Paul Gortmaker  2011-05-23  145  #define EXPORT_SYMBOL_GPL(sym)
f50169324df4ad9 Paul Gortmaker  2011-05-23  146  #define EXPORT_SYMBOL_GPL_FUTURE(sym)
f50169324df4ad9 Paul Gortmaker  2011-05-23  147  #define EXPORT_UNUSED_SYMBOL(sym)
f50169324df4ad9 Paul Gortmaker  2011-05-23  148  #define EXPORT_UNUSED_SYMBOL_GPL(sym)
f50169324df4ad9 Paul Gortmaker  2011-05-23  149  
f50169324df4ad9 Paul Gortmaker  2011-05-23  150  #endif /* CONFIG_MODULES */
b92021b09df70c1 Rusty Russell   2013-03-15  151  #endif /* !__ASSEMBLY__ */
f50169324df4ad9 Paul Gortmaker  2011-05-23  152  
f50169324df4ad9 Paul Gortmaker  2011-05-23  153  #endif /* _LINUX_EXPORT_H */

:::::: The code at line 84 was first introduced by commit
:::::: 7290d58095712a89f845e1bca05334796dd49ed2 module: use relative references for __ksymtab entries

:::::: TO: Ard Biesheuvel <ard.biesheuvel@linaro.org>
:::::: CC: Linus Torvalds <torvalds@linux-foundation.org>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 61478 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 11/11] ext4: fast-commit recovery path changes
  2019-07-22  4:00 ` [PATCH 11/11] ext4: fast-commit recovery " Harshad Shirwadkar
@ 2019-07-22 21:34   ` kbuild test robot
  0 siblings, 0 replies; 25+ messages in thread
From: kbuild test robot @ 2019-07-22 21:34 UTC (permalink / raw)
  To: Harshad Shirwadkar; +Cc: kbuild-all, linux-ext4, Harshad Shirwadkar

[-- Attachment #1: Type: text/plain, Size: 2071 bytes --]

Hi Harshad,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on linus/master]
[cannot apply to v5.3-rc1 next-20190722]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Harshad-Shirwadkar/ext4-add-handling-for-extended-mount-options/20190723-001855
config: mips-allmodconfig (attached as .config)
compiler: mips-linux-gcc (GCC) 7.4.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        GCC_VERSION=7.4.0 make.cross ARCH=mips 

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   In file included from include/linux/byteorder/big_endian.h:5:0,
                    from arch/mips/include/uapi/asm/byteorder.h:13,
                    from arch/mips/include/asm/bitops.h:19,
                    from include/linux/bitops.h:19,
                    from include/linux/kernel.h:12,
                    from include/linux/list.h:9,
                    from include/linux/module.h:9,
                    from fs/ext4/super.c:20:
   fs/ext4/super.c: In function 'ext4_fc_add_block':
>> include/uapi/linux/byteorder/big_endian.h:33:26: warning: large integer implicitly truncated to unsigned type [-Woverflow]
    #define __cpu_to_le32(x) ((__force __le32)__swab32((x)))
                             ^
   include/linux/byteorder/generic.h:88:21: note: in expansion of macro '__cpu_to_le32'
    #define cpu_to_le32 __cpu_to_le32
                        ^~~~~~~~~~~~~
>> fs/ext4/super.c:430:14: note: in expansion of macro 'cpu_to_le32'
     ex.ee_len = cpu_to_le32(0x1);
                 ^~~~~~~~~~~

vim +/cpu_to_le32 +430 fs/ext4/super.c

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 61478 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 01/11] ext4: add handling for extended mount options
  2019-07-22 21:02   ` Theodore Y. Ts'o
@ 2019-07-22 21:36     ` harshad shirwadkar
  2019-07-23 21:59     ` Andreas Dilger
  1 sibling, 0 replies; 25+ messages in thread
From: harshad shirwadkar @ 2019-07-22 21:36 UTC (permalink / raw)
  To: Theodore Y. Ts'o; +Cc: Andreas Dilger, Ext4 Developers List

Thanks Andreas. Thanks Ted for sharing the original paper link. I'll
submit a patch 00/11 with proposed documentation and cover letter
describing the feature and results from benchmarks.


On Mon, Jul 22, 2019 at 2:02 PM Theodore Y. Ts'o <tytso@mit.edu> wrote:
>
> On Mon, Jul 22, 2019 at 12:15:11PM -0600, Andreas Dilger wrote:
> > Unless I missed it, this patch series needs a 00/11 email that describes
> > *what* "fast commit" is, and why we want it.  This should include some
> > benchmark results, since (I'd assume) that the "fast" part of the feature
> > name implies a performance improvement?
>
> For background, it's a simplified version of the scheme proposed by
> Park and Shin, in their paper, "iJournaling: Fine-Grained Journaling
> for Improving the Latency of Fsync System Call"[1]
>
> [1] https://www.usenix.org/conference/atc17/technical-sessions/presentation/park
>
> I agree we should have a cover letter for this patch series.  Also, we
> should add documentation to Documentation/filesystems/journaling.rst
> about this feature; what it does, how it works, its basic on-disk
> format changes, etc.
>
> The fs/jbd2 layer isn't as well documented as the fs/ext4 code, and
> bringing Documentation/filesystems/journaling.rst to the same level as
> Documentation/filesystems/ext4/* isn't a fair/reasonable request.  On
> the other hand, documenting what is being added by this patch series
> is something that I think we should do.
>
>                                     - Ted

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 01/11] ext4: add handling for extended mount options
  2019-07-22 21:02   ` Theodore Y. Ts'o
  2019-07-22 21:36     ` harshad shirwadkar
@ 2019-07-23 21:59     ` Andreas Dilger
  2019-07-24  6:03       ` harshad shirwadkar
  1 sibling, 1 reply; 25+ messages in thread
From: Andreas Dilger @ 2019-07-23 21:59 UTC (permalink / raw)
  To: Theodore Y. Ts'o; +Cc: Harshad Shirwadkar, Ext4 Developers List

[-- Attachment #1: Type: text/plain, Size: 2149 bytes --]

On Jul 22, 2019, at 3:02 PM, Theodore Y. Ts'o <tytso@mit.edu> wrote:
> 
> On Mon, Jul 22, 2019 at 12:15:11PM -0600, Andreas Dilger wrote:
>> Unless I missed it, this patch series needs a 00/11 email that describes
>> *what* "fast commit" is, and why we want it.  This should include some
>> benchmark results, since (I'd assume) that the "fast" part of the feature
>> name implies a performance improvement?
> 
> For background, it's a simplified version of the scheme proposed by
> Park and Shin, in their paper, "iJournaling: Fine-Grained Journaling
> for Improving the Latency of Fsync System Call"[1]
> 
> [1] https://www.usenix.org/conference/atc17/technical-sessions/presentation/park
> 
> I agree we should have a cover letter for this patch series.  Also, we
> should add documentation to Documentation/filesystems/journaling.rst
> about this feature; what it does, how it works, its basic on-disk
> format changes, etc.

Thanks for the link, I hadn't read that paper previously.  From reading the
paper, it seems there are some things that should be addressed before the
patch is committed to the tree in order to maintain proper disk format
compatibility:
- the ijournal header shows a 256-byte inode.  In Lustre today (and also
  Samba and other xattr-intensive workloads) 512- or 1024-byte inodes are used
  in order to store more xattrs within the inode, so the size of the inode
  data in the ijournal header needs to match the actual inode size of the
  filesystem and not be a fixed size.  What if the inode size == blocksize?
- the ijournal header also shows a 4-byte inode number.  It would be prudent
  to reserve space for 64-bit inode numbers, or at least have some mechanism
  (flag) to indicate that a 64-bit inode is stored instead of a 32-bit inode.
- if there are many cores in a system, say 96, how much space will be used
  from the journal file by the per-core ijournal?
- what happens if multiple threads are writing to the same file with ijournal
  and per-core ijournal areas?  Will the same inode information be recorded
  in multiple ijournal areas?

Cheers, Andreas






[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 873 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 01/11] ext4: add handling for extended mount options
  2019-07-23 21:59     ` Andreas Dilger
@ 2019-07-24  6:03       ` harshad shirwadkar
  2019-07-24  6:12         ` Darrick J. Wong
  0 siblings, 1 reply; 25+ messages in thread
From: harshad shirwadkar @ 2019-07-24  6:03 UTC (permalink / raw)
  To: Andreas Dilger; +Cc: Theodore Y. Ts'o, Ext4 Developers List

Before I respond to your questions, I would like to explain how fast
commits differ from ijournal in a few key aspects (I will make sure to
explain it in detail in patch 00/11 and documentation):

- Instead of storing extent blocks in a fast commit block, we only
store extents that were modified in a particular fast commit
transaction in tag-length-value format.

- Whenever the fast commit information (inode structure + changed
extents in TLV format) exceeds one block, we fall back to full commit.
Thus at this point, the number of blocks we write per fast commit
transaction is either the total number of files changed (if fast
commit was successfully performed) or the number of blocks that would
be written by a full commit transaction.

- To reduce complexity, there is no support for per-core fast commit areas.

Current design of fast commits is such that we try to perform fast
commits whenever possible but either if it's impossible to record file
system changes by fast commits or if we haven't yet added support for
dealing with a particular type of file system change, we fall back to
full commits. Whenever we later add more features to fast commits, we
probably would need more on-disk format changes for the fast commit
blocks and that would mean we burn feature flags. So, my guess is that
we would need to make a few judgement calls on whether we want to
exclude a few fast commit features, keep the patch series simple and
potentially burn feature flags later OR we save feature flags by
implementing those fast commit features.

On Tue, Jul 23, 2019 at 2:59 PM Andreas Dilger <adilger@dilger.ca> wrote:
>
> On Jul 22, 2019, at 3:02 PM, Theodore Y. Ts'o <tytso@mit.edu> wrote:
> >
> > On Mon, Jul 22, 2019 at 12:15:11PM -0600, Andreas Dilger wrote:
> >> Unless I missed it, this patch series needs a 00/11 email that describes
> >> *what* "fast commit" is, and why we want it.  This should include some
> >> benchmark results, since (I'd assume) that the "fast" part of the feature
> >> name implies a performance improvement?
> >
> > For background, it's a simplified version of the scheme proposed by
> > Park and Shin, in their paper, "iJournaling: Fine-Grained Journaling
> > for Improving the Latency of Fsync System Call"[1]
> >
> > [1] https://www.usenix.org/conference/atc17/technical-sessions/presentation/park
> >
> > I agree we should have a cover letter for this patch series.  Also, we
> > should add documentation to Documentation/filesystems/journaling.rst
> > about this feature; what it does, how it works, its basic on-disk
> > format changes, etc.
>
> Thanks for the link, I hadn't read that paper previously.  From reading the
> paper, it seems there are some things that should be addressed before the
> patch is committed to the tree in order to maintain proper disk format
> compatibility:
> - the ijournal header shows a 256-byte inode.  In Lustre today (and also
>   Samba and other xattr-intensive workloads) 512- or 1024-byte inodes are used
>   in order to store more xattrs within the inode, so the size of the inode
>   data in the ijournal header needs to match the actual inode size of the
>   filesystem and not be a fixed size.  What if the inode size == blocksize?

Okay, I agree. This is one of such questions where we need to decide
whether to exclude this fast commit feature request for now or not. I
think whether or not we actually support 512 or 1024 byte inodes in
this patch series, we definitely shouldn't assume in the fast commit
header that inodes are of a fixed size. I will fix it. Supporting
bigger inodes doesn't sound like it would result in big change in the
patch series. But I would like to know whether you think it's okay to
wait or not.

> - the ijournal header also shows a 4-byte inode number.  It would be prudent
>   to reserve space for 64-bit inode numbers, or at least have some mechanism
>   (flag) to indicate that a 64-bit inode is stored instead of a 32-bit inode.

Noted, will change that.

> - if there are many cores in a system, say 96, how much space will be used
>   from the journal file by the per-core ijournal?
> - what happens if multiple threads are writing to the same file with ijournal
>   and per-core ijournal areas?  Will the same inode information be recorded
>   in multiple ijournal areas?

As mentioned above, at least for now we keep it simple by not having
per-core fast commit areas.

Thanks,
Harshad

>
> Cheers, Andreas
>
>
>
>
>

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 01/11] ext4: add handling for extended mount options
  2019-07-24  6:03       ` harshad shirwadkar
@ 2019-07-24  6:12         ` Darrick J. Wong
  2019-07-24 16:07           ` Theodore Y. Ts'o
  0 siblings, 1 reply; 25+ messages in thread
From: Darrick J. Wong @ 2019-07-24  6:12 UTC (permalink / raw)
  To: harshad shirwadkar
  Cc: Andreas Dilger, Theodore Y. Ts'o, Ext4 Developers List

On Tue, Jul 23, 2019 at 11:03:54PM -0700, harshad shirwadkar wrote:
> Before I respond to your questions, I would like to explain how fast
> commits differ from ijournal in a few key aspects (I will make sure to
> explain it in detail in patch 00/11 and documentation):

Please do; I hadn't realized there were also journal ondisk format
changes, and these must be recorded in the ext4 disk format
documentation.

> - Instead of storing extent blocks in a fast commit block, we only
> store extents that were modified in a particular fast commit
> transaction in tag-length-value format.
> 
> - Whenever the fast commit information (inode structure + changed
> extents in TLV format) exceeds one block, we fall back to full commit.
> Thus at this point, the number of blocks we write per fast commit
> transaction is either the total number of files changed (if fast
> commit was successfully performed) or the number of blocks that would
> be written by a full commit transaction.
> 
> - To reduce complexity, there is no support for per-core fast commit areas.
> 
> Current design of fast commits is such that we try to perform fast
> commits whenever possible but either if it's impossible to record file
> system changes by fast commits or if we haven't yet added support for
> dealing with a particular type of file system change, we fall back to
> full commits. Whenever we later add more features to fast commits, we
> probably would need more on-disk format changes for the fast commit
> blocks and that would mean we burn feature flags. So, my guess is that
> we would need to make a few judgement calls on whether we want to
> exclude a few fast commit features, keep the patch series simple and
> potentially burn feature flags later OR we save feature flags by
> implementing those fast commit features.

Every feature flag you add doubles the size of the testing matrix.
If I were you I'd only want to test the (fastcommit) and (!fastcommit)
scenarios.

--D

> On Tue, Jul 23, 2019 at 2:59 PM Andreas Dilger <adilger@dilger.ca> wrote:
> >
> > On Jul 22, 2019, at 3:02 PM, Theodore Y. Ts'o <tytso@mit.edu> wrote:
> > >
> > > On Mon, Jul 22, 2019 at 12:15:11PM -0600, Andreas Dilger wrote:
> > >> Unless I missed it, this patch series needs a 00/11 email that describes
> > >> *what* "fast commit" is, and why we want it.  This should include some
> > >> benchmark results, since (I'd assume) that the "fast" part of the feature
> > >> name implies a performance improvement?
> > >
> > > For background, it's a simplified version of the scheme proposed by
> > > Park and Shin, in their paper, "iJournaling: Fine-Grained Journaling
> > > for Improving the Latency of Fsync System Call"[1]
> > >
> > > [1] https://www.usenix.org/conference/atc17/technical-sessions/presentation/park
> > >
> > > I agree we should have a cover letter for this patch series.  Also, we
> > > should add documentation to Documentation/filesystems/journaling.rst
> > > about this feature; what it does, how it works, its basic on-disk
> > > format changes, etc.
> >
> > Thanks for the link, I hadn't read that paper previously.  From reading the
> > paper, it seems there are some things that should be addressed before the
> > patch is committed to the tree in order to maintain proper disk format
> > compatibility:
> > - the ijournal header shows a 256-byte inode.  In Lustre today (and also
> >   Samba and other xattr-intensive workloads) 512- or 1024-byte inodes are used
> >   in order to store more xattrs within the inode, so the size of the inode
> >   data in the ijournal header needs to match the actual inode size of the
> >   filesystem and not be a fixed size.  What if the inode size == blocksize?
> 
> Okay, I agree. This is one of such questions where we need to decide
> whether to exclude this fast commit feature request for now or not. I
> think whether or not we actually support 512 or 1024 byte inodes in
> this patch series, we definitely shouldn't assume in the fast commit
> header that inodes are of a fixed size. I will fix it. Supporting
> bigger inodes doesn't sound like it would result in big change in the
> patch series. But I would like to know whether you think it's okay to
> wait or not.
> 
> > - the ijournal header also shows a 4-byte inode number.  It would be prudent
> >   to reserve space for 64-bit inode numbers, or at least have some mechanism
> >   (flag) to indicate that a 64-bit inode is stored instead of a 32-bit inode.
> 
> Noted, will change that.
> 
> > - if there are many cores in a system, say 96, how much space will be used
> >   from the journal file by the per-core ijournal?
> > - what happens if multiple threads are writing to the same file with ijournal
> >   and per-core ijournal areas?  Will the same inode information be recorded
> >   in multiple ijournal areas?
> 
> As mentioned above, at least for now we keep it simple by not having
> per-core fast commit areas.
> 
> Thanks,
> Harshad
> 
> >
> > Cheers, Andreas
> >
> >
> >
> >
> >

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 01/11] ext4: add handling for extended mount options
  2019-07-24  6:12         ` Darrick J. Wong
@ 2019-07-24 16:07           ` Theodore Y. Ts'o
  2019-07-24 16:56             ` Darrick J. Wong
  0 siblings, 1 reply; 25+ messages in thread
From: Theodore Y. Ts'o @ 2019-07-24 16:07 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: harshad shirwadkar, Andreas Dilger, Ext4 Developers List

On Tue, Jul 23, 2019 at 11:12:31PM -0700, Darrick J. Wong wrote:
> On Tue, Jul 23, 2019 at 11:03:54PM -0700, harshad shirwadkar wrote:
> > Before I respond to your questions, I would like to explain how fast
> > commits differ from ijournal in a few key aspects (I will make sure to
> > explain it in detail in patch 00/11 and documentation):
> 
> Please do; I hadn't realized there were also journal ondisk format
> changes, and these must be recorded in the ext4 disk format
> documentation.

Actually, the changes are almost entirely in the on-disk journal
layer.  The addition of the feature flag is really a UI issue, and
worth some discussion.

One of the goals was to make it easy to allow kernels which didn't
understand fast commit to be able to mount a file system which had
been cleanly unmounted --- but of course, if the file system needs
recovery, and fast commits are in the journal, we can't allow a fast
commit oblivious kernel (or e2fsck) from trying to replay the journal.

One way to do this would be with a mount option, but that's a bit ugly
--- and a mount option in /etc/fstab will cause a failure if a kernel
which doesn't understand that mount option is booted.

So the basic idea is to have a compat feature which means, "please use
fast commits if present", and then when the file system is mounted on
a fast-commit capable kernel, the incompat feature meaning "we're
using the fast commit feature".  (This is same design pattern used
with the HAS_JOURNAL compat feature and the NEEDS_RECOVERY incompat
feature.)

The next question is whether to use the compat and incompat feature
flags in the jbd2 superblock, or ext4-specific compat flags.  For the
incompat flag, there's no reason not to use the journal incompat flag.
But for the compat flag, we have better infrastructure for setting and
clearing ext4-level compat feature flags.  Aside from that, though,
there's no reason why we couldn't use the s_feature_compat field in
the journal superblock --- in which case, *all* of the on-diks format
changes would purely be on the jbd2 side of the ledger.

> Every feature flag you add doubles the size of the testing matrix.
> If I were you I'd only want to test the (fastcommit) and (!fastcommit)
> scenarios.

Sure, absolutely.  On the other hand, as the saying goes, "there comes
a time in any project where it's time to shoot the engineers and put
the d*mned thing into production".  One of the reasons why we're super
interested in this feature is to claw back the performance hit of
fde872682e17 ("ext4: force inode writes when nfsd calls
commit_metadata").  I fully expect that this feature is going to make
big difference to a number of customer workloads, so there is some
urgency to getting this feature into production.

On the flip side, if we leave some performance wins on the table, it's
absolutely true that it makes it harder to add those optimizations
later, and it increases the testing load, not to mention the forwards
and backwards compatibility issues.  It's an engineering trade-off.

    	      		    	     	     - Ted

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 01/11] ext4: add handling for extended mount options
  2019-07-24 16:07           ` Theodore Y. Ts'o
@ 2019-07-24 16:56             ` Darrick J. Wong
  2019-07-24 18:14               ` harshad shirwadkar
  0 siblings, 1 reply; 25+ messages in thread
From: Darrick J. Wong @ 2019-07-24 16:56 UTC (permalink / raw)
  To: Theodore Y. Ts'o
  Cc: harshad shirwadkar, Andreas Dilger, Ext4 Developers List

On Wed, Jul 24, 2019 at 12:07:49PM -0400, Theodore Y. Ts'o wrote:
> On Tue, Jul 23, 2019 at 11:12:31PM -0700, Darrick J. Wong wrote:
> > On Tue, Jul 23, 2019 at 11:03:54PM -0700, harshad shirwadkar wrote:
> > > Before I respond to your questions, I would like to explain how fast
> > > commits differ from ijournal in a few key aspects (I will make sure to
> > > explain it in detail in patch 00/11 and documentation):
> > 
> > Please do; I hadn't realized there were also journal ondisk format
> > changes, and these must be recorded in the ext4 disk format
> > documentation.
> 
> Actually, the changes are almost entirely in the on-disk journal
> layer.

I know.

Hmm, just as a reminder -- the ext4 disk format documentation
includes the jbd2 disk format documentation.

> The addition of the feature flag is really a UI issue, and
> worth some discussion.
> 
> One of the goals was to make it easy to allow kernels which didn't
> understand fast commit to be able to mount a file system which had
> been cleanly unmounted --- but of course, if the file system needs
> recovery, and fast commits are in the journal, we can't allow a fast
> commit oblivious kernel (or e2fsck) from trying to replay the journal.

BTW, are there patches to fix e2fsck to replay the factcommit journal?

> One way to do this would be with a mount option, but that's a bit ugly
> --- and a mount option in /etc/fstab will cause a failure if a kernel
> which doesn't understand that mount option is booted.
> 
> So the basic idea is to have a compat feature which means, "please use
> fast commits if present", and then when the file system is mounted on
> a fast-commit capable kernel, the incompat feature meaning "we're
> using the fast commit feature".  (This is same design pattern used
> with the HAS_JOURNAL compat feature and the NEEDS_RECOVERY incompat
> feature.)
> 
> The next question is whether to use the compat and incompat feature
> flags in the jbd2 superblock, or ext4-specific compat flags.  For the
> incompat flag, there's no reason not to use the journal incompat flag.
> But for the compat flag, we have better infrastructure for setting and
> clearing ext4-level compat feature flags.  Aside from that, though,
> there's no reason why we couldn't use the s_feature_compat field in
> the journal superblock --- in which case, *all* of the on-diks format
> changes would purely be on the jbd2 side of the ledger.

Probably better to use the journal compat flag so that the other jbd2
users can take advantage of it ... on the other hand, the only other
user (AFAIK) is ocfs2 and HAH.

> > Every feature flag you add doubles the size of the testing matrix.
> > If I were you I'd only want to test the (fastcommit) and (!fastcommit)
> > scenarios.
> 
> Sure, absolutely.  On the other hand, as the saying goes, "there comes
> a time in any project where it's time to shoot the engineers and put
> the d*mned thing into production".  One of the reasons why we're super
> interested in this feature is to claw back the performance hit of
> fde872682e17 ("ext4: force inode writes when nfsd calls
> commit_metadata").  I fully expect that this feature is going to make
> big difference to a number of customer workloads, so there is some
> urgency to getting this feature into production.
> 
> On the flip side, if we leave some performance wins on the table, it's
> absolutely true that it makes it harder to add those optimizations
> later, and it increases the testing load, not to mention the forwards
> and backwards compatibility issues.  It's an engineering trade-off.

<nod> I just remember hearing you complain about the size of the ext4
testing matrix in the past and figured you would't go for adding
fastcommit in small pieces each with new feature bits.

(I guess you could have a fastcommit_version field that increments every
time you add a new fastcommit journal item to constrain the combinatoric
explosion...)

--D

> 
>     	      		    	     	     - Ted

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 01/11] ext4: add handling for extended mount options
  2019-07-24 16:56             ` Darrick J. Wong
@ 2019-07-24 18:14               ` harshad shirwadkar
  2019-07-25  3:46                 ` Theodore Y. Ts'o
  2019-07-25 20:03                 ` Andreas Dilger
  0 siblings, 2 replies; 25+ messages in thread
From: harshad shirwadkar @ 2019-07-24 18:14 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Theodore Y. Ts'o, Andreas Dilger, Ext4 Developers List

On Wed, Jul 24, 2019 at 9:56 AM Darrick J. Wong <darrick.wong@oracle.com> wrote:
>
> On Wed, Jul 24, 2019 at 12:07:49PM -0400, Theodore Y. Ts'o wrote:
> > On Tue, Jul 23, 2019 at 11:12:31PM -0700, Darrick J. Wong wrote:
> > > On Tue, Jul 23, 2019 at 11:03:54PM -0700, harshad shirwadkar wrote:
> > > > Before I respond to your questions, I would like to explain how fast
> > > > commits differ from ijournal in a few key aspects (I will make sure to
> > > > explain it in detail in patch 00/11 and documentation):
> > >
> > > Please do; I hadn't realized there were also journal ondisk format
> > > changes, and these must be recorded in the ext4 disk format
> > > documentation.
> >
> > Actually, the changes are almost entirely in the on-disk journal
> > layer.
>
> I know.
>
> Hmm, just as a reminder -- the ext4 disk format documentation
> includes the jbd2 disk format documentation.
>
> > The addition of the feature flag is really a UI issue, and
> > worth some discussion.
> >
> > One of the goals was to make it easy to allow kernels which didn't
> > understand fast commit to be able to mount a file system which had
> > been cleanly unmounted --- but of course, if the file system needs
> > recovery, and fast commits are in the journal, we can't allow a fast
> > commit oblivious kernel (or e2fsck) from trying to replay the journal.
>
> BTW, are there patches to fix e2fsck to replay the factcommit journal?
>
> > One way to do this would be with a mount option, but that's a bit ugly
> > --- and a mount option in /etc/fstab will cause a failure if a kernel
> > which doesn't understand that mount option is booted.
> >
> > So the basic idea is to have a compat feature which means, "please use
> > fast commits if present", and then when the file system is mounted on
> > a fast-commit capable kernel, the incompat feature meaning "we're
> > using the fast commit feature".  (This is same design pattern used
> > with the HAS_JOURNAL compat feature and the NEEDS_RECOVERY incompat
> > feature.)
> >
> > The next question is whether to use the compat and incompat feature
> > flags in the jbd2 superblock, or ext4-specific compat flags.  For the
> > incompat flag, there's no reason not to use the journal incompat flag.
> > But for the compat flag, we have better infrastructure for setting and
> > clearing ext4-level compat feature flags.  Aside from that, though,
> > there's no reason why we couldn't use the s_feature_compat field in
> > the journal superblock --- in which case, *all* of the on-diks format
> > changes would purely be on the jbd2 side of the ledger.
>
> Probably better to use the journal compat flag so that the other jbd2
> users can take advantage of it ... on the other hand, the only other
> user (AFAIK) is ocfs2 and HAH.
>
> > > Every feature flag you add doubles the size of the testing matrix.
> > > If I were you I'd only want to test the (fastcommit) and (!fastcommit)
> > > scenarios.
> >
> > Sure, absolutely.  On the other hand, as the saying goes, "there comes
> > a time in any project where it's time to shoot the engineers and put
> > the d*mned thing into production".  One of the reasons why we're super
> > interested in this feature is to claw back the performance hit of
> > fde872682e17 ("ext4: force inode writes when nfsd calls
> > commit_metadata").  I fully expect that this feature is going to make
> > big difference to a number of customer workloads, so there is some
> > urgency to getting this feature into production.
> >
> > On the flip side, if we leave some performance wins on the table, it's
> > absolutely true that it makes it harder to add those optimizations
> > later, and it increases the testing load, not to mention the forwards
> > and backwards compatibility issues.  It's an engineering trade-off.
>
> <nod> I just remember hearing you complain about the size of the ext4
> testing matrix in the past and figured you would't go for adding
> fastcommit in small pieces each with new feature bits.
>
> (I guess you could have a fastcommit_version field that increments every
> time you add a new fastcommit journal item to constrain the combinatoric
> explosion...)
I agree, I was going to suggest the same. We would probably need to
add this field in all individual fast commit blocks, since we don't
have a fast commit superblock equivalent .. and changing jbd2
superblock is probably too much to ask for I guess.
>
> --D
>
> >
> >                                            - Ted

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 01/11] ext4: add handling for extended mount options
  2019-07-24 18:14               ` harshad shirwadkar
@ 2019-07-25  3:46                 ` Theodore Y. Ts'o
  2019-07-25 20:03                 ` Andreas Dilger
  1 sibling, 0 replies; 25+ messages in thread
From: Theodore Y. Ts'o @ 2019-07-25  3:46 UTC (permalink / raw)
  To: harshad shirwadkar; +Cc: Darrick J. Wong, Andreas Dilger, Ext4 Developers List

On Wed, Jul 24, 2019 at 11:14:39AM -0700, harshad shirwadkar wrote:
> I agree, I was going to suggest the same. We would probably need to
> add this field in all individual fast commit blocks, since we don't
> have a fast commit superblock equivalent .. and changing jbd2
> superblock is probably too much to ask for I guess.

There's spare space in the jbd2 superblock.  Just grab a byte from
s_padding2...

					- Ted

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 01/11] ext4: add handling for extended mount options
  2019-07-24 18:14               ` harshad shirwadkar
  2019-07-25  3:46                 ` Theodore Y. Ts'o
@ 2019-07-25 20:03                 ` Andreas Dilger
  1 sibling, 0 replies; 25+ messages in thread
From: Andreas Dilger @ 2019-07-25 20:03 UTC (permalink / raw)
  To: harshad shirwadkar
  Cc: Darrick J. Wong, Theodore Y. Ts'o, Ext4 Developers List

[-- Attachment #1: Type: text/plain, Size: 1326 bytes --]

On Jul 24, 2019, at 12:14 PM, harshad shirwadkar <harshadshirwadkar@gmail.com> wrote:
> 
> On Wed, Jul 24, 2019 at 9:56 AM Darrick J. Wong <darrick.wong@oracle.com> wrote:
>> 
>> (I guess you could have a fastcommit_version field that increments every
>> time you add a new fastcommit journal item to constrain the combinatoric
>> explosion...)
> 
> I agree, I was going to suggest the same. We would probably need to
> add this field in all individual fast commit blocks, since we don't
> have a fast commit superblock equivalent .. and changing jbd2
> superblock is probably too much to ask for I guess.

As a general design rule, whenever you see/think "version number", you
should instead use "feature flags" as is done in the ext2/3/4 superblock.
This doesn't take any more space, and is much more flexible.

This is a _much_ more flexible paradigm for compatibility, and doesn't
require the "version X must support every version <= X" behaviour that
a version number does.  With feature flags you can support feature bit
a+b+c, and if feature B is no longer useful it can be deprecated without
affecting the use of feature A or C.  It also makes it clear that bits
a+b+c are using features A+B+C, while storing "version 7" isn't clear
which feature is in use/needed.

Cheers, Andreas






[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 873 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2019-07-25 20:03 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-07-22  4:00 [PATCH 01/11] ext4: add handling for extended mount options Harshad Shirwadkar
2019-07-22  4:00 ` [PATCH 02/11] jbd2: add fast commit fields to journal_s structure Harshad Shirwadkar
2019-07-22  4:00 ` [PATCH 03/11] jbd2: fast commit setup and enable Harshad Shirwadkar
2019-07-22  4:00 ` [PATCH 04/11] jbd2: fast-commit commit path changes Harshad Shirwadkar
2019-07-22  4:00 ` [PATCH 05/11] jbd2: fast-commit commit path new APIs Harshad Shirwadkar
2019-07-22 17:45   ` kbuild test robot
2019-07-22 21:02   ` kbuild test robot
2019-07-22  4:00 ` [PATCH 06/11] jbd2: fast-commit recovery path changes Harshad Shirwadkar
2019-07-22  4:00 ` [PATCH 07/11] ext4: add fields that are needed to track changed files Harshad Shirwadkar
2019-07-22  4:00 ` [PATCH 08/11] ext4: track changed files for fast commit Harshad Shirwadkar
2019-07-22  4:00 ` [PATCH 09/11] ext4: fast-commit commit range tracking Harshad Shirwadkar
2019-07-22  4:00 ` [PATCH 10/11] ext4: fast-commit commit path changes Harshad Shirwadkar
2019-07-22  4:00 ` [PATCH 11/11] ext4: fast-commit recovery " Harshad Shirwadkar
2019-07-22 21:34   ` kbuild test robot
2019-07-22 18:15 ` [PATCH 01/11] ext4: add handling for extended mount options Andreas Dilger
2019-07-22 21:02   ` Theodore Y. Ts'o
2019-07-22 21:36     ` harshad shirwadkar
2019-07-23 21:59     ` Andreas Dilger
2019-07-24  6:03       ` harshad shirwadkar
2019-07-24  6:12         ` Darrick J. Wong
2019-07-24 16:07           ` Theodore Y. Ts'o
2019-07-24 16:56             ` Darrick J. Wong
2019-07-24 18:14               ` harshad shirwadkar
2019-07-25  3:46                 ` Theodore Y. Ts'o
2019-07-25 20:03                 ` Andreas Dilger

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).