* [PATCH V2 1/7] ext4: remove the 'group' parameter of ext4_trim_extent
       [not found] <164ffa3b-c4d5-6967-feba-b972995a6dfb@gmail.com>
@ 2021-05-26  8:42 ` Wang Jianchao
  2021-05-27 19:47   ` Andreas Dilger
  2021-06-23 13:13   ` Theodore Ts'o
       [not found] ` <a602a6ba-2073-8384-4c8f-d669ee25c065@gmail.com>
  1 sibling, 2 replies; 21+ messages in thread
From: Wang Jianchao @ 2021-05-26  8:42 UTC (permalink / raw)
  To: Theodore Ts'o, Andreas Dilger; +Cc: linux-ext4, linux-kernel, lishujin

Get rid of the 'group' parameter of ext4_trim_extent as we can get
it from the 'e4b'.

Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com>
---
 fs/ext4/mballoc.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a02fadf..d81f1fd22 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -5650,19 +5650,19 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
  * @sb:		super block for the file system
  * @start:	starting block of the free extent in the alloc. group
  * @count:	number of blocks to TRIM
- * @group:	alloc. group we are working with
  * @e4b:	ext4 buddy for the group
  *
  * Trim "count" blocks starting at "start" in the "group". To assure that no
  * one will allocate those blocks, mark it as used in buddy bitmap. This must
  * be called with under the group lock.
  */
-static int ext4_trim_extent(struct super_block *sb, int start, int count,
-			     ext4_group_t group, struct ext4_buddy *e4b)
+static int ext4_trim_extent(struct super_block *sb,
+		int start, int count, struct ext4_buddy *e4b)
 __releases(bitlock)
 __acquires(bitlock)
 {
 	struct ext4_free_extent ex;
+	ext4_group_t group = e4b->bd_group;
 	int ret = 0;
 
 	trace_ext4_trim_extent(sb, group, start, count);
@@ -5738,8 +5738,7 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
 		next = mb_find_next_bit(bitmap, max + 1, start);
 
 		if ((next - start) >= minblocks) {
-			ret = ext4_trim_extent(sb, start,
-					       next - start, group, &e4b);
+			ret = ext4_trim_extent(sb, start, next - start, &e4b);
 			if (ret && ret != -EOPNOTSUPP)
 				break;
 			ret = 0;
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH V2 2/7] ext4: add new helper interface ext4_try_to_trim_range()
       [not found] ` <a602a6ba-2073-8384-4c8f-d669ee25c065@gmail.com>
@ 2021-05-26  8:43   ` Wang Jianchao
  2021-05-27 19:48     ` Andreas Dilger
       [not found]   ` <49382052-6238-f1fb-40d1-b6b801b39ff7@gmail.com>
  1 sibling, 1 reply; 21+ messages in thread
From: Wang Jianchao @ 2021-05-26  8:43 UTC (permalink / raw)
  To: Theodore Ts'o, Andreas Dilger; +Cc: linux-ext4, linux-kernel, lishujin

There is no functional change in this patch; it just splits the code
that searches for free blocks and does the trim into a new function,
ext4_try_to_trim_range(). This is in preparation for the following
async background discard.
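
For reference, the calling convention of the new helper (as used by
ext4_trim_all_free() below) is roughly:

	ext4_lock_group(sb, group);
	/*
	 * Must be called with the group lock held; it may drop and re-take
	 * the lock internally (around the discard and cond_resched()).
	 * Returns the number of clusters trimmed, or -ERESTARTSYS if a
	 * fatal signal is pending.
	 */
	ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks);
	ext4_unlock_group(sb, group);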

Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com>
---
 fs/ext4/mballoc.c | 102 ++++++++++++++++++++++++++++++------------------------
 1 file changed, 57 insertions(+), 45 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d81f1fd22..f984f15 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -5685,6 +5685,54 @@ static int ext4_trim_extent(struct super_block *sb,
 	return ret;
 }
 
+static int ext4_try_to_trim_range(struct super_block *sb,
+		struct ext4_buddy *e4b, ext4_grpblk_t start,
+		ext4_grpblk_t max, ext4_grpblk_t minblocks)
+{
+	ext4_grpblk_t next, count, free_count;
+	void *bitmap;
+	int ret = 0;
+
+	bitmap = e4b->bd_bitmap;
+	start = (e4b->bd_info->bb_first_free > start) ?
+		e4b->bd_info->bb_first_free : start;
+	count = 0;
+	free_count = 0;
+
+	while (start <= max) {
+		start = mb_find_next_zero_bit(bitmap, max + 1, start);
+		if (start > max)
+			break;
+		next = mb_find_next_bit(bitmap, max + 1, start);
+
+		if ((next - start) >= minblocks) {
+			ret = ext4_trim_extent(sb, start, next - start, e4b);
+			if (ret && ret != -EOPNOTSUPP)
+				break;
+			ret = 0;
+			count += next - start;
+		}
+		free_count += next - start;
+		start = next + 1;
+
+		if (fatal_signal_pending(current)) {
+			count = -ERESTARTSYS;
+			break;
+		}
+
+		if (need_resched()) {
+			ext4_unlock_group(sb, e4b->bd_group);
+			cond_resched();
+			ext4_lock_group(sb, e4b->bd_group);
+		}
+
+		if ((e4b->bd_info->bb_free - free_count) < minblocks)
+			break;
+	}
+
+	return count;
+}
+
 /**
  * ext4_trim_all_free -- function to trim all free space in alloc. group
  * @sb:			super block for file system
@@ -5708,10 +5756,8 @@ static int ext4_trim_extent(struct super_block *sb,
 		   ext4_grpblk_t start, ext4_grpblk_t max,
 		   ext4_grpblk_t minblocks)
 {
-	void *bitmap;
-	ext4_grpblk_t next, count = 0, free_count = 0;
 	struct ext4_buddy e4b;
-	int ret = 0;
+	int ret;
 
 	trace_ext4_trim_all_free(sb, group, start, max);
 
@@ -5721,57 +5767,23 @@ static int ext4_trim_extent(struct super_block *sb,
 			     ret, group);
 		return ret;
 	}
-	bitmap = e4b.bd_bitmap;
 
 	ext4_lock_group(sb, group);
-	if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
-	    minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
-		goto out;
-
-	start = (e4b.bd_info->bb_first_free > start) ?
-		e4b.bd_info->bb_first_free : start;
 
-	while (start <= max) {
-		start = mb_find_next_zero_bit(bitmap, max + 1, start);
-		if (start > max)
-			break;
-		next = mb_find_next_bit(bitmap, max + 1, start);
-
-		if ((next - start) >= minblocks) {
-			ret = ext4_trim_extent(sb, start, next - start, &e4b);
-			if (ret && ret != -EOPNOTSUPP)
-				break;
-			ret = 0;
-			count += next - start;
-		}
-		free_count += next - start;
-		start = next + 1;
-
-		if (fatal_signal_pending(current)) {
-			count = -ERESTARTSYS;
-			break;
-		}
-
-		if (need_resched()) {
-			ext4_unlock_group(sb, group);
-			cond_resched();
-			ext4_lock_group(sb, group);
-		}
-
-		if ((e4b.bd_info->bb_free - free_count) < minblocks)
-			break;
+	if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) ||
+	    minblocks < atomic_read(&EXT4_SB(sb)->s_last_trim_minblks)) {
+		ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks);
+		if (ret >= 0)
+			EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
+	} else {
+		ret = 0;
 	}
 
-	if (!ret) {
-		ret = count;
-		EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
-	}
-out:
 	ext4_unlock_group(sb, group);
 	ext4_mb_unload_buddy(&e4b);
 
 	ext4_debug("trimmed %d blocks in the group %d\n",
-		count, group);
+		ret, group);
 
 	return ret;
 }
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH V2 3/7] ext4: remove the repeated comment of ext4_trim_all_free
       [not found]   ` <49382052-6238-f1fb-40d1-b6b801b39ff7@gmail.com>
@ 2021-05-26  8:43     ` Wang Jianchao
  2021-05-27 19:49       ` Andreas Dilger
       [not found]     ` <48e33dea-d15e-f211-0191-e01bd3eb17b3@gmail.com>
  1 sibling, 1 reply; 21+ messages in thread
From: Wang Jianchao @ 2021-05-26  8:43 UTC (permalink / raw)
  To: Theodore Ts'o, Andreas Dilger; +Cc: linux-ext4, linux-kernel, lishujin

Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com>
---
 fs/ext4/mballoc.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index f984f15..85418cf 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -5741,15 +5741,10 @@ static int ext4_try_to_trim_range(struct super_block *sb,
  * @max:		last group block to examine
  * @minblocks:		minimum extent block count
  *
- * ext4_trim_all_free walks through group's buddy bitmap searching for free
- * extents. When the free block is found, ext4_trim_extent is called to TRIM
- * the extent.
- *
- *
  * ext4_trim_all_free walks through group's block bitmap searching for free
  * extents. When the free extent is found, mark it as used in group buddy
  * bitmap. Then issue a TRIM command on this extent and free the extent in
- * the group buddy bitmap. This is done until whole group is scanned.
+ * the group buddy bitmap.
  */
 static ext4_grpblk_t
 ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH V2 4/7] ext4: add new helper interface ext4_insert_free_data
       [not found]     ` <48e33dea-d15e-f211-0191-e01bd3eb17b3@gmail.com>
@ 2021-05-26  8:43       ` Wang Jianchao
  2021-05-27 20:09         ` Andreas Dilger
       [not found]       ` <67eeb65a-d413-c4f9-c06f-d5dcceca0e4f@gmail.com>
  1 sibling, 1 reply; 21+ messages in thread
From: Wang Jianchao @ 2021-05-26  8:43 UTC (permalink / raw)
  To: Theodore Ts'o, Andreas Dilger; +Cc: linux-ext4, linux-kernel, lishujin

Split the code that inserts and merges ext4_free_data structures
into a new interface, ext4_insert_free_data(). This is in
preparation for the following async background discard.

Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com>
---
 fs/ext4/mballoc.c | 96 +++++++++++++++++++++++++++++--------------------------
 1 file changed, 51 insertions(+), 45 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 85418cf..16f06d2 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -350,6 +350,12 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
 						ext4_group_t group);
 static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
+static inline struct ext4_free_data *efd_entry(struct rb_node *n)
+{
+	return rb_entry_safe(n, struct ext4_free_data, efd_node);
+}
+static int ext4_insert_free_data(struct ext4_sb_info *sbi,
+		struct rb_root *root, struct ext4_free_data *nfd);
 
 /*
  * The algorithm using this percpu seq counter goes below:
@@ -5069,28 +5075,53 @@ static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
 	kmem_cache_free(ext4_free_data_cachep, entry);
 }
 
+static int ext4_insert_free_data(struct ext4_sb_info *sbi,
+		struct rb_root *root, struct ext4_free_data *nfd)
+{
+	struct rb_node **n = &root->rb_node;
+	struct rb_node *p = NULL;
+	struct ext4_free_data *fd;
+
+	while (*n) {
+		p = *n;
+		fd = rb_entry(p, struct ext4_free_data, efd_node);
+		if (nfd->efd_start_cluster < fd->efd_start_cluster)
+			n = &(*n)->rb_left;
+		else if (nfd->efd_start_cluster >=
+			 (fd->efd_start_cluster + fd->efd_count))
+			n = &(*n)->rb_right;
+		else
+			return -EINVAL;
+	}
+
+	rb_link_node(&nfd->efd_node, p, n);
+	rb_insert_color(&nfd->efd_node, root);
+
+	/* Now try to see the extent can be merged to left and right */
+	fd = efd_entry(rb_prev(&nfd->efd_node));
+	if (fd)
+		ext4_try_merge_freed_extent(sbi, fd, nfd, root);
+
+	fd = efd_entry(rb_next(&nfd->efd_node));
+	if (fd)
+		ext4_try_merge_freed_extent(sbi, fd, nfd, root);
+
+	return 0;
+}
+
 static noinline_for_stack int
 ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
-		      struct ext4_free_data *new_entry)
+		      struct ext4_free_data *nfd)
 {
-	ext4_group_t group = e4b->bd_group;
-	ext4_grpblk_t cluster;
-	ext4_grpblk_t clusters = new_entry->efd_count;
-	struct ext4_free_data *entry;
 	struct ext4_group_info *db = e4b->bd_info;
 	struct super_block *sb = e4b->bd_sb;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct rb_node **n = &db->bb_free_root.rb_node, *node;
-	struct rb_node *parent = NULL, *new_node;
 
 	BUG_ON(!ext4_handle_valid(handle));
 	BUG_ON(e4b->bd_bitmap_page == NULL);
 	BUG_ON(e4b->bd_buddy_page == NULL);
 
-	new_node = &new_entry->efd_node;
-	cluster = new_entry->efd_start_cluster;
-
-	if (!*n) {
+	if (!db->bb_free_root.rb_node) {
 		/* first free block exent. We need to
 		   protect buddy cache from being freed,
 		 * otherwise we'll refresh it from
@@ -5099,44 +5130,19 @@ static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
 		get_page(e4b->bd_buddy_page);
 		get_page(e4b->bd_bitmap_page);
 	}
-	while (*n) {
-		parent = *n;
-		entry = rb_entry(parent, struct ext4_free_data, efd_node);
-		if (cluster < entry->efd_start_cluster)
-			n = &(*n)->rb_left;
-		else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
-			n = &(*n)->rb_right;
-		else {
-			ext4_grp_locked_error(sb, group, 0,
-				ext4_group_first_block_no(sb, group) +
-				EXT4_C2B(sbi, cluster),
-				"Block already on to-be-freed list");
-			kmem_cache_free(ext4_free_data_cachep, new_entry);
-			return 0;
-		}
-	}
-
-	rb_link_node(new_node, parent, n);
-	rb_insert_color(new_node, &db->bb_free_root);
-
-	/* Now try to see the extent can be merged to left and right */
-	node = rb_prev(new_node);
-	if (node) {
-		entry = rb_entry(node, struct ext4_free_data, efd_node);
-		ext4_try_merge_freed_extent(sbi, entry, new_entry,
-					    &(db->bb_free_root));
-	}
 
-	node = rb_next(new_node);
-	if (node) {
-		entry = rb_entry(node, struct ext4_free_data, efd_node);
-		ext4_try_merge_freed_extent(sbi, entry, new_entry,
-					    &(db->bb_free_root));
+	if (ext4_insert_free_data(sbi, &db->bb_free_root, nfd)) {
+		ext4_grp_locked_error(sb, e4b->bd_group, 0,
+				ext4_group_first_block_no(sb, e4b->bd_group) +
+				EXT4_C2B(sbi, nfd->efd_start_cluster),
+				"Block already on to-be-freed list");
+		kmem_cache_free(ext4_free_data_cachep, nfd);
+		return 0;
 	}
 
 	spin_lock(&sbi->s_md_lock);
-	list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list);
-	sbi->s_mb_free_pending += clusters;
+	list_add_tail(&nfd->efd_list, &sbi->s_freed_data_list);
+	sbi->s_mb_free_pending += nfd->efd_count;
 	spin_unlock(&sbi->s_md_lock);
 	return 0;
 }
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH V2 5/7] ext4: get buddy cache after insert successfully
       [not found]       ` <67eeb65a-d413-c4f9-c06f-d5dcceca0e4f@gmail.com>
@ 2021-05-26  8:43         ` Wang Jianchao
  2021-06-23  3:06           ` Theodore Ts'o
       [not found]         ` <0b7915bc-193a-137b-4e52-8aaef8d6fef3@gmail.com>
  1 sibling, 1 reply; 21+ messages in thread
From: Wang Jianchao @ 2021-05-26  8:43 UTC (permalink / raw)
  To: Theodore Ts'o, Andreas Dilger; +Cc: linux-ext4, linux-kernel, lishujin

The references on bd_bitmap_page and bd_buddy_page should only be
taken after the ext4_free_data has been inserted successfully.
Otherwise, nobody will put them.

Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com>
---
 fs/ext4/mballoc.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b23010c..3e8ad43 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -5116,21 +5116,12 @@ static int ext4_insert_free_data( struct ext4_sb_info *sbi,
 	struct ext4_group_info *db = e4b->bd_info;
 	struct super_block *sb = e4b->bd_sb;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	bool first = RB_EMPTY_ROOT(&db->bb_free_root);
 
 	BUG_ON(!ext4_handle_valid(handle));
 	BUG_ON(e4b->bd_bitmap_page == NULL);
 	BUG_ON(e4b->bd_buddy_page == NULL);
 
-	if (!db->bb_free_root.rb_node) {
-		/* first free block exent. We need to
-		   protect buddy cache from being freed,
-		 * otherwise we'll refresh it from
-		 * on-disk bitmap and lose not-yet-available
-		 * blocks */
-		get_page(e4b->bd_buddy_page);
-		get_page(e4b->bd_bitmap_page);
-	}
-
 	if (ext4_insert_free_data(sbi, &db->bb_free_root, nfd)) {
 		ext4_grp_locked_error(sb, e4b->bd_group, 0,
 				ext4_group_first_block_no(sb, e4b->bd_group) +
@@ -5140,6 +5131,15 @@ static int ext4_insert_free_data( struct ext4_sb_info *sbi,
 		return 0;
 	}
 
+	if (first) {
+		/* first free block exent. We need to protect buddy
+		 * cache from being freed, otherwise we'll refresh it
+		 * from on-disk bitmap and lose not-yet-available blocks
+		 */
+		get_page(e4b->bd_buddy_page);
+		get_page(e4b->bd_bitmap_page);
+	}
+
 	spin_lock(&sbi->s_md_lock);
 	list_add_tail(&nfd->efd_list, &sbi->s_freed_data_list);
 	sbi->s_mb_free_pending += nfd->efd_count;
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH V2 6/7] ext4: use bb_free_root to get the free data entry
       [not found]         ` <0b7915bc-193a-137b-4e52-8aaef8d6fef3@gmail.com>
@ 2021-05-26  8:43           ` Wang Jianchao
  2021-05-26  8:44           ` [PATCH V2 7/7] ext4: get discard out of jbd2 commit kthread context Wang Jianchao
  1 sibling, 0 replies; 21+ messages in thread
From: Wang Jianchao @ 2021-05-26  8:43 UTC (permalink / raw)
  To: Theodore Ts'o, Andreas Dilger; +Cc: linux-ext4, linux-kernel, lishujin

This is also in preparation for the following async background
discard. In this patch, s_freed_data_list is removed and we instead
iterate over each group's bb_free_root rb tree to find the entries.
After this, the list no longer needs to be updated when free data
entries are inserted or merged.

Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com>
---
 fs/ext4/balloc.c  |   2 +-
 fs/ext4/ext4.h    |   4 +--
 fs/ext4/mballoc.c | 104 +++++++++++++++++++++++++-----------------------------
 3 files changed, 50 insertions(+), 60 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 74a5172..8053a5f 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -652,7 +652,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 	 * possible we just missed a transaction commit that did so
 	 */
 	smp_mb();
-	if (sbi->s_mb_free_pending == 0)
+	if (!atomic_read(&sbi->s_mb_free_pending))
 		return ext4_has_free_clusters(sbi, 1, 0);
 
 	/*
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 826a56e3..5c5c8e4 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1525,9 +1525,7 @@ struct ext4_sb_info {
 	unsigned short *s_mb_offsets;
 	unsigned int *s_mb_maxs;
 	unsigned int s_group_info_size;
-	unsigned int s_mb_free_pending;
-	struct list_head s_freed_data_list;	/* List of blocks to be freed
-						   after commit completed */
+	atomic_t s_mb_free_pending;
 
 	/* tunables */
 	unsigned long s_stripe;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c2bf40a..15715e7 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -354,8 +354,7 @@ static inline struct ext4_free_data *efd_entry(struct rb_node *n)
 {
 	return rb_entry_safe(n, struct ext4_free_data, efd_node);
 }
-static int ext4_insert_free_data(struct ext4_sb_info *sbi,
-		struct rb_root *root, struct ext4_free_data *nfd);
+static int ext4_insert_free_data(struct rb_root *root, struct ext4_free_data *nfd);
 
 /*
  * The algorithm using this percpu seq counter goes below:
@@ -2857,8 +2856,7 @@ int ext4_mb_init(struct super_block *sb)
 
 	spin_lock_init(&sbi->s_md_lock);
 	spin_lock_init(&sbi->s_bal_lock);
-	sbi->s_mb_free_pending = 0;
-	INIT_LIST_HEAD(&sbi->s_freed_data_list);
+	atomic_set(&sbi->s_mb_free_pending, 0);
 
 	sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
 	sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
@@ -3040,9 +3038,7 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
 	/* we expect to find existing buddy because it's pinned */
 	BUG_ON(err != 0);
 
-	spin_lock(&EXT4_SB(sb)->s_md_lock);
-	EXT4_SB(sb)->s_mb_free_pending -= entry->efd_count;
-	spin_unlock(&EXT4_SB(sb)->s_md_lock);
+	atomic_sub(entry->efd_count, &EXT4_SB(sb)->s_mb_free_pending);
 
 	db = e4b.bd_info;
 	/* there are blocks to put in buddy to make them really free */
@@ -3084,37 +3080,41 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
 void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct ext4_free_data *entry, *tmp;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
+	struct ext4_free_data *fd, *nfd;
+	struct ext4_group_info *grp;
 	struct bio *discard_bio = NULL;
 	struct list_head freed_data_list;
-	struct list_head *cut_pos = NULL;
-	int err;
+	int err, i;
 
-	INIT_LIST_HEAD(&freed_data_list);
+	if (!atomic_read(&sbi->s_mb_free_pending))
+		return;
 
-	spin_lock(&sbi->s_md_lock);
-	list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) {
-		if (entry->efd_tid != commit_tid)
-			break;
-		cut_pos = &entry->efd_list;
+	INIT_LIST_HEAD(&freed_data_list);
+	for (i = 0; i < ngroups; i++) {
+		grp = ext4_get_group_info(sb, i);
+		ext4_lock_group(sb, i);
+		rbtree_postorder_for_each_entry_safe(fd, nfd,
+				&grp->bb_free_root, efd_node) {
+			if (fd->efd_tid != commit_tid)
+				continue;
+			INIT_LIST_HEAD(&fd->efd_list);
+			list_add_tail(&fd->efd_list, &freed_data_list);
+		}
+		ext4_unlock_group(sb, i);
 	}
-	if (cut_pos)
-		list_cut_position(&freed_data_list, &sbi->s_freed_data_list,
-				  cut_pos);
-	spin_unlock(&sbi->s_md_lock);
 
 	if (test_opt(sb, DISCARD)) {
-		list_for_each_entry(entry, &freed_data_list, efd_list) {
-			err = ext4_issue_discard(sb, entry->efd_group,
-						 entry->efd_start_cluster,
-						 entry->efd_count,
+		list_for_each_entry(fd, &freed_data_list, efd_list) {
+			err = ext4_issue_discard(sb, fd->efd_group,
+						 fd->efd_start_cluster,
+						 fd->efd_count,
 						 &discard_bio);
 			if (err && err != -EOPNOTSUPP) {
 				ext4_msg(sb, KERN_WARNING, "discard request in"
 					 " group:%d block:%d count:%d failed"
-					 " with %d", entry->efd_group,
-					 entry->efd_start_cluster,
-					 entry->efd_count, err);
+					 " with %d", fd->efd_group,
+					 fd->efd_start_cluster, fd->efd_count, err);
 			} else if (err == -EOPNOTSUPP)
 				break;
 		}
@@ -3125,8 +3125,8 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
 		}
 	}
 
-	list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
-		ext4_free_data_in_buddy(sb, entry);
+	list_for_each_entry_safe(fd, nfd, &freed_data_list, efd_list)
+		ext4_free_data_in_buddy(sb, fd);
 }
 
 int __init ext4_init_mballoc(void)
@@ -5051,32 +5051,27 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
  * are contiguous, AND the extents were freed by the same transaction,
  * AND the blocks are associated with the same group.
  */
-static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
-					struct ext4_free_data *entry,
-					struct ext4_free_data *new_entry,
-					struct rb_root *entry_rb_root)
+static void ext4_try_merge_freed_extent(struct rb_root *root,
+	struct ext4_free_data *fd, struct ext4_free_data *nfd)
 {
-	if ((entry->efd_tid != new_entry->efd_tid) ||
-	    (entry->efd_group != new_entry->efd_group))
+	if ((fd->efd_tid != nfd->efd_tid) ||
+	    (fd->efd_group != nfd->efd_group))
 		return;
-	if (entry->efd_start_cluster + entry->efd_count ==
-	    new_entry->efd_start_cluster) {
-		new_entry->efd_start_cluster = entry->efd_start_cluster;
-		new_entry->efd_count += entry->efd_count;
-	} else if (new_entry->efd_start_cluster + new_entry->efd_count ==
-		   entry->efd_start_cluster) {
-		new_entry->efd_count += entry->efd_count;
+	if (fd->efd_start_cluster + fd->efd_count ==
+	    nfd->efd_start_cluster) {
+		nfd->efd_start_cluster = fd->efd_start_cluster;
+		nfd->efd_count += fd->efd_count;
+	} else if (nfd->efd_start_cluster + nfd->efd_count ==
+		   fd->efd_start_cluster) {
+		nfd->efd_count += fd->efd_count;
 	} else
 		return;
-	spin_lock(&sbi->s_md_lock);
-	list_del(&entry->efd_list);
-	spin_unlock(&sbi->s_md_lock);
-	rb_erase(&entry->efd_node, entry_rb_root);
-	kmem_cache_free(ext4_free_data_cachep, entry);
+	rb_erase(&fd->efd_node, root);
+	kmem_cache_free(ext4_free_data_cachep, fd);
 }
 
-static int ext4_insert_free_data(struct ext4_sb_info *sbi,
-		struct rb_root *root, struct ext4_free_data *nfd)
+static int ext4_insert_free_data(struct rb_root *root,
+		struct ext4_free_data *nfd)
 {
 	struct rb_node **n = &root->rb_node;
 	struct rb_node *p = NULL;
@@ -5100,11 +5095,11 @@ static int ext4_insert_free_data(struct ext4_sb_info *sbi,
 	/* Now try to see the extent can be merged to left and right */
 	fd = efd_entry(rb_prev(&nfd->efd_node));
 	if (fd)
-		ext4_try_merge_freed_extent(sbi, fd, nfd, root);
+		ext4_try_merge_freed_extent(root, fd, nfd);
 
 	fd = efd_entry(rb_next(&nfd->efd_node));
 	if (fd)
-		ext4_try_merge_freed_extent(sbi, fd, nfd, root);
+		ext4_try_merge_freed_extent(root, fd, nfd);
 
 	return 0;
 }
@@ -5122,7 +5117,7 @@ static int ext4_insert_free_data(struct ext4_sb_info *sbi,
 	BUG_ON(e4b->bd_bitmap_page == NULL);
 	BUG_ON(e4b->bd_buddy_page == NULL);
 
-	if (ext4_insert_free_data(sbi, &db->bb_free_root, nfd)) {
+	if (ext4_insert_free_data(&db->bb_free_root, nfd)) {
 		ext4_grp_locked_error(sb, e4b->bd_group, 0,
 				ext4_group_first_block_no(sb, e4b->bd_group) +
 				EXT4_C2B(sbi, nfd->efd_start_cluster),
@@ -5140,10 +5135,7 @@ static int ext4_insert_free_data(struct ext4_sb_info *sbi,
 		get_page(e4b->bd_bitmap_page);
 	}
 
-	spin_lock(&sbi->s_md_lock);
-	list_add_tail(&nfd->efd_list, &sbi->s_freed_data_list);
-	sbi->s_mb_free_pending += nfd->efd_count;
-	spin_unlock(&sbi->s_md_lock);
+	atomic_add(nfd->efd_count, &sbi->s_mb_free_pending);
 	return 0;
 }
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH V2 7/7] ext4: get discard out of jbd2 commit kthread context
       [not found]         ` <0b7915bc-193a-137b-4e52-8aaef8d6fef3@gmail.com>
  2021-05-26  8:43           ` [PATCH V2 6/7] ext4: use bb_free_root to get the free data entry Wang Jianchao
@ 2021-05-26  8:44           ` Wang Jianchao
  2021-05-27 20:18             ` Andreas Dilger
                               ` (2 more replies)
  1 sibling, 3 replies; 21+ messages in thread
From: Wang Jianchao @ 2021-05-26  8:44 UTC (permalink / raw)
  To: Theodore Ts'o, Andreas Dilger; +Cc: linux-ext4, linux-kernel, lishujin

Right now, discard is issued, and waited on to complete, in the jbd2
commit kthread context after the logs are committed. When a large
number of files are deleted and discards flood the device, the jbd2
commit kthread can be blocked for a long time. Then all metadata
operations can be blocked waiting for log space.

One case is the page fault path with the read mm->mmap_sem held, which
wants to update the file time but has to wait for log space. When
another thread in the task wants to do mmap, the write mmap_sem is
blocked. Finally, all following read mmap_sem requests are blocked,
even the ps command which needs to read /proc/pid/cmdline. Our monitor
service, which needs to read /proc/pid/cmdline, used to be blocked for
5 minutes.

This patch frees the blocks back to the buddy after commit and then
does the discard in an async kworker context, in fstrim fashion, namely:
 - mark blocks to be discarded as used if they have not been allocated
 - do the discard
 - mark them free again
After this, the jbd2 commit kthread won't be blocked by discard any
more, and we won't get ENOSPC even if the discard is slow or throttled.
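
For reference, the three steps above are essentially what the existing
fstrim helper ext4_trim_extent() already does and what the new
ext4_try_to_trim_range() wraps; roughly (a sketch, with the group lock
held on entry):

	ex.fe_start = start;
	ex.fe_group = group;
	ex.fe_len = count;

	mb_mark_used(e4b, &ex);			/* 1) mark as used */
	ext4_unlock_group(sb, group);
	ret = ext4_issue_discard(sb, group, start, count, NULL); /* 2) discard */
	ext4_lock_group(sb, group);
	mb_free_blocks(NULL, e4b, ex.fe_start, ex.fe_len); /* 3) mark free again */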

Link: https://marc.info/?l=linux-kernel&m=162143690731901&w=2
Suggested-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com>
---
 fs/ext4/ext4.h    |   2 +
 fs/ext4/mballoc.c | 162 +++++++++++++++++++++++++++++++++---------------------
 fs/ext4/mballoc.h |   3 -
 3 files changed, 101 insertions(+), 66 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 5c5c8e4..2e48c88 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1526,6 +1526,7 @@ struct ext4_sb_info {
 	unsigned int *s_mb_maxs;
 	unsigned int s_group_info_size;
 	atomic_t s_mb_free_pending;
+	struct work_struct s_discard_work;
 
 	/* tunables */
 	unsigned long s_stripe;
@@ -3306,6 +3307,7 @@ struct ext4_group_info {
 	unsigned long	bb_check_counter;
 #endif
 	struct rb_root  bb_free_root;
+	struct rb_root  bb_discard_root;
 	ext4_grpblk_t	bb_first_free;	/* first free block */
 	ext4_grpblk_t	bb_free;	/* total free blocks */
 	ext4_grpblk_t	bb_fragments;	/* nr of freespace fragments */
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 15715e7..feea439 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -332,6 +332,7 @@
 static struct kmem_cache *ext4_pspace_cachep;
 static struct kmem_cache *ext4_ac_cachep;
 static struct kmem_cache *ext4_free_data_cachep;
+static struct workqueue_struct *ext4_discard_wq;
 
 /* We create slab caches for groupinfo data structures based on the
  * superblock block size.  There will be one per mounted filesystem for
@@ -355,7 +356,10 @@ static inline struct ext4_free_data *efd_entry(struct rb_node *n)
 	return rb_entry_safe(n, struct ext4_free_data, efd_node);
 }
 static int ext4_insert_free_data(struct rb_root *root, struct ext4_free_data *nfd);
-
+static int ext4_try_to_trim_range(struct super_block *sb,
+		struct ext4_buddy *e4b, ext4_grpblk_t start,
+		ext4_grpblk_t max, ext4_grpblk_t minblocks);
+static void ext4_discard_work(struct work_struct *work);
 /*
  * The algorithm using this percpu seq counter goes below:
  * 1. We sample the percpu discard_pa_seq counter before trying for block
@@ -2657,6 +2661,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 	INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
 	init_rwsem(&meta_group_info[i]->alloc_sem);
 	meta_group_info[i]->bb_free_root = RB_ROOT;
+	meta_group_info[i]->bb_discard_root = RB_ROOT;
 	meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
 
 	mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group);
@@ -2857,6 +2862,7 @@ int ext4_mb_init(struct super_block *sb)
 	spin_lock_init(&sbi->s_md_lock);
 	spin_lock_init(&sbi->s_bal_lock);
 	atomic_set(&sbi->s_mb_free_pending, 0);
+	INIT_WORK(&sbi->s_discard_work, ext4_discard_work);
 
 	sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
 	sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
@@ -2949,6 +2955,15 @@ int ext4_mb_release(struct super_block *sb)
 	struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
 	int count;
 
+	if (test_opt(sb, DISCARD)) {
+		/*
+		 * wait the discard work to drain all of ext4_free_data
+		 */
+		queue_work(ext4_discard_wq, &sbi->s_discard_work);
+		flush_work(&sbi->s_discard_work);
+	}
+
+	queue_work(ext4_discard_wq, &sbi->s_discard_work);
 	if (sbi->s_group_info) {
 		for (i = 0; i < ngroups; i++) {
 			cond_resched();
@@ -3024,30 +3039,77 @@ static inline int ext4_issue_discard(struct super_block *sb,
 		return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
 }
 
-static void ext4_free_data_in_buddy(struct super_block *sb,
-				    struct ext4_free_data *entry)
+static void ext4_discard_work(struct work_struct *work)
 {
+	struct ext4_sb_info *sbi = container_of(work,
+			struct ext4_sb_info, s_discard_work);
+	struct super_block *sb = sbi->s_sb;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
+	struct ext4_group_info *grp;
+	struct ext4_free_data *fd, *nfd;
 	struct ext4_buddy e4b;
-	struct ext4_group_info *db;
-	int err, count = 0, count2 = 0;
+	int i, err;
+
+	for (i = 0; i < ngroups; i++) {
+		grp = ext4_get_group_info(sb, i);
+		if (RB_EMPTY_ROOT(&grp->bb_discard_root))
+			continue;
 
-	mb_debug(sb, "gonna free %u blocks in group %u (0x%p):",
-		 entry->efd_count, entry->efd_group, entry);
+		err = ext4_mb_load_buddy(sb, i, &e4b);
+		if (err) {
+			ext4_warning(sb, "Error %d loading buddy information for %u",
+					err, i);
+			continue;
+		}
 
-	err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
-	/* we expect to find existing buddy because it's pinned */
-	BUG_ON(err != 0);
+		ext4_lock_group(sb, i);
+		rbtree_postorder_for_each_entry_safe(fd, nfd,
+				&grp->bb_discard_root, efd_node) {
+			rb_erase(&fd->efd_node, &grp->bb_discard_root);
+			/*
+			 * If filesystem is umounting, give up the discard
+			 */
+			if (sb->s_flags & SB_ACTIVE)
+				ext4_try_to_trim_range(sb, &e4b, fd->efd_start_cluster,
+						fd->efd_start_cluster + fd->efd_count - 1, 1);
+			kmem_cache_free(ext4_free_data_cachep, fd);
+		}
+		ext4_unlock_group(sb, i);
+		ext4_mb_unload_buddy(&e4b);
+	}
+}
 
-	atomic_sub(entry->efd_count, &EXT4_SB(sb)->s_mb_free_pending);
+static int ext4_free_data_in_buddy(struct super_block *sb,
+		ext4_group_t group, tid_t commit_tid)
+{
+	struct ext4_buddy e4b;
+	struct ext4_group_info *db;
+	struct ext4_free_data *fd, *nfd;
+	int cnt, nr_fd;
 
+	/* we expect to find existing buddy because it's pinned */
+	BUG_ON(ext4_mb_load_buddy(sb, group, &e4b));
 	db = e4b.bd_info;
-	/* there are blocks to put in buddy to make them really free */
-	count += entry->efd_count;
-	count2++;
-	ext4_lock_group(sb, entry->efd_group);
-	/* Take it out of per group rb tree */
-	rb_erase(&entry->efd_node, &(db->bb_free_root));
-	mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
+	cnt = 0;
+	nr_fd = 0;
+	ext4_lock_group(sb, group);
+	rbtree_postorder_for_each_entry_safe(fd, nfd,
+			&db->bb_free_root, efd_node) {
+		if (fd->efd_tid != commit_tid)
+			continue;
+
+		mb_debug(sb, "gonna free %u blocks in group %u (0x%p):",
+			 fd->efd_count, fd->efd_group, fd);
+		atomic_sub(fd->efd_count, &EXT4_SB(sb)->s_mb_free_pending);
+		cnt += fd->efd_count;
+		nr_fd++;
+		rb_erase(&fd->efd_node, &db->bb_free_root);
+		mb_free_blocks(NULL, &e4b, fd->efd_start_cluster, fd->efd_count);
+		if (test_opt(sb, DISCARD))
+			ext4_insert_free_data(&db->bb_discard_root, fd);
+		else
+			kmem_cache_free(ext4_free_data_cachep, fd);
+	}
 
 	/*
 	 * Clear the trimmed flag for the group so that the next
@@ -3055,22 +3117,22 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
 	 * If the volume is mounted with -o discard, online discard
 	 * is supported and the free blocks will be trimmed online.
 	 */
-	if (!test_opt(sb, DISCARD))
+	if (!test_opt(sb, DISCARD) && cnt)
 		EXT4_MB_GRP_CLEAR_TRIMMED(db);
 
-	if (!db->bb_free_root.rb_node) {
+	if (RB_EMPTY_ROOT(&db->bb_free_root) && cnt) {
 		/* No more items in the per group rb tree
 		 * balance refcounts from ext4_mb_free_metadata()
 		 */
 		put_page(e4b.bd_buddy_page);
 		put_page(e4b.bd_bitmap_page);
 	}
-	ext4_unlock_group(sb, entry->efd_group);
-	kmem_cache_free(ext4_free_data_cachep, entry);
+	ext4_unlock_group(sb, group);
 	ext4_mb_unload_buddy(&e4b);
 
-	mb_debug(sb, "freed %d blocks in %d structures\n", count,
-		 count2);
+	mb_debug(sb, "freed %d blocks in %d structures\n", cnt, nr_fd);
+
+	return nr_fd;
 }
 
 /*
@@ -3081,52 +3143,21 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	ext4_group_t ngroups = ext4_get_groups_count(sb);
-	struct ext4_free_data *fd, *nfd;
 	struct ext4_group_info *grp;
-	struct bio *discard_bio = NULL;
-	struct list_head freed_data_list;
-	int err, i;
+	int i, nr;
 
 	if (!atomic_read(&sbi->s_mb_free_pending))
 		return;
 
-	INIT_LIST_HEAD(&freed_data_list);
-	for (i = 0; i < ngroups; i++) {
+	for (i = 0, nr = 0; i < ngroups; i++) {
 		grp = ext4_get_group_info(sb, i);
-		ext4_lock_group(sb, i);
-		rbtree_postorder_for_each_entry_safe(fd, nfd,
-				&grp->bb_free_root, efd_node) {
-			if (fd->efd_tid != commit_tid)
-				continue;
-			INIT_LIST_HEAD(&fd->efd_list);
-			list_add_tail(&fd->efd_list, &freed_data_list);
-		}
-		ext4_unlock_group(sb, i);
-	}
-
-	if (test_opt(sb, DISCARD)) {
-		list_for_each_entry(fd, &freed_data_list, efd_list) {
-			err = ext4_issue_discard(sb, fd->efd_group,
-						 fd->efd_start_cluster,
-						 fd->efd_count,
-						 &discard_bio);
-			if (err && err != -EOPNOTSUPP) {
-				ext4_msg(sb, KERN_WARNING, "discard request in"
-					 " group:%d block:%d count:%d failed"
-					 " with %d", fd->efd_group,
-					 fd->efd_start_cluster, fd->efd_count, err);
-			} else if (err == -EOPNOTSUPP)
-				break;
-		}
-
-		if (discard_bio) {
-			submit_bio_wait(discard_bio);
-			bio_put(discard_bio);
-		}
+		if (RB_EMPTY_ROOT(&grp->bb_free_root))
+			continue;
+		nr += ext4_free_data_in_buddy(sb, i, commit_tid);
 	}
 
-	list_for_each_entry_safe(fd, nfd, &freed_data_list, efd_list)
-		ext4_free_data_in_buddy(sb, fd);
+	if (nr && test_opt(sb, DISCARD))
+		queue_work(ext4_discard_wq, &sbi->s_discard_work);
 }
 
 int __init ext4_init_mballoc(void)
@@ -3146,8 +3177,13 @@ int __init ext4_init_mballoc(void)
 	if (ext4_free_data_cachep == NULL)
 		goto out_ac_free;
 
-	return 0;
+	ext4_discard_wq = alloc_workqueue("ext4discard", WQ_UNBOUND, 0);
+	if (!ext4_discard_wq)
+		goto out_free_data;
 
+	return 0;
+out_free_data:
+	kmem_cache_destroy(ext4_free_data_cachep);
 out_ac_free:
 	kmem_cache_destroy(ext4_ac_cachep);
 out_pa_free:
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index e75b474..d76286b 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -79,9 +79,6 @@
 #define MB_DEFAULT_MAX_INODE_PREALLOC	512
 
 struct ext4_free_data {
-	/* this links the free block information from sb_info */
-	struct list_head		efd_list;
-
 	/* this links the free block information from group_info */
 	struct rb_node			efd_node;
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH V2 1/7] ext4: remove the 'group' parameter of ext4_trim_extent
  2021-05-26  8:42 ` [PATCH V2 1/7] ext4: remove the 'group' parameter of ext4_trim_extent Wang Jianchao
@ 2021-05-27 19:47   ` Andreas Dilger
  2021-06-23 13:13   ` Theodore Ts'o
  1 sibling, 0 replies; 21+ messages in thread
From: Andreas Dilger @ 2021-05-27 19:47 UTC (permalink / raw)
  To: Wang Jianchao
  Cc: Theodore Ts'o, Ext4 Developers List,
	Linux Kernel Mailing List, lishujin



> On May 26, 2021, at 2:42 AM, Wang Jianchao <jianchao.wan9@gmail.com> wrote:
> 
> Get rid of the 'group' parameter of ext4_trim_extent as we can get
> it from the 'e4b'.
> 
> Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com>

Reviewed-by: Andreas Dilger <adilger@dilger.ca>

> ---
> fs/ext4/mballoc.c | 9 ++++-----
> 1 file changed, 4 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index a02fadf..d81f1fd22 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -5650,19 +5650,19 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
>  * @sb:		super block for the file system
>  * @start:	starting block of the free extent in the alloc. group
>  * @count:	number of blocks to TRIM
> - * @group:	alloc. group we are working with
>  * @e4b:	ext4 buddy for the group
>  *
>  * Trim "count" blocks starting at "start" in the "group". To assure that no
>  * one will allocate those blocks, mark it as used in buddy bitmap. This must
>  * be called with under the group lock.
>  */
> -static int ext4_trim_extent(struct super_block *sb, int start, int count,
> -			     ext4_group_t group, struct ext4_buddy *e4b)
> +static int ext4_trim_extent(struct super_block *sb,
> +		int start, int count, struct ext4_buddy *e4b)
> __releases(bitlock)
> __acquires(bitlock)
> {
> 	struct ext4_free_extent ex;
> +	ext4_group_t group = e4b->bd_group;
> 	int ret = 0;
> 
> 	trace_ext4_trim_extent(sb, group, start, count);
> @@ -5738,8 +5738,7 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
> 		next = mb_find_next_bit(bitmap, max + 1, start);
> 
> 		if ((next - start) >= minblocks) {
> -			ret = ext4_trim_extent(sb, start,
> -					       next - start, group, &e4b);
> +			ret = ext4_trim_extent(sb, start, next - start, &e4b);
> 			if (ret && ret != -EOPNOTSUPP)
> 				break;
> 			ret = 0;
> --
> 1.8.3.1


Cheers, Andreas







^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH V2 2/7] ext4: add new helper interface ext4_try_to_trim_range()
  2021-05-26  8:43   ` [PATCH V2 2/7] ext4: add new helper interface ext4_try_to_trim_range() Wang Jianchao
@ 2021-05-27 19:48     ` Andreas Dilger
  0 siblings, 0 replies; 21+ messages in thread
From: Andreas Dilger @ 2021-05-27 19:48 UTC (permalink / raw)
  To: Wang Jianchao
  Cc: Theodore Ts'o, Ext4 Developers List, lishujin, linux-fsdevel


On May 26, 2021, at 2:43 AM, Wang Jianchao <jianchao.wan9@gmail.com> wrote:
> 
> There is no functional change in this patch; it just splits the code
> that searches for free blocks and does the trim into a new function,
> ext4_try_to_trim_range(). This is in preparation for the following
> async background discard.
> 
> Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com>

Reviewed-by: Andreas Dilger <adilger@dilger.ca>

> ---
> fs/ext4/mballoc.c | 102 ++++++++++++++++++++++++++++++------------------------
> 1 file changed, 57 insertions(+), 45 deletions(-)
> 
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index d81f1fd22..f984f15 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -5685,6 +5685,54 @@ static int ext4_trim_extent(struct super_block *sb,
> 	return ret;
> }
> 
> +static int ext4_try_to_trim_range(struct super_block *sb,
> +		struct ext4_buddy *e4b, ext4_grpblk_t start,
> +		ext4_grpblk_t max, ext4_grpblk_t minblocks)
> +{
> +	ext4_grpblk_t next, count, free_count;
> +	void *bitmap;
> +	int ret = 0;
> +
> +	bitmap = e4b->bd_bitmap;
> +	start = (e4b->bd_info->bb_first_free > start) ?
> +		e4b->bd_info->bb_first_free : start;
> +	count = 0;
> +	free_count = 0;
> +
> +	while (start <= max) {
> +		start = mb_find_next_zero_bit(bitmap, max + 1, start);
> +		if (start > max)
> +			break;
> +		next = mb_find_next_bit(bitmap, max + 1, start);
> +
> +		if ((next - start) >= minblocks) {
> +			ret = ext4_trim_extent(sb, start, next - start, e4b);
> +			if (ret && ret != -EOPNOTSUPP)
> +				break;
> +			ret = 0;
> +			count += next - start;
> +		}
> +		free_count += next - start;
> +		start = next + 1;
> +
> +		if (fatal_signal_pending(current)) {
> +			count = -ERESTARTSYS;
> +			break;
> +		}
> +
> +		if (need_resched()) {
> +			ext4_unlock_group(sb, e4b->bd_group);
> +			cond_resched();
> +			ext4_lock_group(sb, e4b->bd_group);
> +		}
> +
> +		if ((e4b->bd_info->bb_free - free_count) < minblocks)
> +			break;
> +	}
> +
> +	return count;
> +}
> +
> /**
>  * ext4_trim_all_free -- function to trim all free space in alloc. group
>  * @sb:			super block for file system
> @@ -5708,10 +5756,8 @@ static int ext4_trim_extent(struct super_block *sb,
> 		   ext4_grpblk_t start, ext4_grpblk_t max,
> 		   ext4_grpblk_t minblocks)
> {
> -	void *bitmap;
> -	ext4_grpblk_t next, count = 0, free_count = 0;
> 	struct ext4_buddy e4b;
> -	int ret = 0;
> +	int ret;
> 
> 	trace_ext4_trim_all_free(sb, group, start, max);
> 
> @@ -5721,57 +5767,23 @@ static int ext4_trim_extent(struct super_block *sb,
> 			     ret, group);
> 		return ret;
> 	}
> -	bitmap = e4b.bd_bitmap;
> 
> 	ext4_lock_group(sb, group);
> -	if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
> -	    minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
> -		goto out;
> -
> -	start = (e4b.bd_info->bb_first_free > start) ?
> -		e4b.bd_info->bb_first_free : start;
> 
> -	while (start <= max) {
> -		start = mb_find_next_zero_bit(bitmap, max + 1, start);
> -		if (start > max)
> -			break;
> -		next = mb_find_next_bit(bitmap, max + 1, start);
> -
> -		if ((next - start) >= minblocks) {
> -			ret = ext4_trim_extent(sb, start, next - start, &e4b);
> -			if (ret && ret != -EOPNOTSUPP)
> -				break;
> -			ret = 0;
> -			count += next - start;
> -		}
> -		free_count += next - start;
> -		start = next + 1;
> -
> -		if (fatal_signal_pending(current)) {
> -			count = -ERESTARTSYS;
> -			break;
> -		}
> -
> -		if (need_resched()) {
> -			ext4_unlock_group(sb, group);
> -			cond_resched();
> -			ext4_lock_group(sb, group);
> -		}
> -
> -		if ((e4b.bd_info->bb_free - free_count) < minblocks)
> -			break;
> +	if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) ||
> +	    minblocks < atomic_read(&EXT4_SB(sb)->s_last_trim_minblks)) {
> +		ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks);
> +		if (ret >= 0)
> +			EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
> +	} else {
> +		ret = 0;
> 	}
> 
> -	if (!ret) {
> -		ret = count;
> -		EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
> -	}
> -out:
> 	ext4_unlock_group(sb, group);
> 	ext4_mb_unload_buddy(&e4b);
> 
> 	ext4_debug("trimmed %d blocks in the group %d\n",
> -		count, group);
> +		ret, group);
> 
> 	return ret;
> }
> --
> 1.8.3.1


Cheers, Andreas







^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH V2 3/7] ext4: remove the repeated comment of ext4_trim_all_free
  2021-05-26  8:43     ` [PATCH V2 3/7] ext4: remove the repeated comment of ext4_trim_all_free Wang Jianchao
@ 2021-05-27 19:49       ` Andreas Dilger
  0 siblings, 0 replies; 21+ messages in thread
From: Andreas Dilger @ 2021-05-27 19:49 UTC (permalink / raw)
  To: Wang Jianchao
  Cc: Theodore Ts'o, Ext4 Developers List, lishujin, linux-fsdevel


On May 26, 2021, at 2:43 AM, Wang Jianchao <jianchao.wan9@gmail.com> wrote:
> 
> Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com>

Reviewed-by: Andreas Dilger <adilger@dilger.ca>

> ---
> fs/ext4/mballoc.c | 7 +------
> 1 file changed, 1 insertion(+), 6 deletions(-)
> 
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index f984f15..85418cf 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -5741,15 +5741,10 @@ static int ext4_try_to_trim_range(struct super_block *sb,
>  * @max:		last group block to examine
>  * @minblocks:		minimum extent block count
>  *
> - * ext4_trim_all_free walks through group's buddy bitmap searching for free
> - * extents. When the free block is found, ext4_trim_extent is called to TRIM
> - * the extent.
> - *
> - *
>  * ext4_trim_all_free walks through group's block bitmap searching for free
>  * extents. When the free extent is found, mark it as used in group buddy
>  * bitmap. Then issue a TRIM command on this extent and free the extent in
> - * the group buddy bitmap. This is done until whole group is scanned.
> + * the group buddy bitmap.
>  */
> static ext4_grpblk_t
> ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
> --
> 1.8.3.1


Cheers, Andreas







^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH V2 4/7] ext4: add new helper interface ext4_insert_free_data
  2021-05-26  8:43       ` [PATCH V2 4/7] ext4: add new helper interface ext4_insert_free_data Wang Jianchao
@ 2021-05-27 20:09         ` Andreas Dilger
  2021-05-28  3:40           ` Wang Jianchao
  0 siblings, 1 reply; 21+ messages in thread
From: Andreas Dilger @ 2021-05-27 20:09 UTC (permalink / raw)
  To: Wang Jianchao
  Cc: Theodore Ts'o, Ext4 Developers List, lishujin, linux-fsdevel


On May 26, 2021, at 2:43 AM, Wang Jianchao <jianchao.wan9@gmail.com> wrote:
> 
> Split the code that inserts and merges ext4_free_data structures
> into a new interface, ext4_insert_free_data(). This is in
> preparation for the following async background discard.

Thank you for your patch series.  I think this is an important area to
improve, since the current "-o discard" option adds too much overhead
to be really usable in practice.

One problem with tracking the fine-grained freed extents and then using
them directly to submit TRIM requests is that the underlying device may
ignore TRIM requests that are too small.  Submitting the TRIM right
after each transaction commit does not allow much time for freed blocks
to be aggregated (e.g. "rm -r" of a big directory tree), so it would be
better to delay TRIM requests until more freed extents can be merged.
Since most users only run fstrim once a day or every few days, it makes
sense to allow time to merge freed space (tunable, maybe 5-15 minutes).
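
As a rough illustration of that idea (not part of this series; the field
and knob names below are hypothetical), the discard work could be driven
by a delayed work item with a tunable interval, so freed extents get a
chance to merge before any TRIM is issued:

	/* hypothetical tunable, e.g. 5-15 minutes, exposed via sysfs */
	sbi->s_discard_interval = 10 * 60;	/* seconds */
	INIT_DELAYED_WORK(&sbi->s_discard_dwork, ext4_discard_work);

	/*
	 * In ext4_process_freed_data(), instead of queueing immediately.
	 * queue_delayed_work() does nothing if the work is already pending,
	 * so discards run at most once per interval.
	 */
	if (nr && test_opt(sb, DISCARD))
		queue_delayed_work(ext4_discard_wq, &sbi->s_discard_dwork,
				   sbi->s_discard_interval * HZ);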

However, tracking the rbtree for each group may be quite a lot of overhead
if this is kept in memory for minutes or hours, so minimizing the memory
usage to track freed extents is also important.

We discussed on the ext4 developer call today whether it is necessary
to track the fine-grained free extents in memory, or if it would be
better to only track min/max freed blocks within each group?  Depending
on the fragmentation of the free blocks in the group, it may be enough
to just store a single bit in each group (as is done today), and only
clear this when there are blocks freed in the group.
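
A minimal sketch of the per-group min/max variant (the field and helper
names below are hypothetical, not from this series): freeing blocks only
widens a single to-be-trimmed range, so the worker needs just one trim
call per dirty group:

	/* hypothetical fields in struct ext4_group_info, reset to an empty
	 * range (min > max) after each trim */
	ext4_grpblk_t	bb_trim_min;	/* lowest cluster freed since last trim */
	ext4_grpblk_t	bb_trim_max;	/* highest cluster freed since last trim */

	/* called under the group lock when [start, start + count) is freed */
	static void ext4_note_freed_range(struct ext4_group_info *grp,
					  ext4_grpblk_t start, ext4_grpblk_t count)
	{
		if (start < grp->bb_trim_min)
			grp->bb_trim_min = start;
		if (start + count - 1 > grp->bb_trim_max)
			grp->bb_trim_max = start + count - 1;
	}

	/* in the discard worker, one call per group instead of an rbtree walk */
	ext4_try_to_trim_range(sb, &e4b, grp->bb_trim_min,
			       grp->bb_trim_max, minblocks);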

Either way, the improvement would be that the kernel is scheduling
groups to be trimmed, and submitting TRIM requests at a much larger size,
instead of depending on userspace to run fstrim.  This also allows the
fstrim scheduler to decide when the device is less busy and submit more
TRIM requests, and back off when the device is busy.

The other potential improvement is to track the TRIMMED state persistently
in the block groups, so that unmount/remount doesn't result in every group
being trimmed again.  It would be good to refresh and include patches from:

"ext4: introduce EXT4_BG_WAS_TRIMMED to optimize trim"
https://patchwork.ozlabs.org/project/linux-ext4/list/?series=184981

and

e2fsprogs: add EXT2_FLAG_BG_WAS_TRIMMED to optimize fstrim
https://patchwork.ozlabs.org/project/linux-ext4/list/?series=179639

along with this series.
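
For reference, the persistent variant would look roughly like the sketch
below. EXT4_BG_WAS_TRIMMED is the flag name from the series linked above;
how the flag is read, set and journalled here is only illustrative:

	struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);

	/* skip groups that were already trimmed and have seen no frees */
	if (gdp->bg_flags & cpu_to_le16(EXT4_BG_WAS_TRIMMED) &&
	    minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
		return 0;

	ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks);
	if (ret >= 0) {
		gdp->bg_flags |= cpu_to_le16(EXT4_BG_WAS_TRIMMED);
		/* the descriptor must also be marked dirty (and its checksum
		 * updated) so the flag survives a remount */
	}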

> Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com>
> ---
> fs/ext4/mballoc.c | 96 +++++++++++++++++++++++++++++--------------------------
> 1 file changed, 51 insertions(+), 45 deletions(-)
> 
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 85418cf..16f06d2 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -350,6 +350,12 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
> static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
> 						ext4_group_t group);
> static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
> +static inline struct ext4_free_data *efd_entry(struct rb_node *n)
> +{
> +	return rb_entry_safe(n, struct ext4_free_data, efd_node);
> +}
> +static int ext4_insert_free_data(struct ext4_sb_info *sbi,
> +		struct rb_root *root, struct ext4_free_data *nfd);
> 
> /*
>  * The algorithm using this percpu seq counter goes below:
> @@ -5069,28 +5075,53 @@ static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
> 	kmem_cache_free(ext4_free_data_cachep, entry);
> }
> 
> +static int ext4_insert_free_data(struct ext4_sb_info *sbi,
> +		struct rb_root *root, struct ext4_free_data *nfd)
> +{
> +	struct rb_node **n = &root->rb_node;
> +	struct rb_node *p = NULL;
> +	struct ext4_free_data *fd;
> +
> +	while (*n) {
> +		p = *n;
> +		fd = rb_entry(p, struct ext4_free_data, efd_node);
> +		if (nfd->efd_start_cluster < fd->efd_start_cluster)
> +			n = &(*n)->rb_left;
> +		else if (nfd->efd_start_cluster >=
> +			 (fd->efd_start_cluster + fd->efd_count))
> +			n = &(*n)->rb_right;
> +		else
> +			return -EINVAL;
> +	}
> +
> +	rb_link_node(&nfd->efd_node, p, n);
> +	rb_insert_color(&nfd->efd_node, root);
> +
> +	/* Now try to see the extent can be merged to left and right */
> +	fd = efd_entry(rb_prev(&nfd->efd_node));
> +	if (fd)
> +		ext4_try_merge_freed_extent(sbi, fd, nfd, root);
> +
> +	fd = efd_entry(rb_next(&nfd->efd_node));
> +	if (fd)
> +		ext4_try_merge_freed_extent(sbi, fd, nfd, root);
> +
> +	return 0;
> +}
> +
> static noinline_for_stack int
> ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
> -		      struct ext4_free_data *new_entry)
> +		      struct ext4_free_data *nfd)
> {
> -	ext4_group_t group = e4b->bd_group;
> -	ext4_grpblk_t cluster;
> -	ext4_grpblk_t clusters = new_entry->efd_count;
> -	struct ext4_free_data *entry;
> 	struct ext4_group_info *db = e4b->bd_info;
> 	struct super_block *sb = e4b->bd_sb;
> 	struct ext4_sb_info *sbi = EXT4_SB(sb);
> -	struct rb_node **n = &db->bb_free_root.rb_node, *node;
> -	struct rb_node *parent = NULL, *new_node;
> 
> 	BUG_ON(!ext4_handle_valid(handle));
> 	BUG_ON(e4b->bd_bitmap_page == NULL);
> 	BUG_ON(e4b->bd_buddy_page == NULL);
> 
> -	new_node = &new_entry->efd_node;
> -	cluster = new_entry->efd_start_cluster;
> -
> -	if (!*n) {
> +	if (!db->bb_free_root.rb_node) {
> 		/* first free block exent. We need to
> 		   protect buddy cache from being freed,
> 		 * otherwise we'll refresh it from
> @@ -5099,44 +5130,19 @@ static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
> 		get_page(e4b->bd_buddy_page);
> 		get_page(e4b->bd_bitmap_page);
> 	}
> -	while (*n) {
> -		parent = *n;
> -		entry = rb_entry(parent, struct ext4_free_data, efd_node);
> -		if (cluster < entry->efd_start_cluster)
> -			n = &(*n)->rb_left;
> -		else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
> -			n = &(*n)->rb_right;
> -		else {
> -			ext4_grp_locked_error(sb, group, 0,
> -				ext4_group_first_block_no(sb, group) +
> -				EXT4_C2B(sbi, cluster),
> -				"Block already on to-be-freed list");
> -			kmem_cache_free(ext4_free_data_cachep, new_entry);
> -			return 0;
> -		}
> -	}
> -
> -	rb_link_node(new_node, parent, n);
> -	rb_insert_color(new_node, &db->bb_free_root);
> -
> -	/* Now try to see the extent can be merged to left and right */
> -	node = rb_prev(new_node);
> -	if (node) {
> -		entry = rb_entry(node, struct ext4_free_data, efd_node);
> -		ext4_try_merge_freed_extent(sbi, entry, new_entry,
> -					    &(db->bb_free_root));
> -	}
> 
> -	node = rb_next(new_node);
> -	if (node) {
> -		entry = rb_entry(node, struct ext4_free_data, efd_node);
> -		ext4_try_merge_freed_extent(sbi, entry, new_entry,
> -					    &(db->bb_free_root));
> +	if (ext4_insert_free_data(sbi, &db->bb_free_root, nfd)) {
> +		ext4_grp_locked_error(sb, e4b->bd_group, 0,
> +				ext4_group_first_block_no(sb, e4b->bd_group) +
> +				EXT4_C2B(sbi, nfd->efd_start_cluster),
> +				"Block already on to-be-freed list");
> +		kmem_cache_free(ext4_free_data_cachep, nfd);
> +		return 0;
> 	}
> 
> 	spin_lock(&sbi->s_md_lock);
> -	list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list);
> -	sbi->s_mb_free_pending += clusters;
> +	list_add_tail(&nfd->efd_list, &sbi->s_freed_data_list);
> +	sbi->s_mb_free_pending += nfd->efd_count;
> 	spin_unlock(&sbi->s_md_lock);
> 	return 0;
> }
> --
> 1.8.3.1
> 


Cheers, Andreas







^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH V2 7/7] ext4: get discard out of jbd2 commit kthread context
  2021-05-26  8:44           ` [PATCH V2 7/7] ext4: get discard out of jbd2 commit kthread context Wang Jianchao
@ 2021-05-27 20:18             ` Andreas Dilger
  2021-05-28  3:06               ` Wang Jianchao
  2021-06-22  0:55             ` Josh Triplett
  2023-09-06  0:11             ` Sarthak Kukreti
  2 siblings, 1 reply; 21+ messages in thread
From: Andreas Dilger @ 2021-05-27 20:18 UTC (permalink / raw)
  To: Wang Jianchao
  Cc: Theodore Ts'o, Ext4 Developers List, lishujin, linux-fsdevel


On May 26, 2021, at 2:44 AM, Wang Jianchao <jianchao.wan9@gmail.com> wrote:
> 
> Right now, discard is issued, and waited on to complete, in the jbd2
> commit kthread context after the logs are committed. When a large
> number of files are deleted and discards flood the device, the jbd2
> commit kthread can be blocked for a long time. Then all metadata
> operations can be blocked waiting for log space.
> 
> One case is the page fault path with the read mm->mmap_sem held, which
> wants to update the file time but has to wait for log space. When
> another thread in the task wants to do mmap, the write mmap_sem is
> blocked. Finally, all following read mmap_sem requests are blocked,
> even the ps command which needs to read /proc/pid/cmdline. Our monitor
> service, which needs to read /proc/pid/cmdline, used to be blocked for
> 5 minutes.
> 
> This patch frees the blocks back to the buddy after commit and then
> does the discard in an async kworker context, in fstrim fashion, namely:
> - mark blocks to be discarded as used if they have not been allocated
> - do the discard
> - mark them free again
> After this, the jbd2 commit kthread won't be blocked by discard any
> more, and we won't get ENOSPC even if the discard is slow or throttled.

I definitely agree that sharing the existing fstrim functionality makes
the most sense here.  Some comments inline on the implementation.

> Link: https://marc.info/?l=linux-kernel&m=162143690731901&w=2
> Suggested-by: Theodore Ts'o <tytso@mit.edu>
> Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com>
> ---
> fs/ext4/ext4.h    |   2 +
> fs/ext4/mballoc.c | 162 +++++++++++++++++++++++++++++++++---------------------
> fs/ext4/mballoc.h |   3 -
> 3 files changed, 101 insertions(+), 66 deletions(-)
> 
> @@ -3024,30 +3039,77 @@ static inline int ext4_issue_discard(struct super_block *sb,
> 		return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
> }
> 
> -static void ext4_free_data_in_buddy(struct super_block *sb,
> -				    struct ext4_free_data *entry)
> +static void ext4_discard_work(struct work_struct *work)
> {
> +	struct ext4_sb_info *sbi = container_of(work,
> +			struct ext4_sb_info, s_discard_work);
> +	struct super_block *sb = sbi->s_sb;
> +	ext4_group_t ngroups = ext4_get_groups_count(sb);
> +	struct ext4_group_info *grp;
> +	struct ext4_free_data *fd, *nfd;
> 	struct ext4_buddy e4b;
> -	struct ext4_group_info *db;
> -	int err, count = 0, count2 = 0;
> +	int i, err;
> +
> +	for (i = 0; i < ngroups; i++) {
> +		grp = ext4_get_group_info(sb, i);
> +		if (RB_EMPTY_ROOT(&grp->bb_discard_root))
> +			continue;

For large filesystems there may be millions of block groups, so it
seems inefficient to scan all of the groups each time the work queue
is run.  Having a list of block group numbers, or bitmap/rbtree/xarray
of the group numbers that need to be trimmed may be more efficient?

Most of the complexity in the rest of the patch goes away if the trim
tracking is only done on a whole-group basis (min/max or just a single
bit per group).
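
As a rough illustration of the bitmap idea (untested, and "s_discard_bitmap"
is a made-up per-sb field here, one bit per group, allocated at mount), the
worker could then jump straight to the groups that actually have work:

	unsigned long group = 0;

	/* sketch only: visit just the groups whose bit is set */
	while ((group = find_next_bit(sbi->s_discard_bitmap,
				      ngroups, group)) < ngroups) {
		clear_bit(group, sbi->s_discard_bitmap);
		/* load the buddy and trim bb_discard_root for this
		 * group, exactly as the patch already does */
		group++;
	}

The setter side would just mark the group when a freed extent is moved
onto bb_discard_root.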

Cheers, Andreas

> -	mb_debug(sb, "gonna free %u blocks in group %u (0x%p):",
> -		 entry->efd_count, entry->efd_group, entry);
> +		err = ext4_mb_load_buddy(sb, i, &e4b);
> +		if (err) {
> +			ext4_warning(sb, "Error %d loading buddy information for %u",
> +					err, i);
> +			continue;
> +		}
> 
> -	err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
> -	/* we expect to find existing buddy because it's pinned */
> -	BUG_ON(err != 0);
> +		ext4_lock_group(sb, i);
> +		rbtree_postorder_for_each_entry_safe(fd, nfd,
> +				&grp->bb_discard_root, efd_node) {
> +			rb_erase(&fd->efd_node, &grp->bb_discard_root);
> +			/*
> +			 * If filesystem is umounting, give up the discard
> +			 */
> +			if (sb->s_flags & SB_ACTIVE)
> +				ext4_try_to_trim_range(sb, &e4b, fd->efd_start_cluster,
> +						fd->efd_start_cluster + fd->efd_count - 1, 1);
> +			kmem_cache_free(ext4_free_data_cachep, fd);
> +		}
> +		ext4_unlock_group(sb, i);
> +		ext4_mb_unload_buddy(&e4b);
> +	}
> +}
> 
> -	atomic_sub(entry->efd_count, &EXT4_SB(sb)->s_mb_free_pending);
> +static int ext4_free_data_in_buddy(struct super_block *sb,
> +		ext4_group_t group, tid_t commit_tid)
> +{
> +	struct ext4_buddy e4b;
> +	struct ext4_group_info *db;
> +	struct ext4_free_data *fd, *nfd;
> +	int cnt, nr_fd;
> 
> +	/* we expect to find existing buddy because it's pinned */
> +	BUG_ON(ext4_mb_load_buddy(sb, group, &e4b));
> 	db = e4b.bd_info;
> -	/* there are blocks to put in buddy to make them really free */
> -	count += entry->efd_count;
> -	count2++;
> -	ext4_lock_group(sb, entry->efd_group);
> -	/* Take it out of per group rb tree */
> -	rb_erase(&entry->efd_node, &(db->bb_free_root));
> -	mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
> +	cnt = 0;
> +	nr_fd = 0;
> +	ext4_lock_group(sb, group);
> +	rbtree_postorder_for_each_entry_safe(fd, nfd,
> +			&db->bb_free_root, efd_node) {
> +		if (fd->efd_tid != commit_tid)
> +			continue;
> +
> +		mb_debug(sb, "gonna free %u blocks in group %u (0x%p):",
> +			 fd->efd_count, fd->efd_group, fd);
> +		atomic_sub(fd->efd_count, &EXT4_SB(sb)->s_mb_free_pending);
> +		cnt += fd->efd_count;
> +		nr_fd++;
> +		rb_erase(&fd->efd_node, &db->bb_free_root);
> +		mb_free_blocks(NULL, &e4b, fd->efd_start_cluster, fd->efd_count);
> +		if (test_opt(sb, DISCARD))
> +			ext4_insert_free_data(&db->bb_discard_root, fd);
> +		else
> +			kmem_cache_free(ext4_free_data_cachep, fd);
> +	}
> 
> 	/*
> 	 * Clear the trimmed flag for the group so that the next
> @@ -3055,22 +3117,22 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
> 	 * If the volume is mounted with -o discard, online discard
> 	 * is supported and the free blocks will be trimmed online.
> 	 */
> -	if (!test_opt(sb, DISCARD))
> +	if (!test_opt(sb, DISCARD) && cnt)
> 		EXT4_MB_GRP_CLEAR_TRIMMED(db);
> 
> -	if (!db->bb_free_root.rb_node) {
> +	if (RB_EMPTY_ROOT(&db->bb_free_root) && cnt) {
> 		/* No more items in the per group rb tree
> 		 * balance refcounts from ext4_mb_free_metadata()
> 		 */
> 		put_page(e4b.bd_buddy_page);
> 		put_page(e4b.bd_bitmap_page);
> 	}
> -	ext4_unlock_group(sb, entry->efd_group);
> -	kmem_cache_free(ext4_free_data_cachep, entry);
> +	ext4_unlock_group(sb, group);
> 	ext4_mb_unload_buddy(&e4b);
> 
> -	mb_debug(sb, "freed %d blocks in %d structures\n", count,
> -		 count2);
> +	mb_debug(sb, "freed %d blocks in %d structures\n", cnt, nr_fd);
> +
> +	return nr_fd;
> }
> 
> /*
> @@ -3081,52 +3143,21 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
> {
> 	struct ext4_sb_info *sbi = EXT4_SB(sb);
> 	ext4_group_t ngroups = ext4_get_groups_count(sb);
> -	struct ext4_free_data *fd, *nfd;
> 	struct ext4_group_info *grp;
> -	struct bio *discard_bio = NULL;
> -	struct list_head freed_data_list;
> -	int err, i;
> +	int i, nr;
> 
> 	if (!atomic_read(&sbi->s_mb_free_pending))
> 		return;
> 
> -	INIT_LIST_HEAD(&freed_data_list);
> -	for (i = 0; i < ngroups; i++) {
> +	for (i = 0, nr = 0; i < ngroups; i++) {
> 		grp = ext4_get_group_info(sb, i);
> -		ext4_lock_group(sb, i);
> -		rbtree_postorder_for_each_entry_safe(fd, nfd,
> -				&grp->bb_free_root, efd_node) {
> -			if (fd->efd_tid != commit_tid)
> -				continue;
> -			INIT_LIST_HEAD(&fd->efd_list);
> -			list_add_tail(&fd->efd_list, &freed_data_list);
> -		}
> -		ext4_unlock_group(sb, i);
> -	}
> -
> -	if (test_opt(sb, DISCARD)) {
> -		list_for_each_entry(fd, &freed_data_list, efd_list) {
> -			err = ext4_issue_discard(sb, fd->efd_group,
> -						 fd->efd_start_cluster,
> -						 fd->efd_count,
> -						 &discard_bio);
> -			if (err && err != -EOPNOTSUPP) {
> -				ext4_msg(sb, KERN_WARNING, "discard request in"
> -					 " group:%d block:%d count:%d failed"
> -					 " with %d", fd->efd_group,
> -					 fd->efd_start_cluster, fd->efd_count, err);
> -			} else if (err == -EOPNOTSUPP)
> -				break;
> -		}
> -
> -		if (discard_bio) {
> -			submit_bio_wait(discard_bio);
> -			bio_put(discard_bio);
> -		}
> +		if (RB_EMPTY_ROOT(&grp->bb_free_root))
> +			continue;
> +		nr += ext4_free_data_in_buddy(sb, i, commit_tid);
> 	}
> 
> -	list_for_each_entry_safe(fd, nfd, &freed_data_list, efd_list)
> -		ext4_free_data_in_buddy(sb, fd);
> +	if (nr && test_opt(sb, DISCARD))
> +		queue_work(ext4_discard_wq, &sbi->s_discard_work);
> }
> 
> int __init ext4_init_mballoc(void)
> @@ -3146,8 +3177,13 @@ int __init ext4_init_mballoc(void)
> 	if (ext4_free_data_cachep == NULL)
> 		goto out_ac_free;
> 
> -	return 0;
> +	ext4_discard_wq = alloc_workqueue("ext4discard", WQ_UNBOUND, 0);
> +	if (!ext4_discard_wq)
> +		goto out_free_data;
> 
> +	return 0;
> +out_free_data:
> +	kmem_cache_destroy(ext4_free_data_cachep);
> out_ac_free:
> 	kmem_cache_destroy(ext4_ac_cachep);
> out_pa_free:
> diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
> index e75b474..d76286b 100644
> --- a/fs/ext4/mballoc.h
> +++ b/fs/ext4/mballoc.h
> @@ -79,9 +79,6 @@
> #define MB_DEFAULT_MAX_INODE_PREALLOC	512
> 
> struct ext4_free_data {
> -	/* this links the free block information from sb_info */
> -	struct list_head		efd_list;
> -
> 	/* this links the free block information from group_info */
> 	struct rb_node			efd_node;
> 
> --
> 1.8.3.1


Cheers, Andreas







^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH V2 7/7] ext4: get discard out of jbd2 commit kthread contex
  2021-05-27 20:18             ` Andreas Dilger
@ 2021-05-28  3:06               ` Wang Jianchao
  0 siblings, 0 replies; 21+ messages in thread
From: Wang Jianchao @ 2021-05-28  3:06 UTC (permalink / raw)
  To: Andreas Dilger
  Cc: Theodore Ts'o, Ext4 Developers List, lishujin, linux-fsdevel



On 2021/5/28 4:18 AM, Andreas Dilger wrote:
> On May 26, 2021, at 2:44 AM, Wang Jianchao <jianchao.wan9@gmail.com> wrote:
>>
>> Right now, discard is issued and waited to be completed in jbd2
>> commit kthread context after the logs are committed. When large
>> amount of files are deleted and discard is flooding, jbd2 commit
>> kthread can be blocked for long time. Then all of the metadata
>> operations can be blocked to wait the log space.
>>
>> One case is the page fault path with read mm->mmap_sem held, which
>> wants to update the file time but has to wait for the log space.
>> When other threads in the task wants to do mmap, then write mmap_sem
>> is blocked. Finally all of the following read mmap_sem requirements
>> are blocked, even the ps command which need to read the /proc/pid/
>> -cmdline. Our monitor service which needs to read /proc/pid/cmdline
>> used to be blocked for 5 mins.
>>
>> This patch frees the blocks back to buddy after commit and then do
>> discard in a async kworker context in fstrim fashion, namely,
>> - mark blocks to be discarded as used if they have not been allocated
>> - do discard
>> - mark them free
>> After this, jbd2 commit kthread won't be blocked any more by discard
>> and we won't get NOSPC even if the discard is slow or throttled.
> 
> I definitely agree that sharing the existing fstrim functionality makes
> the most sense here.  Some comments inline on the implementation.
> 
>> Link: https://marc.info/?l=linux-kernel&m=162143690731901&w=2
>> Suggested-by: Theodore Ts'o <tytso@mit.edu>
>> Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com>
>> ---
>> fs/ext4/ext4.h    |   2 +
>> fs/ext4/mballoc.c | 162 +++++++++++++++++++++++++++++++++---------------------
>> fs/ext4/mballoc.h |   3 -
>> 3 files changed, 101 insertions(+), 66 deletions(-)
>>
>> @@ -3024,30 +3039,77 @@ static inline int ext4_issue_discard(struct super_block *sb,
>> 		return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
>> }
>>
>> -static void ext4_free_data_in_buddy(struct super_block *sb,
>> -				    struct ext4_free_data *entry)
>> +static void ext4_discard_work(struct work_struct *work)
>> {
>> +	struct ext4_sb_info *sbi = container_of(work,
>> +			struct ext4_sb_info, s_discard_work);
>> +	struct super_block *sb = sbi->s_sb;
>> +	ext4_group_t ngroups = ext4_get_groups_count(sb);
>> +	struct ext4_group_info *grp;
>> +	struct ext4_free_data *fd, *nfd;
>> 	struct ext4_buddy e4b;
>> -	struct ext4_group_info *db;
>> -	int err, count = 0, count2 = 0;
>> +	int i, err;
>> +
>> +	for (i = 0; i < ngroups; i++) {
>> +		grp = ext4_get_group_info(sb, i);
>> +		if (RB_EMPTY_ROOT(&grp->bb_discard_root))
>> +			continue;
> 
> For large filesystems there may be millions of block groups, so it
> seems inefficient to scan all of the groups each time the work queue

Yes, it does seem inefficient. When I wrote the patch, I assumed that since
the kworker runs in the background, scanning all of the groups would not be
a big deal.

> is run.  Having a list of block group numbers, or bitmap/rbtree/xarray
> of the group numbers that need to be trimmed may be more efficient?

Maybe we can use a bitmap to record the block groups that need to be trimmed.
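
Roughly (untested sketch; s_discard_bitmap would be a new per-sb bitmap
that does not exist in this series yet), the producer side in
ext4_free_data_in_buddy() would become:

	if (test_opt(sb, DISCARD)) {
		ext4_insert_free_data(&db->bb_discard_root, fd);
		/* remember this group has pending discard work */
		set_bit(group, EXT4_SB(sb)->s_discard_bitmap);
	} else {
		kmem_cache_free(ext4_free_data_cachep, fd);
	}

and the worker would only walk the groups whose bit is set.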

Best regards
Jianchao

> 
> Most of the complexity in the rest of the patch goes away if the trim
> tracking is only done on a whole-group basis (min/max or just a single
> bit per group).
> 
> Cheers, Andreas
> 
>> -	mb_debug(sb, "gonna free %u blocks in group %u (0x%p):",

> 
> 
> 
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH V2 4/7] ext4: add new helper interface ext4_insert_free_data
  2021-05-27 20:09         ` Andreas Dilger
@ 2021-05-28  3:40           ` Wang Jianchao
  0 siblings, 0 replies; 21+ messages in thread
From: Wang Jianchao @ 2021-05-28  3:40 UTC (permalink / raw)
  To: Andreas Dilger
  Cc: Theodore Ts'o, Ext4 Developers List, lishujin, linux-fsdevel



On 2021/5/28 4:09 AM, Andreas Dilger wrote:
> On May 26, 2021, at 2:43 AM, Wang Jianchao <jianchao.wan9@gmail.com> wrote:
>>
>> Split the codes that inserts and merges ext4_free_data structures
>> into a new interface ext4_insert_free_data. This is preparing for
>> following async background discard.
> 
> Thank you for your patch series.  I think this is an important area to
> improve, since the current "-o discard" option adds too much overhead
> to be really usable in practice.

Yes, indeed.
Discard helps return unused space back to the storage cluster.
But issuing discard after every commit can be a disaster:
 - the jbd2 commit kthread can sometimes be blocked for a long time, and
   then all of the metadata-modifying operations are blocked because
   there is no log space
 - the flood of discards can saturate the storage backend, so the real
   write operations are blocked, especially the jbd2 log writes

Even on a system running with this patch, we can still observe the log
write IO being blocked by discard T_T...

> 
> One problem with tracking the fine-grained freed extents and then using
> them directly to submit TRIM requests is that the underlying device may
> ignore TRIM requests that are too small.  Submitting the TRIM right
> after each transaction commit does not allow much time for freed blocks
> to be aggregated (e.g. "rm -r" of a big directory tree), so it would be
> better to delay TRIM requests until more freed extents can be merged.
> Since most users only run fstrim once a day or every few days, it makes
> sense to allow time to merge freed space (tunable, maybe 5-15 minutes).
> 
> However, tracking the rbtree for each group may be quite a lot of overhead
> if this is kept in memory for minutes or hours, so minimizing the memory
> usage to track freed extents is also important.
> 
> We discussed on the ext4 developer call today whether it is necessary
> to track the fine-grained free extents in memory, or if it would be
> better to only track min/max freed blocks within each group?  Depending
> on the fragmentation of the free blocks in the group, it may be enough
> to just store a single bit in each group (as is done today), and only
> clear this when there are blocks freed in the group.
> 
> Either way, the improvement would be that the kernel is scheduling
> groups to be trimmed, and submitting TRIM requests at a much larger size,
> instead of depending on userspace to run fstrim.  This also allows the
> fstrim scheduler to decide when the device is less busy and submit more
> TRIM requests, and back off when the device is busy.

Scheduling a background trim task in the kernel when the storage is not
busy, and picking a block group that has enough free blocks to make the
trim worthwhile, sounds fair.
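
A rough sketch of that direction (untested; s_discard_dwork and
s_discard_delay_ms are made-up names, the series currently only has
s_discard_work) would be to defer the work instead of queueing it right
after the commit:

	/* in ext4_process_freed_data() */
	if (nr && test_opt(sb, DISCARD))
		queue_delayed_work(ext4_discard_wq, &sbi->s_discard_dwork,
				   msecs_to_jiffies(sbi->s_discard_delay_ms));

with s_discard_dwork declared as a struct delayed_work in ext4_sb_info and
s_discard_delay_ms exposed as a tunable, so freed extents from several
commits can merge in bb_discard_root before the trim actually runs.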

> 
> The other potential improvement is to track the TRIMMED state persistently
> in the block groups, so that unmount/remount doesn't result in every group
> being trimmed again.  It would be good to refresh and include patches from:
> 
> "ext4: introduce EXT4_BG_WAS_TRIMMED to optimize trim"
> https://patchwork.ozlabs.org/project/linux-ext4/list/?series=184981
> 
> and
> 
> e2fsprogs: add EXT2_FLAG_BG_WAS_TRIMMED to optimize fstrim
> https://patchwork.ozlabs.org/project/linux-ext4/list/?series=179639
> 
> along with this series.
> 

Yes, thanks a million

Best regards
Jianchao

>> Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com>

> 
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH V2 7/7] ext4: get discard out of jbd2 commit kthread contex
  2021-05-26  8:44           ` [PATCH V2 7/7] ext4: get discard out of jbd2 commit kthread contex Wang Jianchao
  2021-05-27 20:18             ` Andreas Dilger
@ 2021-06-22  0:55             ` Josh Triplett
  2023-09-06  0:11             ` Sarthak Kukreti
  2 siblings, 0 replies; 21+ messages in thread
From: Josh Triplett @ 2021-06-22  0:55 UTC (permalink / raw)
  To: Wang Jianchao
  Cc: Theodore Ts'o, Andreas Dilger, linux-ext4, linux-kernel, lishujin

On Wed, May 26, 2021 at 04:44:06PM +0800, Wang Jianchao wrote:
> Right now, discard is issued and waited to be completed in jbd2
> commit kthread context after the logs are committed. When large
> amount of files are deleted and discard is flooding, jbd2 commit
> kthread can be blocked for long time. Then all of the metadata
> operations can be blocked to wait the log space.
> 
> One case is the page fault path with read mm->mmap_sem held, which
> wants to update the file time but has to wait for the log space.
> When other threads in the task wants to do mmap, then write mmap_sem
> is blocked. Finally all of the following read mmap_sem requirements
> are blocked, even the ps command which need to read the /proc/pid/
> -cmdline. Our monitor service which needs to read /proc/pid/cmdline
> used to be blocked for 5 mins.
> 
> This patch frees the blocks back to buddy after commit and then do
> discard in a async kworker context in fstrim fashion, namely,
>  - mark blocks to be discarded as used if they have not been allocated
>  - do discard
>  - mark them free
> After this, jbd2 commit kthread won't be blocked any more by discard
> and we won't get NOSPC even if the discard is slow or throttled.
> 
> Link: https://marc.info/?l=linux-kernel&m=162143690731901&w=2
> Suggested-by: Theodore Ts'o <tytso@mit.edu>
> Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com>

I tested this patch series atop 5.13.0-rc7 (which took a bit of
rebasing), and it works quite well. I tested ext4 on dm-crypt on a
laptop NVME drive, using this test case:

$ mkdir testdir
$ time python3 -c 'for i in range(1000000): open(f"testdir/{i}", "wb").write(b"test data")'
$ time rm -r testdir

I tried this with and without discard enabled.

Without this patch, if discard is enabled, the rm takes many minutes,
and stalls out the rest of the system.

With this patch, the rm takes the same amount of time (~17s) whether
discard is enabled or disabled, and the rest of the system continues to
function.

Tested-by: Josh Triplett <josh@joshtriplett.org>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH V2 5/7] ext4: get buddy cache after insert successfully
  2021-05-26  8:43         ` [PATCH V2 5/7] ext4: get buddy cache after insert successfully Wang Jianchao
@ 2021-06-23  3:06           ` Theodore Ts'o
  0 siblings, 0 replies; 21+ messages in thread
From: Theodore Ts'o @ 2021-06-23  3:06 UTC (permalink / raw)
  To: Wang Jianchao; +Cc: Andreas Dilger, linux-ext4, linux-kernel, lishujin

On Wed, May 26, 2021 at 04:43:43PM +0800, Wang Jianchao wrote:
> The getting of bd_bitmap_page and bd_buddy_page should be done
> after insert the ext4_free_data successfully. Otherwise, nobody
> can put them.
> 
> Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com>

This looks like it's a bug fix for patch 4/7 --- is that correct?  If
so, maybe we should fold this into the previous patch?

Thanks,

						- Ted

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH V2 1/7] ext4: remove the 'group' parameter of ext4_trim_extent
  2021-05-26  8:42 ` [PATCH V2 1/7] ext4: remove the 'group' parameter of ext4_trim_extent Wang Jianchao
  2021-05-27 19:47   ` Andreas Dilger
@ 2021-06-23 13:13   ` Theodore Ts'o
  2021-06-23 19:49     ` Theodore Ts'o
  1 sibling, 1 reply; 21+ messages in thread
From: Theodore Ts'o @ 2021-06-23 13:13 UTC (permalink / raw)
  To: Wang Jianchao; +Cc: Andreas Dilger, linux-ext4, linux-kernel, lishujin

Hi Jianchao, 

FYI, this patch series has conflicts with these patches, which landed
in 5.13-rc1:

196e402adf2e - ext4: improve cr 0 / cr 1 group scanning
4b68f6df1059 - ext4: add MB_NUM_ORDERS macro
a6c75eaf1103 - ext4: add mballoc stats proc file
67d251860461 - ext4: drop s_mb_bal_lock and convert protected fields to atomic

The conflicts were relatively minor, but the obvious fix-ups resulted
in a large number of crashes caused by various stress tests, such as
generic/068 and generic/204.  I'm currently investigating to see what
I might have messed up when I tried applying these patches, as well as
running your patch set applied against 5.12 to make sure the problems
weren't introduced by the patch set itself.

					- Ted

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH V2 1/7] ext4: remove the 'group' parameter of ext4_trim_extent
  2021-06-23 13:13   ` Theodore Ts'o
@ 2021-06-23 19:49     ` Theodore Ts'o
  2021-07-02 10:27       ` Josh Triplett
  2021-07-05 11:28       ` Wang Jianchao
  0 siblings, 2 replies; 21+ messages in thread
From: Theodore Ts'o @ 2021-06-23 19:49 UTC (permalink / raw)
  To: Wang Jianchao
  Cc: Andreas Dilger, linux-ext4, linux-kernel, lishujin, Josh Triplett

CC'ed Josh since he was testing this patchset.

On Wed, Jun 23, 2021 at 09:13:42AM -0400, Theodore Ts'o wrote:
> 
> The conflicts were relatively minor, but the obvious fix-ups resulted
> in a large number of crashes caused by various stress tests, such as
> generic/068 and generic/204.  I'm currently investigating to see what
> I might have messed up when I tried applying these patches, as well as
> running your patch set applied against 5.12 to make sure the problems
> weren't introduced by the patch set itself.

I applied this patch set on top of 5.12, where it applied w/o any
patch conflicts, and I'm still seeing large number of crashes[1].  The
crashes are in a wide variety of tests, and many of the stack traces
involve the ext4_discard_work workqueue.

[1] I ran the test via "gce-xfstests ltm -c ext4/all -g auto --kernel
gs://<bucket>/kernel.deb", but almost all of the file system configs
were crashing.  So "kvm-xfstests -c ext4/4k -g auto" should be
sufficient to reproduce the problem.  kvm-xfstests can be found at
https://github.com/tytso/xfstests-bld.

I don't have time to debug this more closely right now, but my rough
guess is that the workqueue isn't properly getting shut down during a
file system unmount, and this is causing all sorts of mischief.
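
If that guess is right, one possible shape of the fix (just a sketch, I
haven't tried it against this series) would be to drain the work in
ext4_put_super() before the group info that the worker walks is torn down:

	/* in ext4_put_super(), before ext4_mb_release() */
	if (test_opt(sb, DISCARD))
		flush_work(&sbi->s_discard_work);

or cancel_work_sync() plus freeing the queued ext4_free_data entries, if
the pending discards should simply be dropped on unmount.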

       	  	     	      	 	     - Ted


Jun 23 09:04:46 debian kernel: [    0.000000] Linux version 5.12.0-xfstests-00007-g1d72c24b1506 (tytso@cwcc) (gcc (Debian 10.2.1-6) 10.2.1 20210110, GNU ld (GNU Binutils for Debian) 2.35.2) #188 SMP Wed Jun 23 08:59:01 EDT 2021
Jun 23 09:04:46 debian kernel: [    0.000000] Command line: BOOT_IMAGE=/boot/vmlinuz-5.12.0-xfstests-00007-g1d72c24b1506 root=UUID=d7961c9f-8636-4194-bcb7-b7861b5a470f ro console=ttyS0,115200 earlyprintk=ttyS0,115200 elevator=noop console=ttyS0 net.ifnames=0 biosdevname=0 mem=7680M fstestcfg=ext4/4k fstestset=-g,auto fstestexc= fstestopt=aex fstesttyp=ext4 fstestapi=1.5 fsteststr= nfssrv= orig_cmdline=LS1pbnN0YW5jZS1uYW1lIHhmc3Rlc3RzLWx0bS0yMDIxMDYyMzA5MDI0Ny1hYSAtLWdjZS16b25lIHVzLXdlc3QxLWEgLS1ncy1idWNrZXQgZ2NlLXhmc3Rlc3RzIC0taW1hZ2UtcHJvamVjdCBnY2UteGZzdGVzdHMgLS1rZXJuZWwgZ3M6Ly9nY2UteGZzdGVzdHMva2VybmVsLmRlYiAtLWJ1Y2tldC1zdWJkaXIgcmVzdWx0cyAtLW5vLWVtYWlsIC1jIGV4dDQvNGsgZnVsbA==
	...
[  954.782827] run fstests generic/041 at 2021-06-23 09:20:30
Jun 23 09:20:30 xfstests-ltm-20210623090247-aa systemd[1]: Started /usr/bin/bash -c test -w /proc/self/oom_score_adj && echo 250 > /proc/self/oom_score_adj; exec ./tests/generic/041.
Jun 23 09:20:31 xfstests-ltm-20210623090247-aa kernel: [  955.288215] EXT4-fs (dm-0): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
Jun 23 09:20:31 xfstests-ltm-20210623090247-aa kernel: [  955.879898] EXT4-fs (dm-2): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
Jun 23 09:20:50 xfstests-ltm-20210623090247-aa systemd[1]: xt\x2dvdc.mount: Succeeded.
Jun 23 09:20:50 xfstests-ltm-20210623090247-aa systemd[921]: xt\x2dvdc.mount: Succeeded.
Jun 23 09:20:51 xfstests-ltm-20210623090247-aa kernel: [  975.090048] EXT4-fs (dm-2): recovery complete
Jun 23 09:20:51 xfstests-ltm-20210623090247-aa kernel: [  975.090776] EXT4-fs (dm-2): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
Jun 23 09:21:01 xfstests-ltm-20210623090247-aa systemd[1]: xt\x2dvdc.mount: Succeeded.
Jun 23 09:21:01 xfstests-ltm-20210623090247-aa systemd[921]: xt\x2dvdc.mount: Succeeded.
[  985.883114] BUG: kernel NULL pointer dereference, address: 0000000000000040
[  985.890856] #PF: supervisor read access in kernel mode
[  985.896241] #PF: error_code(0x0000) - not-present page
[  985.901665] PGD 0 P4D 0 
[  985.904648] Oops: 0000 [#1] SMP PTI
[  985.908265] CPU: 0 PID: 5738 Comm: kworker/u4:2 Not tainted 5.12.0-xfstests-00007-g1d72c24b1506 #188
[  985.917880] Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
[  985.927582] Workqueue: ext4discard ext4_discard_work
[  985.932662] RIP: 0010:ext4_discard_work+0x31/0x3f0
[  985.937653] Code: 41 56 41 55 41 54 55 53 48 83 ec 50 48 8b af 88 fe ff ff 65 48 8b 04 25 28 00 00 00 48 89 44 24 48 31 c0 48 8b 85 e8 06 00 00 <8b> 50 40 85 d2 48 89 14 24 0f 84 82 02 00 00 31 db eb 2c 41 89 d9
[  985.956879] RSP: 0018:ffffa527052c7dd8 EFLAGS: 00010246
[  985.962235] RAX: 0000000000000000 RBX: ffff8db25dd98000 RCX: 0000000000000001
[  985.969496] RDX: 0000000000000000 RSI: ffffffffa0adde80 RDI: ffff8db25fff3510
[  985.976997] RBP: ffff8db25fc2f000 R08: 0000000000000000 R09: 0000000000040479
[  985.985066] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8db24011b800
[  985.992332] R13: ffff8db240822500 R14: 0000000000000000 R15: 0000000000000000
[  985.999966] FS:  0000000000000000(0000) GS:ffff8db319200000(0000) knlGS:0000000000000000
[  986.008242] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  986.014229] CR2: 0000000000000040 CR3: 000000002c212006 CR4: 00000000003706f0
[  986.021660] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  986.029742] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  986.037346] Call Trace:
[  986.039958]  ? lock_acquire.part.0+0x5c/0x120
[  986.044430]  ? process_one_work+0x1fb/0x590
[  986.048844]  ? lock_is_held_type+0x98/0x100
[  986.053742]  process_one_work+0x27b/0x590
[  986.058146]  worker_thread+0x4e/0x300
[  986.061953]  ? process_one_work+0x590/0x590
[  986.066255]  kthread+0x13a/0x150
[  986.069770]  ? __kthread_bind_mask+0x60/0x60
[  986.074420]  ret_from_fork+0x22/0x30
[  986.078269] CR2: 0000000000000040
[  986.081703] ---[ end trace 01e941885177e812 ]---
[  986.086543] RIP: 0010:ext4_discard_work+0x31/0x3f0
[  986.091558] Code: 41 56 41 55 41 54 55 53 48 83 ec 50 48 8b af 88 fe ff ff 65 48 8b 04 25 28 00 00 00 48 89 44 24 48 31 c0 48 8b 85 e8 06 00 00 <8b> 50 40 85 d2 48 89 14 24 0f 84 82 02 00 00 31 db eb 2c 41 89 d9
[  986.110810] RSP: 0018:ffffa527052c7dd8 EFLAGS: 00010246
[  986.116344] RAX: 0000000000000000 RBX: ffff8db25dd98000 RCX: 0000000000000001
[  986.123850] RDX: 0000000000000000 RSI: ffffffffa0adde80 RDI: ffff8db25fff3510
[  986.131096] RBP: ffff8db25fc2f000 R08: 0000000000000000 R09: 0000000000040479
[  986.138443] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8db24011b800
[  986.145968] R13: ffff8db240822500 R14: 0000000000000000 R15: 0000000000000000
[  986.153412] FS:  0000000000000000(0000) GS:ffff8db319200000(0000) knlGS:0000000000000000
[  986.161707] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  986.167563] CR2: 0000000000000040 CR3: 000000002c212006 CR4: 00000000003706f0
[  986.175089] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  986.182509] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  986.190000] BUG: sleeping function called from invalid context at include/linux/percpu-rwsem.h:49
[  986.199449] in_atomic(): 0, irqs_disabled(): 1, non_block: 0, pid: 5738, name: kworker/u4:2
[  986.208016] INFO: lockdep is turned off.
[  986.212055] irq event stamp: 559784
[  986.215784] hardirqs last  enabled at (559783): [<ffffffff9ff78ec4>] _raw_spin_unlock_irq+0x24/0x30
[  986.225129] hardirqs last disabled at (559784): [<ffffffff9ff69405>] exc_page_fault+0x35/0x240
[  986.233856] softirqs last  enabled at (559776): [<ffffffff9f50b8cd>] wb_workfn+0x18d/0x350
[  986.242258] softirqs last disabled at (559772): [<ffffffff9f45cd8e>] wb_wakeup_delayed+0x2e/0x70
[  986.251281] CPU: 0 PID: 5738 Comm: kworker/u4:2 Tainted: G      D           5.12.0-xfstests-00007-g1d72c24b1506 #188
[  986.261926] Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
[  986.271275] Workqueue: ext4discard ext4_discard_work
[  986.276382] Call Trace:
[  986.279012]  dump_stack+0x6d/0x89
[  986.283240]  ___might_sleep.cold+0xa6/0xb6
[  986.287489]  exit_signals+0x1c/0x2c0
[  986.291417]  do_exit+0xa6/0x3c0
[  986.294672]  rewind_stack_do_exit+0x17/0x20
[  986.299092] RIP: 0000:0x0
[  986.301940] Code: Unable to access opcode bytes at RIP 0xffffffffffffffd6.
[  986.309322] RSP: 0000:0000000000000000 EFLAGS: 00000000 ORIG_RAX: 0000000000000000
[  986.317124] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
[  986.324586] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
[  986.331945] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
[  986.339473] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
[  986.346718] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.883114] BUG: kernel NULL pointer dereference, address: 0000000000000040
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.890856] #PF: supervisor read access in kernel mode
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.896241] #PF: error_code(0x0000) - not-present page
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.901665] PGD 0 P4D 0 
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.904648] Oops: 0000 [#1] SMP PTI
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.908265] CPU: 0 PID: 5738 Comm: kworker/u4:2 Not tainted 5.12.0-xfstests-00007-g1d72c24b1506 #188
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.917880] Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.927582] Workqueue: ext4discard ext4_discard_work
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.932662] RIP: 0010:ext4_discard_work+0x31/0x3f0
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.937653] Code: 41 56 41 55 41 54 55 53 48 83 ec 50 48 8b af 88 fe ff ff 65 48 8b 04 25 28 00 00 00 48 89 44 24 48 31 c0 48 8b 85 e8 06 00 00 <8b> 50 40 85 d2 48 89 14 24 0f 84 82 02 00 00 31 db eb 2c 41 89 d9
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.956879] RSP: 0018:ffffa527052c7dd8 EFLAGS: 00010246
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.962235] RAX: 0000000000000000 RBX: ffff8db25dd98000 RCX: 0000000000000001
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.969496] RDX: 0000000000000000 RSI: ffffffffa0adde80 RDI: ffff8db25fff3510
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.976997] RBP: ffff8db25fc2f000 R08: 0000000000000000 R09: 0000000000040479
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.985066] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8db24011b800
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.992332] R13: ffff8db240822500 R14: 0000000000000000 R15: 0000000000000000
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.999966] FS:  0000000000000000(0000) GS:ffff8db319200000(0000) knlGS:0000000000000000
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.008242] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.014229] CR2: 0000000000000040 CR3: 000000002c212006 CR4: 00000000003706f0
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.021660] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.029742] DR3: 0000000000000000 DR6: 0000[  986.603186] run fstests generic/043 at 2021-06-23 09:21:02
0000fffe0ff0 DR7: 0000000000000400
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.037346] Call Trace:
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.039958]  ? lock_acquire.part.0+0x5c/0x120
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.044430]  ? process_one_work+0x1fb/0x590
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.048844]  ? lock_is_held_type+0x98/0x100
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.053742]  process_one_work+0x27b/0x590
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.058146]  worker_thread+0x4e/0x300
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.061953]  ? process_one_work+0x590/0x590
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.066255]  kthread+0x13a/0x150
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.069770]  ? __kthread_bind_mask+0x60/0x60
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.074420]  ret_from_fork+0x22/0x30
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.078269] CR2: 0000000000000040
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.081703] ---[ end trace 01e941885177e812 ]---
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.086543] RIP: 0010:ext4_discard_work+0x31/0x3f0
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.091558] Code: 41 56 41 55 41 54 55 53 48 83 ec 50 48 8b af 88 fe ff ff 65 48 8b 04 25 28 00 00 00 48 89 44 24 48 31 c0 48 8b 85 e8 06 00 00 <8b> 50 40 85 d2 48 89 14 24 0f 84 82 02 00 00 31 db eb 2c 41 89 d9
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.110810] RSP: 0018:ffffa527052c7dd8 EFLAGS: 00010246
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.116344] RAX: 0000000000000000 RBX: ffff8db25dd98000 RCX: 0000000000000001
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.123850] RDX: 0000000000000000 RSI: ffffffffa0adde80 RDI: ffff8db25fff3510
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.131096] RBP: ffff8db25fc2f000 R08: 0000000000000000 R09: 0000000000040479
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.138443] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8db24011b800
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.145968] R13: ffff8db240822500 R14: 0000000000000000 R15: 0000000000000000
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.153412] FS:  0000000000000000(0000) GS:ffff8db319200000(0000) knlGS:0000000000000000
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.161707] C[  986.843267] EXT4-fs (dm-0): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
S:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.167563] CR2: 0000000000000040 CR3: 000000002c212006 CR4: 00000000003706f0
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.175089] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.182509] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.190000] BUG: sleeping function called from invalid context at include/linux/percpu-rwsem.h:49
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.199449] in_atomic(): 0, irqs_disabled(): 1, non_block: 0, pid: 5738, name: kworker/u4:2
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.208016] INFO: lockdep is turned off.
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.212055] irq event stamp: 559784
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.215784] hardirqs last  enabled at (559783): [<ffffffff9ff78ec4>] _raw_spin_unlock_irq+0x24/0x30
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.225129] hardirqs last disabled at (559784): [<ffffffff9ff69405>] exc_page_fault+0x35/0x240
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.233856] softirqs last  enabled at (559776): [<ffffffff9f50b8cd>] wb_workfn+0x18d/0x350
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.242258] softirqs last disabled at (559772): [<ffffffff9f45cd8e>] wb_wakeup_delayed+0x2e/0x70
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.251281] CPU: 0 PID: 5738 Comm: kworker/u4:2 Tainted: G      D           5.12.0-xfstests-00007-g1d72c24b1506 #188
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.261926] Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.271275] Workqueue: ext4discard ext4_discard_work
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.276382] Call Trace:
Jun 23 09:21:02 xfstests-ltm-20210623090247-a[  987.049261] EXT4-fs (dm-1): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
a kernel: [  986.279012]  dump_stack+0x6d/0x89
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.283240]  ___might_sleep.cold+0xa6/0xb6
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.287489]  exit_signals+0x1c/0x2c0
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.291417]  do_exit+0xa6/0x3c0
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.294672]  rewind_stack_do_exit+0x17/0x20
Jun 23 0[  987.102134] EXT4-fs (dm-1): shut down requested (1)
9:21:02 xfstests[  987.107927] Aborting journal on device dm-1-8.
-ltm-20210623090247-aa kernel: [  986.299092] RIP: 0000:0x0
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.301940] Code: Unable to access opcode bytes at RIP 0xffffffffffffffd6.
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.309322] RSP: 0000:0000000000000000 EFLAGS: 00000000 ORIG_RAX: 0000000000000000
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.317124] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.324586] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.331945] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.339473] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.346718] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa systemd[1]: fstests-generic-041.scope: Succeeded.
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa systemd[921]: xt\x2dvdb.mount: Succeeded.
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa systemd[1]: xt\x2dvdb.mount: Succeeded.
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa systemd[1]: Started /usr/bin/bash -c test -w /proc/self/oom_score_adj && echo 250 > /proc/self/oom_score_adj; exec ./tests/generic/043.
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.843267] EXT4-fs (dm-0): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
Jun 23 09:21:03 xfstests-ltm-20210623090247-aa kernel: [  987.049261] EXT4-fs (dm-1): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
Jun 23 09:21:03 xfstests-ltm-20210623090247-aa kernel: [  987.102134] EXT4-fs (dm-1)[  987.284882] EXT4-fs (dm-1): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
: shut down requested (1)
Jun 23 09:21:03 xfstests-ltm-20210623090247-aa kernel: [  987.107927] Aborting journal on device dm-1-8.
Jun 23 09:21:03 xfstests-ltm-20210623090247-aa systemd[1]: xt\x2dvdc.mount: Succeeded.
Jun 23 09:21:03 xfstests-ltm-20210623090247-aa systemd[921]: xt\x2dvdc.mount: Succeeded.
Jun 23 09:21:03 xfstests-ltm-20210623090247-aa kernel: [  987.284882] EXT4-fs (dm-1): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
[  999.135257] EXT4-fs (dm-1): shut down requested (2)
[  999.140764] Aborting journal on device dm-1-8.
Jun 23 09:21:15 xfstests-ltm-20210623090247-aa kernel: [  999.135257] EXT4-fs (dm-1): shut down requested (2)
Jun 23 09:21:15 xfstests-ltm-20210623090247-aa kernel: [  999.140764] Aborting journal on device dm-1-8.
Jun 23 09:21:15 xfstests-ltm-20210623090247-aa systemd[1]: xt\x2dvdc.mount: Succeeded.
Jun 23 09:21:15 xfstests-ltm-20210623090247-aa systemd[921]: xt\x2dvdc.mount: Succeeded.
[  999.206122] EXT4-fs (dm-1): recovery complete

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH V2 1/7] ext4: remove the 'group' parameter of ext4_trim_extent
  2021-06-23 19:49     ` Theodore Ts'o
@ 2021-07-02 10:27       ` Josh Triplett
  2021-07-05 11:28       ` Wang Jianchao
  1 sibling, 0 replies; 21+ messages in thread
From: Josh Triplett @ 2021-07-02 10:27 UTC (permalink / raw)
  To: Theodore Ts'o
  Cc: Wang Jianchao, Andreas Dilger, linux-ext4, linux-kernel, lishujin

On Wed, Jun 23, 2021 at 03:49:52PM -0400, Theodore Ts'o wrote:
> CC'ed Josh since he was testing this patchset.
> 
> On Wed, Jun 23, 2021 at 09:13:42AM -0400, Theodore Ts'o wrote:
> > 
> > The conflicts were relatively minor, but the obvious fix-ups resulted
> > in a large number of crashes caused by various stress tests, such as
> > generic/068 and generic/204.  I'm currently investigating to see what
> > I might have messed up when I tried applying these patches, as well as
> > running your patch set applied against 5.12 to make sure the problems
> > weren't introduced by the patch set itself.
> 
> I applied this patch set on top of 5.12, where it applied w/o any
> patch conflicts, and I'm still seeing large number of crashes[1].  The
> crashes are in a wide variety of tests, and many of the stack traces
> involve the ext4_discard_work workqueue.

I applied this on top of a96bfed64c8986d6404e553f18203cae1f5ac7e6, which
required some fixups. I didn't try to run any kind of intensive
stress-tests, but I've run the deletion performance test I mentioned, as
well as various software compiles and other miscellaneous work. I
haven't run into any apparent issues.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH V2 1/7] ext4: remove the 'group' parameter of ext4_trim_extent
  2021-06-23 19:49     ` Theodore Ts'o
  2021-07-02 10:27       ` Josh Triplett
@ 2021-07-05 11:28       ` Wang Jianchao
  1 sibling, 0 replies; 21+ messages in thread
From: Wang Jianchao @ 2021-07-05 11:28 UTC (permalink / raw)
  To: Theodore Ts'o
  Cc: Andreas Dilger, linux-ext4, linux-kernel, lishujin, Josh Triplett

Hi Ted

Thanks so much for your help.
I've been busy with some other work recently, so I'm sorry for missing the mails.
I will debug and rework this patch set when the project at hand is finished.

Best regards
Jianchao

On 2021/6/24 3:49 AM, Theodore Ts'o wrote:
> CC'ed Josh since he was testing this patchset.
> 
> On Wed, Jun 23, 2021 at 09:13:42AM -0400, Theodore Ts'o wrote:
>>
>> The conflicts were relatively minor, but the obvious fix-ups resulted
>> in a large number of crashes caused by various stress tests, such as
>> generic/068 and generic/204.  I'm currently investigating to see what
>> I might have messed up when I tried applying these patches, as well as
>> running your patch set applied against 5.12 to make sure the problems
>> weren't introduced by the patch set itself.
> 
> I applied this patch set on top of 5.12, where it applied w/o any
> patch conflicts, and I'm still seeing large number of crashes[1].  The
> crashes are in a wide variety of tests, and many of the stack traces
> involve the ext4_discard_work workqueue.
> 
> [1] I ran the test via "gce-xfstests ltm -c ext4/all -g auto --kernel
> gs://<bucket>/kernel.deb", but almost all of the file system configs
> were crashing.  So "kvm-xfstests -c ext4/4k -g auto" should be
> sufficient to reproduce the problem.  kvm-xfstests can be found at
> https://github.com/tytso/xfstests-bld.
> 
> I don't have time to debug this more closely right now, but at a rough
> guess is that workqueue isn't properly getting shutdown during a file
> system unmount, and this is causing all sorts of mischief.
> 
>        	  	     	      	 	     - Ted
> 
> 
> Jun 23 09:04:46 debian kernel: [    0.000000] Linux version 5.12.0-xfstests-00007-g1d72c24b1506 (tytso@cwcc) (gcc (Debian 10.2.1-6) 10.2.1 20210110, GNU ld (GNU Binutils for Debian) 2.35.2) #188 SMP Wed Jun 23 08:59:01 EDT 2021
> Jun 23 09:04:46 debian kernel: [    0.000000] Command line: BOOT_IMAGE=/boot/vmlinuz-5.12.0-xfstests-00007-g1d72c24b1506 root=UUID=d7961c9f-8636-4194-bcb7-b7861b5a470f ro console=ttyS0,115200 earlyprintk=ttyS0,115200 elevator=noop console=ttyS0 net.ifnames=0 biosdevname=0 mem=7680M fstestcfg=ext4/4k fstestset=-g,auto fstestexc= fstestopt=aex fstesttyp=ext4 fstestapi=1.5 fsteststr= nfssrv= orig_cmdline=LS1pbnN0YW5jZS1uYW1lIHhmc3Rlc3RzLWx0bS0yMDIxMDYyMzA5MDI0Ny1hYSAtLWdjZS16b25lIHVzLXdlc3QxLWEgLS1ncy1idWNrZXQgZ2NlLXhmc3Rlc3RzIC0taW1hZ2UtcHJvamVjdCBnY2UteGZzdGVzdHMgLS1rZXJuZWwgZ3M6Ly9nY2UteGZzdGVzdHMva2VybmVsLmRlYiAtLWJ1Y2tldC1zdWJkaXIgcmVzdWx0cyAtLW5vLWVtYWlsIC1jIGV4dDQvNGsgZnVsbA==
> 	...
> [  954.782827] run fstests generic/041 at 2021-06-23 09:20:30
> Jun 23 09:20:30 xfstests-ltm-20210623090247-aa systemd[1]: Started /usr/bin/bash -c test -w /proc/self/oom_score_adj && echo 250 > /proc/self/oom_score_adj; exec ./tests/generic/041.
> Jun 23 09:20:31 xfstests-ltm-20210623090247-aa kernel: [  955.288215] EXT4-fs (dm-0): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
> Jun 23 09:20:31 xfstests-ltm-20210623090247-aa kernel: [  955.879898] EXT4-fs (dm-2): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
> Jun 23 09:20:50 xfstests-ltm-20210623090247-aa systemd[1]: xt\x2dvdc.mount: Succeeded.
> Jun 23 09:20:50 xfstests-ltm-20210623090247-aa systemd[921]: xt\x2dvdc.mount: Succeeded.
> Jun 23 09:20:51 xfstests-ltm-20210623090247-aa kernel: [  975.090048] EXT4-fs (dm-2): recovery complete
> Jun 23 09:20:51 xfstests-ltm-20210623090247-aa kernel: [  975.090776] EXT4-fs (dm-2): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
> Jun 23 09:21:01 xfstests-ltm-20210623090247-aa systemd[1]: xt\x2dvdc.mount: Succeeded.
> Jun 23 09:21:01 xfstests-ltm-20210623090247-aa systemd[921]: xt\x2dvdc.mount: Succeeded.
> [  985.883114] BUG: kernel NULL pointer dereference, address: 0000000000000040
> [  985.890856] #PF: supervisor read access in kernel mode
> [  985.896241] #PF: error_code(0x0000) - not-present page
> [  985.901665] PGD 0 P4D 0 
> [  985.904648] Oops: 0000 [#1] SMP PTI
> [  985.908265] CPU: 0 PID: 5738 Comm: kworker/u4:2 Not tainted 5.12.0-xfstests-00007-g1d72c24b1506 #188
> [  985.917880] Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
> [  985.927582] Workqueue: ext4discard ext4_discard_work
> [  985.932662] RIP: 0010:ext4_discard_work+0x31/0x3f0
> [  985.937653] Code: 41 56 41 55 41 54 55 53 48 83 ec 50 48 8b af 88 fe ff ff 65 48 8b 04 25 28 00 00 00 48 89 44 24 48 31 c0 48 8b 85 e8 06 00 00 <8b> 50 40 85 d2 48 89 14 24 0f 84 82 02 00 00 31 db eb 2c 41 89 d9
> [  985.956879] RSP: 0018:ffffa527052c7dd8 EFLAGS: 00010246
> [  985.962235] RAX: 0000000000000000 RBX: ffff8db25dd98000 RCX: 0000000000000001
> [  985.969496] RDX: 0000000000000000 RSI: ffffffffa0adde80 RDI: ffff8db25fff3510
> [  985.976997] RBP: ffff8db25fc2f000 R08: 0000000000000000 R09: 0000000000040479
> [  985.985066] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8db24011b800
> [  985.992332] R13: ffff8db240822500 R14: 0000000000000000 R15: 0000000000000000
> [  985.999966] FS:  0000000000000000(0000) GS:ffff8db319200000(0000) knlGS:0000000000000000
> [  986.008242] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  986.014229] CR2: 0000000000000040 CR3: 000000002c212006 CR4: 00000000003706f0
> [  986.021660] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [  986.029742] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [  986.037346] Call Trace:
> [  986.039958]  ? lock_acquire.part.0+0x5c/0x120
> [  986.044430]  ? process_one_work+0x1fb/0x590
> [  986.048844]  ? lock_is_held_type+0x98/0x100
> [  986.053742]  process_one_work+0x27b/0x590
> [  986.058146]  worker_thread+0x4e/0x300
> [  986.061953]  ? process_one_work+0x590/0x590
> [  986.066255]  kthread+0x13a/0x150
> [  986.069770]  ? __kthread_bind_mask+0x60/0x60
> [  986.074420]  ret_from_fork+0x22/0x30
> [  986.078269] CR2: 0000000000000040
> [  986.081703] ---[ end trace 01e941885177e812 ]---
> [  986.086543] RIP: 0010:ext4_discard_work+0x31/0x3f0
> [  986.091558] Code: 41 56 41 55 41 54 55 53 48 83 ec 50 48 8b af 88 fe ff ff 65 48 8b 04 25 28 00 00 00 48 89 44 24 48 31 c0 48 8b 85 e8 06 00 00 <8b> 50 40 85 d2 48 89 14 24 0f 84 82 02 00 00 31 db eb 2c 41 89 d9
> [  986.110810] RSP: 0018:ffffa527052c7dd8 EFLAGS: 00010246
> [  986.116344] RAX: 0000000000000000 RBX: ffff8db25dd98000 RCX: 0000000000000001
> [  986.123850] RDX: 0000000000000000 RSI: ffffffffa0adde80 RDI: ffff8db25fff3510
> [  986.131096] RBP: ffff8db25fc2f000 R08: 0000000000000000 R09: 0000000000040479
> [  986.138443] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8db24011b800
> [  986.145968] R13: ffff8db240822500 R14: 0000000000000000 R15: 0000000000000000
> [  986.153412] FS:  0000000000000000(0000) GS:ffff8db319200000(0000) knlGS:0000000000000000
> [  986.161707] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  986.167563] CR2: 0000000000000040 CR3: 000000002c212006 CR4: 00000000003706f0
> [  986.175089] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [  986.182509] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [  986.190000] BUG: sleeping function called from invalid context at include/linux/percpu-rwsem.h:49
> [  986.199449] in_atomic(): 0, irqs_disabled(): 1, non_block: 0, pid: 5738, name: kworker/u4:2
> [  986.208016] INFO: lockdep is turned off.
> [  986.212055] irq event stamp: 559784
> [  986.215784] hardirqs last  enabled at (559783): [<ffffffff9ff78ec4>] _raw_spin_unlock_irq+0x24/0x30
> [  986.225129] hardirqs last disabled at (559784): [<ffffffff9ff69405>] exc_page_fault+0x35/0x240
> [  986.233856] softirqs last  enabled at (559776): [<ffffffff9f50b8cd>] wb_workfn+0x18d/0x350
> [  986.242258] softirqs last disabled at (559772): [<ffffffff9f45cd8e>] wb_wakeup_delayed+0x2e/0x70
> [  986.251281] CPU: 0 PID: 5738 Comm: kworker/u4:2 Tainted: G      D           5.12.0-xfstests-00007-g1d72c24b1506 #188
> [  986.261926] Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
> [  986.271275] Workqueue: ext4discard ext4_discard_work
> [  986.276382] Call Trace:
> [  986.279012]  dump_stack+0x6d/0x89
> [  986.283240]  ___might_sleep.cold+0xa6/0xb6
> [  986.287489]  exit_signals+0x1c/0x2c0
> [  986.291417]  do_exit+0xa6/0x3c0
> [  986.294672]  rewind_stack_do_exit+0x17/0x20
> [  986.299092] RIP: 0000:0x0
> [  986.301940] Code: Unable to access opcode bytes at RIP 0xffffffffffffffd6.
> [  986.309322] RSP: 0000:0000000000000000 EFLAGS: 00000000 ORIG_RAX: 0000000000000000
> [  986.317124] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
> [  986.324586] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
> [  986.331945] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
> [  986.339473] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
> [  986.346718] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.883114] BUG: kernel NULL pointer dereference, address: 0000000000000040
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.890856] #PF: supervisor read access in kernel mode
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.896241] #PF: error_code(0x0000) - not-present page
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.901665] PGD 0 P4D 0 
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.904648] Oops: 0000 [#1] SMP PTI
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.908265] CPU: 0 PID: 5738 Comm: kworker/u4:2 Not tainted 5.12.0-xfstests-00007-g1d72c24b1506 #188
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.917880] Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.927582] Workqueue: ext4discard ext4_discard_work
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.932662] RIP: 0010:ext4_discard_work+0x31/0x3f0
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.937653] Code: 41 56 41 55 41 54 55 53 48 83 ec 50 48 8b af 88 fe ff ff 65 48 8b 04 25 28 00 00 00 48 89 44 24 48 31 c0 48 8b 85 e8 06 00 00 <8b> 50 40 85 d2 48 89 14 24 0f 84 82 02 00 00 31 db eb 2c 41 89 d9
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.956879] RSP: 0018:ffffa527052c7dd8 EFLAGS: 00010246
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.962235] RAX: 0000000000000000 RBX: ffff8db25dd98000 RCX: 0000000000000001
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.969496] RDX: 0000000000000000 RSI: ffffffffa0adde80 RDI: ffff8db25fff3510
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.976997] RBP: ffff8db25fc2f000 R08: 0000000000000000 R09: 0000000000040479
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.985066] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8db24011b800
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.992332] R13: ffff8db240822500 R14: 0000000000000000 R15: 0000000000000000
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.999966] FS:  0000000000000000(0000) GS:ffff8db319200000(0000) knlGS:0000000000000000
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.008242] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.014229] CR2: 0000000000000040 CR3: 000000002c212006 CR4: 00000000003706f0
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.021660] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.029742] DR3: 0000000000000000 DR6: 0000[  986.603186] run fstests generic/043 at 2021-06-23 09:21:02
> 0000fffe0ff0 DR7: 0000000000000400
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.037346] Call Trace:
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.039958]  ? lock_acquire.part.0+0x5c/0x120
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.044430]  ? process_one_work+0x1fb/0x590
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.048844]  ? lock_is_held_type+0x98/0x100
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.053742]  process_one_work+0x27b/0x590
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.058146]  worker_thread+0x4e/0x300
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.061953]  ? process_one_work+0x590/0x590
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.066255]  kthread+0x13a/0x150
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.069770]  ? __kthread_bind_mask+0x60/0x60
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.074420]  ret_from_fork+0x22/0x30
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.078269] CR2: 0000000000000040
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.081703] ---[ end trace 01e941885177e812 ]---
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.086543] RIP: 0010:ext4_discard_work+0x31/0x3f0
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.091558] Code: 41 56 41 55 41 54 55 53 48 83 ec 50 48 8b af 88 fe ff ff 65 48 8b 04 25 28 00 00 00 48 89 44 24 48 31 c0 48 8b 85 e8 06 00 00 <8b> 50 40 85 d2 48 89 14 24 0f 84 82 02 00 00 31 db eb 2c 41 89 d9
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.110810] RSP: 0018:ffffa527052c7dd8 EFLAGS: 00010246
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.116344] RAX: 0000000000000000 RBX: ffff8db25dd98000 RCX: 0000000000000001
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.123850] RDX: 0000000000000000 RSI: ffffffffa0adde80 RDI: ffff8db25fff3510
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.131096] RBP: ffff8db25fc2f000 R08: 0000000000000000 R09: 0000000000040479
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.138443] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8db24011b800
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.145968] R13: ffff8db240822500 R14: 0000000000000000 R15: 0000000000000000
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.153412] FS:  0000000000000000(0000) GS:ffff8db319200000(0000) knlGS:0000000000000000
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.161707] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  986.843267] EXT4-fs (dm-0): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.167563] CR2: 0000000000000040 CR3: 000000002c212006 CR4: 00000000003706f0
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.175089] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.182509] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.190000] BUG: sleeping function called from invalid context at include/linux/percpu-rwsem.h:49
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.199449] in_atomic(): 0, irqs_disabled(): 1, non_block: 0, pid: 5738, name: kworker/u4:2
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.208016] INFO: lockdep is turned off.
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.212055] irq event stamp: 559784
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.215784] hardirqs last  enabled at (559783): [<ffffffff9ff78ec4>] _raw_spin_unlock_irq+0x24/0x30
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.225129] hardirqs last disabled at (559784): [<ffffffff9ff69405>] exc_page_fault+0x35/0x240
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.233856] softirqs last  enabled at (559776): [<ffffffff9f50b8cd>] wb_workfn+0x18d/0x350
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.242258] softirqs last disabled at (559772): [<ffffffff9f45cd8e>] wb_wakeup_delayed+0x2e/0x70
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.251281] CPU: 0 PID: 5738 Comm: kworker/u4:2 Tainted: G      D           5.12.0-xfstests-00007-g1d72c24b1506 #188
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.261926] Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.271275] Workqueue: ext4discard ext4_discard_work
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.276382] Call Trace:
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.279012]  dump_stack+0x6d/0x89
> [  987.049261] EXT4-fs (dm-1): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.283240]  ___might_sleep.cold+0xa6/0xb6
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.287489]  exit_signals+0x1c/0x2c0
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.291417]  do_exit+0xa6/0x3c0
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.294672]  rewind_stack_do_exit+0x17/0x20
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.299092] RIP: 0000:0x0
> [  987.102134] EXT4-fs (dm-1): shut down requested (1)
> [  987.107927] Aborting journal on device dm-1-8.
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.301940] Code: Unable to access opcode bytes at RIP 0xffffffffffffffd6.
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.309322] RSP: 0000:0000000000000000 EFLAGS: 00000000 ORIG_RAX: 0000000000000000
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.317124] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.324586] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.331945] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.339473] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.346718] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa systemd[1]: fstests-generic-041.scope: Succeeded.
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa systemd[921]: xt\x2dvdb.mount: Succeeded.
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa systemd[1]: xt\x2dvdb.mount: Succeeded.
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa systemd[1]: Started /usr/bin/bash -c test -w /proc/self/oom_score_adj && echo 250 > /proc/self/oom_score_adj; exec ./tests/generic/043.
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.843267] EXT4-fs (dm-0): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
> Jun 23 09:21:03 xfstests-ltm-20210623090247-aa kernel: [  987.049261] EXT4-fs (dm-1): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
> Jun 23 09:21:03 xfstests-ltm-20210623090247-aa kernel: [  987.102134] EXT4-fs (dm-1): shut down requested (1)
> [  987.284882] EXT4-fs (dm-1): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
> Jun 23 09:21:03 xfstests-ltm-20210623090247-aa kernel: [  987.107927] Aborting journal on device dm-1-8.
> Jun 23 09:21:03 xfstests-ltm-20210623090247-aa systemd[1]: xt\x2dvdc.mount: Succeeded.
> Jun 23 09:21:03 xfstests-ltm-20210623090247-aa systemd[921]: xt\x2dvdc.mount: Succeeded.
> Jun 23 09:21:03 xfstests-ltm-20210623090247-aa kernel: [  987.284882] EXT4-fs (dm-1): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
> [  999.135257] EXT4-fs (dm-1): shut down requested (2)
> [  999.140764] Aborting journal on device dm-1-8.
> Jun 23 09:21:15 xfstests-ltm-20210623090247-aa kernel: [  999.135257] EXT4-fs (dm-1): shut down requested (2)
> Jun 23 09:21:15 xfstests-ltm-20210623090247-aa kernel: [  999.140764] Aborting journal on device dm-1-8.
> Jun 23 09:21:15 xfstests-ltm-20210623090247-aa systemd[1]: xt\x2dvdc.mount: Succeeded.
> Jun 23 09:21:15 xfstests-ltm-20210623090247-aa systemd[921]: xt\x2dvdc.mount: Succeeded.
> [  999.206122] EXT4-fs (dm-1): recovery complete
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH V2 7/7] ext4: get discard out of jbd2 commit kthread contex
  2021-05-26  8:44           ` [PATCH V2 7/7] ext4: get discard out of jbd2 commit kthread contex Wang Jianchao
  2021-05-27 20:18             ` Andreas Dilger
  2021-06-22  0:55             ` Josh Triplett
@ 2023-09-06  0:11             ` Sarthak Kukreti
  2 siblings, 0 replies; 21+ messages in thread
From: Sarthak Kukreti @ 2023-09-06  0:11 UTC (permalink / raw)
  To: Wang Jianchao
  Cc: Theodore Ts'o, Andreas Dilger, linux-ext4, linux-kernel, lishujin

Hi,

I noticed a "regression" with this patch series on ChromiumOS: on an
ext4 filesystem (mounted with -o discard) on a thinly provisioned
block device (dm-thin, or loopback on a sparse file), deleting a large
file (O(10GB) or more) and immediately unmounting results in the
discards for the file's extents being dropped at umount(). This
prevents the space from being reclaimed by the underlying
thin-provisioning layer (dm-thinpool/lower filesystem), even on
subsequent mounts of the filesystem (e.g. after a reboot).

Currently, the only way to 'reclaim' this space into the thin pool is
to run fstrim on the dm-thin-backed filesystems on every mount: should
the dropped discard requests for newly freed blocks instead be
requeued on subsequent mounts of the filesystem?
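
For reference, the per-mount workaround boils down to issuing FITRIM
over the whole filesystem right after mounting. A minimal userspace
sketch (illustrative only; "/mnt/stateful" is an assumed mount point,
and fstrim(8) performs the same ioctl):

#include <fcntl.h>
#include <linux/fs.h>          /* FITRIM, struct fstrim_range */
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
        struct fstrim_range range = {
                .start = 0,
                .len = ~0ULL,   /* trim the whole filesystem */
                .minlen = 0,
        };
        int fd = open("/mnt/stateful", O_RDONLY | O_DIRECTORY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (ioctl(fd, FITRIM, &range) < 0) {
                perror("FITRIM");
                close(fd);
                return 1;
        }
        /* On success the kernel updates range.len to the bytes trimmed. */
        printf("trimmed %llu bytes\n", (unsigned long long)range.len);
        close(fd);
        return 0;
}

Having to do something like this from userspace on every mount is
exactly the overhead the question above is trying to avoid.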

Best
Sarthak

On Wed, May 26, 2021 at 1:44 AM Wang Jianchao <jianchao.wan9@gmail.com> wrote:
>
> Right now, discards are issued and waited on in the jbd2 commit
> kthread context after the log is committed. When a large number of
> files are deleted and discards flood the device, the jbd2 commit
> kthread can be blocked for a long time, and all metadata operations
> then block waiting for log space.
>
> One case is the page-fault path, which holds mm->mmap_sem for read
> and wants to update the file time but has to wait for log space.
> When another thread in the task then wants to mmap(), its write
> acquisition of mmap_sem blocks, and after that every read acquisition
> of mmap_sem blocks as well, even the ps command, which needs to read
> /proc/pid/cmdline. Our monitoring service, which reads
> /proc/pid/cmdline, used to be blocked for 5 minutes.
>
> This patch frees the blocks back to the buddy after commit and then
> does the discard in an async kworker context, in fstrim fashion,
> namely:
>  - mark the blocks to be discarded as used if they have not been
>    allocated again
>  - issue the discard
>  - mark them free again
> After this, the jbd2 commit kthread is no longer blocked by discard,
> and we won't hit ENOSPC even if the discard is slow or throttled.
>
> Link: https://marc.info/?l=linux-kernel&m=162143690731901&w=2
> Suggested-by: Theodore Ts'o <tytso@mit.edu>
> Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com>
> ---
>  fs/ext4/ext4.h    |   2 +
>  fs/ext4/mballoc.c | 162 +++++++++++++++++++++++++++++++++---------------------
>  fs/ext4/mballoc.h |   3 -
>  3 files changed, 101 insertions(+), 66 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 5c5c8e4..2e48c88 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1526,6 +1526,7 @@ struct ext4_sb_info {
>         unsigned int *s_mb_maxs;
>         unsigned int s_group_info_size;
>         atomic_t s_mb_free_pending;
> +       struct work_struct s_discard_work;
>
>         /* tunables */
>         unsigned long s_stripe;
> @@ -3306,6 +3307,7 @@ struct ext4_group_info {
>         unsigned long   bb_check_counter;
>  #endif
>         struct rb_root  bb_free_root;
> +       struct rb_root  bb_discard_root;
>         ext4_grpblk_t   bb_first_free;  /* first free block */
>         ext4_grpblk_t   bb_free;        /* total free blocks */
>         ext4_grpblk_t   bb_fragments;   /* nr of freespace fragments */
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 15715e7..feea439 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -332,6 +332,7 @@
>  static struct kmem_cache *ext4_pspace_cachep;
>  static struct kmem_cache *ext4_ac_cachep;
>  static struct kmem_cache *ext4_free_data_cachep;
> +static struct workqueue_struct *ext4_discard_wq;
>
>  /* We create slab caches for groupinfo data structures based on the
>   * superblock block size.  There will be one per mounted filesystem for
> @@ -355,7 +356,10 @@ static inline struct ext4_free_data *efd_entry(struct rb_node *n)
>         return rb_entry_safe(n, struct ext4_free_data, efd_node);
>  }
>  static int ext4_insert_free_data(struct rb_root *root, struct ext4_free_data *nfd);
> -
> +static int ext4_try_to_trim_range(struct super_block *sb,
> +               struct ext4_buddy *e4b, ext4_grpblk_t start,
> +               ext4_grpblk_t max, ext4_grpblk_t minblocks);
> +static void ext4_discard_work(struct work_struct *work);
>  /*
>   * The algorithm using this percpu seq counter goes below:
>   * 1. We sample the percpu discard_pa_seq counter before trying for block
> @@ -2657,6 +2661,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
>         INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
>         init_rwsem(&meta_group_info[i]->alloc_sem);
>         meta_group_info[i]->bb_free_root = RB_ROOT;
> +       meta_group_info[i]->bb_discard_root = RB_ROOT;
>         meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
>
>         mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group);
> @@ -2857,6 +2862,7 @@ int ext4_mb_init(struct super_block *sb)
>         spin_lock_init(&sbi->s_md_lock);
>         spin_lock_init(&sbi->s_bal_lock);
>         atomic_set(&sbi->s_mb_free_pending, 0);
> +       INIT_WORK(&sbi->s_discard_work, ext4_discard_work);
>
>         sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
>         sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
> @@ -2949,6 +2955,15 @@ int ext4_mb_release(struct super_block *sb)
>         struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
>         int count;
>
> +       if (test_opt(sb, DISCARD)) {
> +               /*
> +                * wait the discard work to drain all of ext4_free_data
> +                */
> +               queue_work(ext4_discard_wq, &sbi->s_discard_work);
> +               flush_work(&sbi->s_discard_work);
> +       }
> +
> +       queue_work(ext4_discard_wq, &sbi->s_discard_work);
>         if (sbi->s_group_info) {
>                 for (i = 0; i < ngroups; i++) {
>                         cond_resched();
> @@ -3024,30 +3039,77 @@ static inline int ext4_issue_discard(struct super_block *sb,
>                 return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
>  }
>
> -static void ext4_free_data_in_buddy(struct super_block *sb,
> -                                   struct ext4_free_data *entry)
> +static void ext4_discard_work(struct work_struct *work)
>  {
> +       struct ext4_sb_info *sbi = container_of(work,
> +                       struct ext4_sb_info, s_discard_work);
> +       struct super_block *sb = sbi->s_sb;
> +       ext4_group_t ngroups = ext4_get_groups_count(sb);
> +       struct ext4_group_info *grp;
> +       struct ext4_free_data *fd, *nfd;
>         struct ext4_buddy e4b;
> -       struct ext4_group_info *db;
> -       int err, count = 0, count2 = 0;
> +       int i, err;
> +
> +       for (i = 0; i < ngroups; i++) {
> +               grp = ext4_get_group_info(sb, i);
> +               if (RB_EMPTY_ROOT(&grp->bb_discard_root))
> +                       continue;
>
> -       mb_debug(sb, "gonna free %u blocks in group %u (0x%p):",
> -                entry->efd_count, entry->efd_group, entry);
> +               err = ext4_mb_load_buddy(sb, i, &e4b);
> +               if (err) {
> +                       ext4_warning(sb, "Error %d loading buddy information for %u",
> +                                       err, i);
> +                       continue;
> +               }
>
> -       err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
> -       /* we expect to find existing buddy because it's pinned */
> -       BUG_ON(err != 0);
> +               ext4_lock_group(sb, i);
> +               rbtree_postorder_for_each_entry_safe(fd, nfd,
> +                               &grp->bb_discard_root, efd_node) {
> +                       rb_erase(&fd->efd_node, &grp->bb_discard_root);
> +                       /*
> +                        * If filesystem is umounting, give up the discard
> +                        */
> +                       if (sb->s_flags & SB_ACTIVE)
> +                               ext4_try_to_trim_range(sb, &e4b, fd->efd_start_cluster,
> +                                               fd->efd_start_cluster + fd->efd_count - 1, 1);
> +                       kmem_cache_free(ext4_free_data_cachep, fd);
> +               }
> +               ext4_unlock_group(sb, i);
> +               ext4_mb_unload_buddy(&e4b);
> +       }
> +}
>
> -       atomic_sub(entry->efd_count, &EXT4_SB(sb)->s_mb_free_pending);
> +static int ext4_free_data_in_buddy(struct super_block *sb,
> +               ext4_group_t group, tid_t commit_tid)
> +{
> +       struct ext4_buddy e4b;
> +       struct ext4_group_info *db;
> +       struct ext4_free_data *fd, *nfd;
> +       int cnt, nr_fd;
>
> +       /* we expect to find existing buddy because it's pinned */
> +       BUG_ON(ext4_mb_load_buddy(sb, group, &e4b));
>         db = e4b.bd_info;
> -       /* there are blocks to put in buddy to make them really free */
> -       count += entry->efd_count;
> -       count2++;
> -       ext4_lock_group(sb, entry->efd_group);
> -       /* Take it out of per group rb tree */
> -       rb_erase(&entry->efd_node, &(db->bb_free_root));
> -       mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
> +       cnt = 0;
> +       nr_fd = 0;
> +       ext4_lock_group(sb, group);
> +       rbtree_postorder_for_each_entry_safe(fd, nfd,
> +                       &db->bb_free_root, efd_node) {
> +               if (fd->efd_tid != commit_tid)
> +                       continue;
> +
> +               mb_debug(sb, "gonna free %u blocks in group %u (0x%p):",
> +                        fd->efd_count, fd->efd_group, fd);
> +               atomic_sub(fd->efd_count, &EXT4_SB(sb)->s_mb_free_pending);
> +               cnt += fd->efd_count;
> +               nr_fd++;
> +               rb_erase(&fd->efd_node, &db->bb_free_root);
> +               mb_free_blocks(NULL, &e4b, fd->efd_start_cluster, fd->efd_count);
> +               if (test_opt(sb, DISCARD))
> +                       ext4_insert_free_data(&db->bb_discard_root, fd);
> +               else
> +                       kmem_cache_free(ext4_free_data_cachep, fd);
> +       }
>
>         /*
>          * Clear the trimmed flag for the group so that the next
> @@ -3055,22 +3117,22 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
>          * If the volume is mounted with -o discard, online discard
>          * is supported and the free blocks will be trimmed online.
>          */
> -       if (!test_opt(sb, DISCARD))
> +       if (!test_opt(sb, DISCARD) && cnt)
>                 EXT4_MB_GRP_CLEAR_TRIMMED(db);
>
> -       if (!db->bb_free_root.rb_node) {
> +       if (RB_EMPTY_ROOT(&db->bb_free_root) && cnt) {
>                 /* No more items in the per group rb tree
>                  * balance refcounts from ext4_mb_free_metadata()
>                  */
>                 put_page(e4b.bd_buddy_page);
>                 put_page(e4b.bd_bitmap_page);
>         }
> -       ext4_unlock_group(sb, entry->efd_group);
> -       kmem_cache_free(ext4_free_data_cachep, entry);
> +       ext4_unlock_group(sb, group);
>         ext4_mb_unload_buddy(&e4b);
>
> -       mb_debug(sb, "freed %d blocks in %d structures\n", count,
> -                count2);
> +       mb_debug(sb, "freed %d blocks in %d structures\n", cnt, nr_fd);
> +
> +       return nr_fd;
>  }
>
>  /*
> @@ -3081,52 +3143,21 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
>  {
>         struct ext4_sb_info *sbi = EXT4_SB(sb);
>         ext4_group_t ngroups = ext4_get_groups_count(sb);
> -       struct ext4_free_data *fd, *nfd;
>         struct ext4_group_info *grp;
> -       struct bio *discard_bio = NULL;
> -       struct list_head freed_data_list;
> -       int err, i;
> +       int i, nr;
>
>         if (!atomic_read(&sbi->s_mb_free_pending))
>                 return;
>
> -       INIT_LIST_HEAD(&freed_data_list);
> -       for (i = 0; i < ngroups; i++) {
> +       for (i = 0, nr = 0; i < ngroups; i++) {
>                 grp = ext4_get_group_info(sb, i);
> -               ext4_lock_group(sb, i);
> -               rbtree_postorder_for_each_entry_safe(fd, nfd,
> -                               &grp->bb_free_root, efd_node) {
> -                       if (fd->efd_tid != commit_tid)
> -                               continue;
> -                       INIT_LIST_HEAD(&fd->efd_list);
> -                       list_add_tail(&fd->efd_list, &freed_data_list);
> -               }
> -               ext4_unlock_group(sb, i);
> -       }
> -
> -       if (test_opt(sb, DISCARD)) {
> -               list_for_each_entry(fd, &freed_data_list, efd_list) {
> -                       err = ext4_issue_discard(sb, fd->efd_group,
> -                                                fd->efd_start_cluster,
> -                                                fd->efd_count,
> -                                                &discard_bio);
> -                       if (err && err != -EOPNOTSUPP) {
> -                               ext4_msg(sb, KERN_WARNING, "discard request in"
> -                                        " group:%d block:%d count:%d failed"
> -                                        " with %d", fd->efd_group,
> -                                        fd->efd_start_cluster, fd->efd_count, err);
> -                       } else if (err == -EOPNOTSUPP)
> -                               break;
> -               }
> -
> -               if (discard_bio) {
> -                       submit_bio_wait(discard_bio);
> -                       bio_put(discard_bio);
> -               }
> +               if (RB_EMPTY_ROOT(&grp->bb_free_root))
> +                       continue;
> +               nr += ext4_free_data_in_buddy(sb, i, commit_tid);
>         }
>
> -       list_for_each_entry_safe(fd, nfd, &freed_data_list, efd_list)
> -               ext4_free_data_in_buddy(sb, fd);
> +       if (nr && test_opt(sb, DISCARD))
> +               queue_work(ext4_discard_wq, &sbi->s_discard_work);
>  }
>
>  int __init ext4_init_mballoc(void)
> @@ -3146,8 +3177,13 @@ int __init ext4_init_mballoc(void)
>         if (ext4_free_data_cachep == NULL)
>                 goto out_ac_free;
>
> -       return 0;
> +       ext4_discard_wq = alloc_workqueue("ext4discard", WQ_UNBOUND, 0);
> +       if (!ext4_discard_wq)
> +               goto out_free_data;
>
> +       return 0;
> +out_free_data:
> +       kmem_cache_destroy(ext4_free_data_cachep);
>  out_ac_free:
>         kmem_cache_destroy(ext4_ac_cachep);
>  out_pa_free:
> diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
> index e75b474..d76286b 100644
> --- a/fs/ext4/mballoc.h
> +++ b/fs/ext4/mballoc.h
> @@ -79,9 +79,6 @@
>  #define MB_DEFAULT_MAX_INODE_PREALLOC  512
>
>  struct ext4_free_data {
> -       /* this links the free block information from sb_info */
> -       struct list_head                efd_list;
> -
>         /* this links the free block information from group_info */
>         struct rb_node                  efd_node;
>
> --
> 1.8.3.1

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2023-09-06  0:11 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <164ffa3b-c4d5-6967-feba-b972995a6dfb@gmail.com>
2021-05-26  8:42 ` [PATCH V2 1/7] ext4: remove the 'group' parameter of ext4_trim_extent Wang Jianchao
2021-05-27 19:47   ` Andreas Dilger
2021-06-23 13:13   ` Theodore Ts'o
2021-06-23 19:49     ` Theodore Ts'o
2021-07-02 10:27       ` Josh Triplett
2021-07-05 11:28       ` Wang Jianchao
     [not found] ` <a602a6ba-2073-8384-4c8f-d669ee25c065@gmail.com>
2021-05-26  8:43   ` [PATCH V2 2/7] ext4: add new helper interface ext4_try_to_trim_range() Wang Jianchao
2021-05-27 19:48     ` Andreas Dilger
     [not found]   ` <49382052-6238-f1fb-40d1-b6b801b39ff7@gmail.com>
2021-05-26  8:43     ` [PATCH V2 3/7] ext4: remove the repeated comment of ext4_trim_all_free Wang Jianchao
2021-05-27 19:49       ` Andreas Dilger
     [not found]     ` <48e33dea-d15e-f211-0191-e01bd3eb17b3@gmail.com>
2021-05-26  8:43       ` [PATCH V2 4/7] ext4: add new helper interface ext4_insert_free_data Wang Jianchao
2021-05-27 20:09         ` Andreas Dilger
2021-05-28  3:40           ` Wang Jianchao
     [not found]       ` <67eeb65a-d413-c4f9-c06f-d5dcceca0e4f@gmail.com>
2021-05-26  8:43         ` [PATCH V2 5/7] ext4: get buddy cache after insert successfully Wang Jianchao
2021-06-23  3:06           ` Theodore Ts'o
     [not found]         ` <0b7915bc-193a-137b-4e52-8aaef8d6fef3@gmail.com>
2021-05-26  8:43           ` [PATCH V2 6/7] ext4: use bb_free_root to get the free data entry Wang Jianchao
2021-05-26  8:44           ` [PATCH V2 7/7] ext4: get discard out of jbd2 commit kthread contex Wang Jianchao
2021-05-27 20:18             ` Andreas Dilger
2021-05-28  3:06               ` Wang Jianchao
2021-06-22  0:55             ` Josh Triplett
2023-09-06  0:11             ` Sarthak Kukreti

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.