linux-kernel.vger.kernel.org archive mirror
* [PATCH V2 1/7] ext4: remove the 'group' parameter of ext4_trim_extent
       [not found] <164ffa3b-c4d5-6967-feba-b972995a6dfb@gmail.com>
@ 2021-05-26  8:42 ` Wang Jianchao
  2021-05-27 19:47   ` Andreas Dilger
  2021-06-23 13:13   ` Theodore Ts'o
       [not found] ` <a602a6ba-2073-8384-4c8f-d669ee25c065@gmail.com>
  1 sibling, 2 replies; 15+ messages in thread
From: Wang Jianchao @ 2021-05-26  8:42 UTC (permalink / raw)
  To: Theodore Ts'o, Andreas Dilger; +Cc: linux-ext4, linux-kernel, lishujin

Get rid of the 'group' parameter of ext4_trim_extent as we can get
it from the 'e4b'.

Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com>
---
 fs/ext4/mballoc.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a02fadf..d81f1fd22 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -5650,19 +5650,19 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
  * @sb:		super block for the file system
  * @start:	starting block of the free extent in the alloc. group
  * @count:	number of blocks to TRIM
- * @group:	alloc. group we are working with
  * @e4b:	ext4 buddy for the group
  *
  * Trim "count" blocks starting at "start" in the "group". To assure that no
  * one will allocate those blocks, mark it as used in buddy bitmap. This must
  * be called with under the group lock.
  */
-static int ext4_trim_extent(struct super_block *sb, int start, int count,
-			     ext4_group_t group, struct ext4_buddy *e4b)
+static int ext4_trim_extent(struct super_block *sb,
+		int start, int count, struct ext4_buddy *e4b)
 __releases(bitlock)
 __acquires(bitlock)
 {
 	struct ext4_free_extent ex;
+	ext4_group_t group = e4b->bd_group;
 	int ret = 0;
 
 	trace_ext4_trim_extent(sb, group, start, count);
@@ -5738,8 +5738,7 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
 		next = mb_find_next_bit(bitmap, max + 1, start);
 
 		if ((next - start) >= minblocks) {
-			ret = ext4_trim_extent(sb, start,
-					       next - start, group, &e4b);
+			ret = ext4_trim_extent(sb, start, next - start, &e4b);
 			if (ret && ret != -EOPNOTSUPP)
 				break;
 			ret = 0;
-- 
1.8.3.1


* [PATCH V2 2/7] ext4: add new helper interface ext4_try_to_trim_range()
       [not found] ` <a602a6ba-2073-8384-4c8f-d669ee25c065@gmail.com>
@ 2021-05-26  8:43   ` Wang Jianchao
       [not found]   ` <49382052-6238-f1fb-40d1-b6b801b39ff7@gmail.com>
  1 sibling, 0 replies; 15+ messages in thread
From: Wang Jianchao @ 2021-05-26  8:43 UTC (permalink / raw)
  To: Theodore Ts'o, Andreas Dilger; +Cc: linux-ext4, linux-kernel, lishujin

There is no functional change in this patch; it just splits the code
that searches for free blocks and does the trim into a new function,
ext4_try_to_trim_range(). This prepares for the following async
background discard.

Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com>
---
 fs/ext4/mballoc.c | 102 ++++++++++++++++++++++++++++++------------------------
 1 file changed, 57 insertions(+), 45 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d81f1fd22..f984f15 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -5685,6 +5685,54 @@ static int ext4_trim_extent(struct super_block *sb,
 	return ret;
 }
 
+static int ext4_try_to_trim_range(struct super_block *sb,
+		struct ext4_buddy *e4b, ext4_grpblk_t start,
+		ext4_grpblk_t max, ext4_grpblk_t minblocks)
+{
+	ext4_grpblk_t next, count, free_count;
+	void *bitmap;
+	int ret = 0;
+
+	bitmap = e4b->bd_bitmap;
+	start = (e4b->bd_info->bb_first_free > start) ?
+		e4b->bd_info->bb_first_free : start;
+	count = 0;
+	free_count = 0;
+
+	while (start <= max) {
+		start = mb_find_next_zero_bit(bitmap, max + 1, start);
+		if (start > max)
+			break;
+		next = mb_find_next_bit(bitmap, max + 1, start);
+
+		if ((next - start) >= minblocks) {
+			ret = ext4_trim_extent(sb, start, next - start, e4b);
+			if (ret && ret != -EOPNOTSUPP)
+				break;
+			ret = 0;
+			count += next - start;
+		}
+		free_count += next - start;
+		start = next + 1;
+
+		if (fatal_signal_pending(current)) {
+			count = -ERESTARTSYS;
+			break;
+		}
+
+		if (need_resched()) {
+			ext4_unlock_group(sb, e4b->bd_group);
+			cond_resched();
+			ext4_lock_group(sb, e4b->bd_group);
+		}
+
+		if ((e4b->bd_info->bb_free - free_count) < minblocks)
+			break;
+	}
+
+	return count;
+}
+
 /**
  * ext4_trim_all_free -- function to trim all free space in alloc. group
  * @sb:			super block for file system
@@ -5708,10 +5756,8 @@ static int ext4_trim_extent(struct super_block *sb,
 		   ext4_grpblk_t start, ext4_grpblk_t max,
 		   ext4_grpblk_t minblocks)
 {
-	void *bitmap;
-	ext4_grpblk_t next, count = 0, free_count = 0;
 	struct ext4_buddy e4b;
-	int ret = 0;
+	int ret;
 
 	trace_ext4_trim_all_free(sb, group, start, max);
 
@@ -5721,57 +5767,23 @@ static int ext4_trim_extent(struct super_block *sb,
 			     ret, group);
 		return ret;
 	}
-	bitmap = e4b.bd_bitmap;
 
 	ext4_lock_group(sb, group);
-	if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
-	    minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
-		goto out;
-
-	start = (e4b.bd_info->bb_first_free > start) ?
-		e4b.bd_info->bb_first_free : start;
 
-	while (start <= max) {
-		start = mb_find_next_zero_bit(bitmap, max + 1, start);
-		if (start > max)
-			break;
-		next = mb_find_next_bit(bitmap, max + 1, start);
-
-		if ((next - start) >= minblocks) {
-			ret = ext4_trim_extent(sb, start, next - start, &e4b);
-			if (ret && ret != -EOPNOTSUPP)
-				break;
-			ret = 0;
-			count += next - start;
-		}
-		free_count += next - start;
-		start = next + 1;
-
-		if (fatal_signal_pending(current)) {
-			count = -ERESTARTSYS;
-			break;
-		}
-
-		if (need_resched()) {
-			ext4_unlock_group(sb, group);
-			cond_resched();
-			ext4_lock_group(sb, group);
-		}
-
-		if ((e4b.bd_info->bb_free - free_count) < minblocks)
-			break;
+	if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) ||
+	    minblocks < atomic_read(&EXT4_SB(sb)->s_last_trim_minblks)) {
+		ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks);
+		if (ret >= 0)
+			EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
+	} else {
+		ret = 0;
 	}
 
-	if (!ret) {
-		ret = count;
-		EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
-	}
-out:
 	ext4_unlock_group(sb, group);
 	ext4_mb_unload_buddy(&e4b);
 
 	ext4_debug("trimmed %d blocks in the group %d\n",
-		count, group);
+		ret, group);
 
 	return ret;
 }
-- 
1.8.3.1


* [PATCH V2 3/7] ext4: remove the repeated comment of ext4_trim_all_free
       [not found]   ` <49382052-6238-f1fb-40d1-b6b801b39ff7@gmail.com>
@ 2021-05-26  8:43     ` Wang Jianchao
       [not found]     ` <48e33dea-d15e-f211-0191-e01bd3eb17b3@gmail.com>
  1 sibling, 0 replies; 15+ messages in thread
From: Wang Jianchao @ 2021-05-26  8:43 UTC (permalink / raw)
  To: Theodore Ts'o, Andreas Dilger; +Cc: linux-ext4, linux-kernel, lishujin

Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com>
---
 fs/ext4/mballoc.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index f984f15..85418cf 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -5741,15 +5741,10 @@ static int ext4_try_to_trim_range(struct super_block *sb,
  * @max:		last group block to examine
  * @minblocks:		minimum extent block count
  *
- * ext4_trim_all_free walks through group's buddy bitmap searching for free
- * extents. When the free block is found, ext4_trim_extent is called to TRIM
- * the extent.
- *
- *
  * ext4_trim_all_free walks through group's block bitmap searching for free
  * extents. When the free extent is found, mark it as used in group buddy
  * bitmap. Then issue a TRIM command on this extent and free the extent in
- * the group buddy bitmap. This is done until whole group is scanned.
+ * the group buddy bitmap.
  */
 static ext4_grpblk_t
 ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
-- 
1.8.3.1


* [PATCH V2 4/7] ext4: add new helper interface ext4_insert_free_data
       [not found]     ` <48e33dea-d15e-f211-0191-e01bd3eb17b3@gmail.com>
@ 2021-05-26  8:43       ` Wang Jianchao
       [not found]       ` <67eeb65a-d413-c4f9-c06f-d5dcceca0e4f@gmail.com>
  1 sibling, 0 replies; 15+ messages in thread
From: Wang Jianchao @ 2021-05-26  8:43 UTC (permalink / raw)
  To: Theodore Ts'o, Andreas Dilger; +Cc: linux-ext4, linux-kernel, lishujin

Split the code that inserts and merges ext4_free_data structures
into a new interface, ext4_insert_free_data(). This is preparing
for the following async background discard.
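
For illustration only, here is a small stand-alone model (plain C,
not the rb-tree code in this patch) of the insert-then-merge
behaviour: overlapping extents are rejected, and after a successful
insert the new extent is merged with contiguous neighbours. The real
code additionally requires merged extents to belong to the same
transaction and group, which this sketch leaves out.

#include <stdio.h>

struct extent { int start, count; };

static struct extent list[16];	/* sorted by start; no bounds checks, sketch only */
static int nr;

static int insert_extent(int start, int count)
{
	int i, pos = nr;

	for (i = 0; i < nr; i++) {
		if (start + count <= list[i].start) {
			pos = i;		/* insert before this extent */
			break;
		}
		if (start < list[i].start + list[i].count)
			return -1;		/* overlap: reject */
	}

	for (i = nr; i > pos; i--)		/* keep the list sorted */
		list[i] = list[i - 1];
	list[pos] = (struct extent){ start, count };
	nr++;

	/* merge with the right neighbour if contiguous */
	if (pos + 1 < nr &&
	    list[pos].start + list[pos].count == list[pos + 1].start) {
		list[pos].count += list[pos + 1].count;
		for (i = pos + 1; i < nr - 1; i++)
			list[i] = list[i + 1];
		nr--;
	}

	/* merge with the left neighbour if contiguous */
	if (pos > 0 &&
	    list[pos - 1].start + list[pos - 1].count == list[pos].start) {
		list[pos - 1].count += list[pos].count;
		for (i = pos; i < nr - 1; i++)
			list[i] = list[i + 1];
		nr--;
	}

	return 0;
}

int main(void)
{
	insert_extent(0, 4);
	insert_extent(8, 4);
	insert_extent(4, 4);	/* bridges the two extents above */
	printf("%d extent(s), first [%d, %d)\n",
	       nr, list[0].start, list[0].start + list[0].count);
	return 0;
}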

Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com>
---
 fs/ext4/mballoc.c | 96 +++++++++++++++++++++++++++++--------------------------
 1 file changed, 51 insertions(+), 45 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 85418cf..16f06d2 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -350,6 +350,12 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
 						ext4_group_t group);
 static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
+static inline struct ext4_free_data *efd_entry(struct rb_node *n)
+{
+	return rb_entry_safe(n, struct ext4_free_data, efd_node);
+}
+static int ext4_insert_free_data(struct ext4_sb_info *sbi,
+		struct rb_root *root, struct ext4_free_data *nfd);
 
 /*
  * The algorithm using this percpu seq counter goes below:
@@ -5069,28 +5075,53 @@ static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
 	kmem_cache_free(ext4_free_data_cachep, entry);
 }
 
+static int ext4_insert_free_data(struct ext4_sb_info *sbi,
+		struct rb_root *root, struct ext4_free_data *nfd)
+{
+	struct rb_node **n = &root->rb_node;
+	struct rb_node *p = NULL;
+	struct ext4_free_data *fd;
+
+	while (*n) {
+		p = *n;
+		fd = rb_entry(p, struct ext4_free_data, efd_node);
+		if (nfd->efd_start_cluster < fd->efd_start_cluster)
+			n = &(*n)->rb_left;
+		else if (nfd->efd_start_cluster >=
+			 (fd->efd_start_cluster + fd->efd_count))
+			n = &(*n)->rb_right;
+		else
+			return -EINVAL;
+	}
+
+	rb_link_node(&nfd->efd_node, p, n);
+	rb_insert_color(&nfd->efd_node, root);
+
+	/* Now try to see the extent can be merged to left and right */
+	fd = efd_entry(rb_prev(&nfd->efd_node));
+	if (fd)
+		ext4_try_merge_freed_extent(sbi, fd, nfd, root);
+
+	fd = efd_entry(rb_next(&nfd->efd_node));
+	if (fd)
+		ext4_try_merge_freed_extent(sbi, fd, nfd, root);
+
+	return 0;
+}
+
 static noinline_for_stack int
 ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
-		      struct ext4_free_data *new_entry)
+		      struct ext4_free_data *nfd)
 {
-	ext4_group_t group = e4b->bd_group;
-	ext4_grpblk_t cluster;
-	ext4_grpblk_t clusters = new_entry->efd_count;
-	struct ext4_free_data *entry;
 	struct ext4_group_info *db = e4b->bd_info;
 	struct super_block *sb = e4b->bd_sb;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct rb_node **n = &db->bb_free_root.rb_node, *node;
-	struct rb_node *parent = NULL, *new_node;
 
 	BUG_ON(!ext4_handle_valid(handle));
 	BUG_ON(e4b->bd_bitmap_page == NULL);
 	BUG_ON(e4b->bd_buddy_page == NULL);
 
-	new_node = &new_entry->efd_node;
-	cluster = new_entry->efd_start_cluster;
-
-	if (!*n) {
+	if (!db->bb_free_root.rb_node) {
 		/* first free block exent. We need to
 		   protect buddy cache from being freed,
 		 * otherwise we'll refresh it from
@@ -5099,44 +5130,19 @@ static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
 		get_page(e4b->bd_buddy_page);
 		get_page(e4b->bd_bitmap_page);
 	}
-	while (*n) {
-		parent = *n;
-		entry = rb_entry(parent, struct ext4_free_data, efd_node);
-		if (cluster < entry->efd_start_cluster)
-			n = &(*n)->rb_left;
-		else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
-			n = &(*n)->rb_right;
-		else {
-			ext4_grp_locked_error(sb, group, 0,
-				ext4_group_first_block_no(sb, group) +
-				EXT4_C2B(sbi, cluster),
-				"Block already on to-be-freed list");
-			kmem_cache_free(ext4_free_data_cachep, new_entry);
-			return 0;
-		}
-	}
-
-	rb_link_node(new_node, parent, n);
-	rb_insert_color(new_node, &db->bb_free_root);
-
-	/* Now try to see the extent can be merged to left and right */
-	node = rb_prev(new_node);
-	if (node) {
-		entry = rb_entry(node, struct ext4_free_data, efd_node);
-		ext4_try_merge_freed_extent(sbi, entry, new_entry,
-					    &(db->bb_free_root));
-	}
 
-	node = rb_next(new_node);
-	if (node) {
-		entry = rb_entry(node, struct ext4_free_data, efd_node);
-		ext4_try_merge_freed_extent(sbi, entry, new_entry,
-					    &(db->bb_free_root));
+	if (ext4_insert_free_data(sbi, &db->bb_free_root, nfd)) {
+		ext4_grp_locked_error(sb, e4b->bd_group, 0,
+				ext4_group_first_block_no(sb, e4b->bd_group) +
+				EXT4_C2B(sbi, nfd->efd_start_cluster),
+				"Block already on to-be-freed list");
+		kmem_cache_free(ext4_free_data_cachep, nfd);
+		return 0;
 	}
 
 	spin_lock(&sbi->s_md_lock);
-	list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list);
-	sbi->s_mb_free_pending += clusters;
+	list_add_tail(&nfd->efd_list, &sbi->s_freed_data_list);
+	sbi->s_mb_free_pending += nfd->efd_count;
 	spin_unlock(&sbi->s_md_lock);
 	return 0;
 }
-- 
1.8.3.1



* [PATCH V2 5/7] ext4: get buddy cache after insert successfully
       [not found]       ` <67eeb65a-d413-c4f9-c06f-d5dcceca0e4f@gmail.com>
@ 2021-05-26  8:43         ` Wang Jianchao
  2021-06-23  3:06           ` Theodore Ts'o
       [not found]         ` <0b7915bc-193a-137b-4e52-8aaef8d6fef3@gmail.com>
  1 sibling, 1 reply; 15+ messages in thread
From: Wang Jianchao @ 2021-05-26  8:43 UTC (permalink / raw)
  To: Theodore Ts'o, Andreas Dilger; +Cc: linux-ext4, linux-kernel, lishujin

The references on bd_bitmap_page and bd_buddy_page should be taken
only after the ext4_free_data has been inserted successfully.
Otherwise, nobody can put them.
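
As a stand-alone illustration of that ordering (plain C, not ext4
code; all names below are made up for the example): if the reference
is taken only once the insert has succeeded, a rejected entry never
leaves behind a reference that nobody will put.

#include <stdio.h>

static int page_refs;		/* models the buddy/bitmap page refcount */

/* models ext4_insert_free_data(): fails on a duplicate entry */
static int insert_entry(int duplicate)
{
	return duplicate ? -1 : 0;
}

static void free_metadata(int duplicate)
{
	if (insert_entry(duplicate)) {
		/* insert rejected: the entry is thrown away, so no
		 * reference may have been taken on its behalf */
		return;
	}
	/* insert succeeded: the later "free in buddy" path is now
	 * responsible for putting this reference */
	page_refs++;
}

int main(void)
{
	free_metadata(1);	/* duplicate entry: refcount unchanged */
	free_metadata(0);	/* normal case: one reference taken */
	printf("page_refs = %d\n", page_refs);
	return 0;
}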

Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com>
---
 fs/ext4/mballoc.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b23010c..3e8ad43 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -5116,21 +5116,12 @@ static int ext4_insert_free_data( struct ext4_sb_info *sbi,
 	struct ext4_group_info *db = e4b->bd_info;
 	struct super_block *sb = e4b->bd_sb;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	bool first = RB_EMPTY_ROOT(&db->bb_free_root);
 
 	BUG_ON(!ext4_handle_valid(handle));
 	BUG_ON(e4b->bd_bitmap_page == NULL);
 	BUG_ON(e4b->bd_buddy_page == NULL);
 
-	if (!db->bb_free_root.rb_node) {
-		/* first free block exent. We need to
-		   protect buddy cache from being freed,
-		 * otherwise we'll refresh it from
-		 * on-disk bitmap and lose not-yet-available
-		 * blocks */
-		get_page(e4b->bd_buddy_page);
-		get_page(e4b->bd_bitmap_page);
-	}
-
 	if (ext4_insert_free_data(sbi, &db->bb_free_root, nfd)) {
 		ext4_grp_locked_error(sb, e4b->bd_group, 0,
 				ext4_group_first_block_no(sb, e4b->bd_group) +
@@ -5140,6 +5131,15 @@ static int ext4_insert_free_data( struct ext4_sb_info *sbi,
 		return 0;
 	}
 
+	if (first) {
+		/* first free block exent. We need to protect buddy
+		 * cache from being freed, otherwise we'll refresh it
+		 * from on-disk bitmap and lose not-yet-available blocks
+		 */
+		get_page(e4b->bd_buddy_page);
+		get_page(e4b->bd_bitmap_page);
+	}
+
 	spin_lock(&sbi->s_md_lock);
 	list_add_tail(&nfd->efd_list, &sbi->s_freed_data_list);
 	sbi->s_mb_free_pending += nfd->efd_count;
-- 
1.8.3.1


* [PATCH V2 6/7] ext4: use bb_free_root to get the free data entry
       [not found]         ` <0b7915bc-193a-137b-4e52-8aaef8d6fef3@gmail.com>
@ 2021-05-26  8:43           ` Wang Jianchao
  2021-05-26  8:44           ` [PATCH V2 7/7] ext4: get discard out of jbd2 commit kthread context Wang Jianchao
  1 sibling, 0 replies; 15+ messages in thread
From: Wang Jianchao @ 2021-05-26  8:43 UTC (permalink / raw)
  To: Theodore Ts'o, Andreas Dilger; +Cc: linux-ext4, linux-kernel, lishujin

This is also preparing for the following async background discard.
In this patch, the s_freed_data_list is removed and we iterate over
every group's bb_free_root rb tree to get the entries. After this,
we no longer need to touch that list when inserting and merging
free data entries.

Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com>
---
 fs/ext4/balloc.c  |   2 +-
 fs/ext4/ext4.h    |   4 +--
 fs/ext4/mballoc.c | 104 +++++++++++++++++++++++++-----------------------------
 3 files changed, 50 insertions(+), 60 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 74a5172..8053a5f 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -652,7 +652,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 	 * possible we just missed a transaction commit that did so
 	 */
 	smp_mb();
-	if (sbi->s_mb_free_pending == 0)
+	if (!atomic_read(&sbi->s_mb_free_pending))
 		return ext4_has_free_clusters(sbi, 1, 0);
 
 	/*
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 826a56e3..5c5c8e4 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1525,9 +1525,7 @@ struct ext4_sb_info {
 	unsigned short *s_mb_offsets;
 	unsigned int *s_mb_maxs;
 	unsigned int s_group_info_size;
-	unsigned int s_mb_free_pending;
-	struct list_head s_freed_data_list;	/* List of blocks to be freed
-						   after commit completed */
+	atomic_t s_mb_free_pending;
 
 	/* tunables */
 	unsigned long s_stripe;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c2bf40a..15715e7 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -354,8 +354,7 @@ static inline struct ext4_free_data *efd_entry(struct rb_node *n)
 {
 	return rb_entry_safe(n, struct ext4_free_data, efd_node);
 }
-static int ext4_insert_free_data(struct ext4_sb_info *sbi,
-		struct rb_root *root, struct ext4_free_data *nfd);
+static int ext4_insert_free_data(struct rb_root *root, struct ext4_free_data *nfd);
 
 /*
  * The algorithm using this percpu seq counter goes below:
@@ -2857,8 +2856,7 @@ int ext4_mb_init(struct super_block *sb)
 
 	spin_lock_init(&sbi->s_md_lock);
 	spin_lock_init(&sbi->s_bal_lock);
-	sbi->s_mb_free_pending = 0;
-	INIT_LIST_HEAD(&sbi->s_freed_data_list);
+	atomic_set(&sbi->s_mb_free_pending, 0);
 
 	sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
 	sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
@@ -3040,9 +3038,7 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
 	/* we expect to find existing buddy because it's pinned */
 	BUG_ON(err != 0);
 
-	spin_lock(&EXT4_SB(sb)->s_md_lock);
-	EXT4_SB(sb)->s_mb_free_pending -= entry->efd_count;
-	spin_unlock(&EXT4_SB(sb)->s_md_lock);
+	atomic_sub(entry->efd_count, &EXT4_SB(sb)->s_mb_free_pending);
 
 	db = e4b.bd_info;
 	/* there are blocks to put in buddy to make them really free */
@@ -3084,37 +3080,41 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
 void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct ext4_free_data *entry, *tmp;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
+	struct ext4_free_data *fd, *nfd;
+	struct ext4_group_info *grp;
 	struct bio *discard_bio = NULL;
 	struct list_head freed_data_list;
-	struct list_head *cut_pos = NULL;
-	int err;
+	int err, i;
 
-	INIT_LIST_HEAD(&freed_data_list);
+	if (!atomic_read(&sbi->s_mb_free_pending))
+		return;
 
-	spin_lock(&sbi->s_md_lock);
-	list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) {
-		if (entry->efd_tid != commit_tid)
-			break;
-		cut_pos = &entry->efd_list;
+	INIT_LIST_HEAD(&freed_data_list);
+	for (i = 0; i < ngroups; i++) {
+		grp = ext4_get_group_info(sb, i);
+		ext4_lock_group(sb, i);
+		rbtree_postorder_for_each_entry_safe(fd, nfd,
+				&grp->bb_free_root, efd_node) {
+			if (fd->efd_tid != commit_tid)
+				continue;
+			INIT_LIST_HEAD(&fd->efd_list);
+			list_add_tail(&fd->efd_list, &freed_data_list);
+		}
+		ext4_unlock_group(sb, i);
 	}
-	if (cut_pos)
-		list_cut_position(&freed_data_list, &sbi->s_freed_data_list,
-				  cut_pos);
-	spin_unlock(&sbi->s_md_lock);
 
 	if (test_opt(sb, DISCARD)) {
-		list_for_each_entry(entry, &freed_data_list, efd_list) {
-			err = ext4_issue_discard(sb, entry->efd_group,
-						 entry->efd_start_cluster,
-						 entry->efd_count,
+		list_for_each_entry(fd, &freed_data_list, efd_list) {
+			err = ext4_issue_discard(sb, fd->efd_group,
+						 fd->efd_start_cluster,
+						 fd->efd_count,
 						 &discard_bio);
 			if (err && err != -EOPNOTSUPP) {
 				ext4_msg(sb, KERN_WARNING, "discard request in"
 					 " group:%d block:%d count:%d failed"
-					 " with %d", entry->efd_group,
-					 entry->efd_start_cluster,
-					 entry->efd_count, err);
+					 " with %d", fd->efd_group,
+					 fd->efd_start_cluster, fd->efd_count, err);
 			} else if (err == -EOPNOTSUPP)
 				break;
 		}
@@ -3125,8 +3125,8 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
 		}
 	}
 
-	list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
-		ext4_free_data_in_buddy(sb, entry);
+	list_for_each_entry_safe(fd, nfd, &freed_data_list, efd_list)
+		ext4_free_data_in_buddy(sb, fd);
 }
 
 int __init ext4_init_mballoc(void)
@@ -5051,32 +5051,27 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
  * are contiguous, AND the extents were freed by the same transaction,
  * AND the blocks are associated with the same group.
  */
-static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
-					struct ext4_free_data *entry,
-					struct ext4_free_data *new_entry,
-					struct rb_root *entry_rb_root)
+static void ext4_try_merge_freed_extent(struct rb_root *root,
+	struct ext4_free_data *fd, struct ext4_free_data *nfd)
 {
-	if ((entry->efd_tid != new_entry->efd_tid) ||
-	    (entry->efd_group != new_entry->efd_group))
+	if ((fd->efd_tid != nfd->efd_tid) ||
+	    (fd->efd_group != nfd->efd_group))
 		return;
-	if (entry->efd_start_cluster + entry->efd_count ==
-	    new_entry->efd_start_cluster) {
-		new_entry->efd_start_cluster = entry->efd_start_cluster;
-		new_entry->efd_count += entry->efd_count;
-	} else if (new_entry->efd_start_cluster + new_entry->efd_count ==
-		   entry->efd_start_cluster) {
-		new_entry->efd_count += entry->efd_count;
+	if (fd->efd_start_cluster + fd->efd_count ==
+	    nfd->efd_start_cluster) {
+		nfd->efd_start_cluster = fd->efd_start_cluster;
+		nfd->efd_count += fd->efd_count;
+	} else if (nfd->efd_start_cluster + nfd->efd_count ==
+		   fd->efd_start_cluster) {
+		nfd->efd_count += fd->efd_count;
 	} else
 		return;
-	spin_lock(&sbi->s_md_lock);
-	list_del(&entry->efd_list);
-	spin_unlock(&sbi->s_md_lock);
-	rb_erase(&entry->efd_node, entry_rb_root);
-	kmem_cache_free(ext4_free_data_cachep, entry);
+	rb_erase(&fd->efd_node, root);
+	kmem_cache_free(ext4_free_data_cachep, fd);
 }
 
-static int ext4_insert_free_data(struct ext4_sb_info *sbi,
-		struct rb_root *root, struct ext4_free_data *nfd)
+static int ext4_insert_free_data(struct rb_root *root,
+		struct ext4_free_data *nfd)
 {
 	struct rb_node **n = &root->rb_node;
 	struct rb_node *p = NULL;
@@ -5100,11 +5095,11 @@ static int ext4_insert_free_data(struct ext4_sb_info *sbi,
 	/* Now try to see the extent can be merged to left and right */
 	fd = efd_entry(rb_prev(&nfd->efd_node));
 	if (fd)
-		ext4_try_merge_freed_extent(sbi, fd, nfd, root);
+		ext4_try_merge_freed_extent(root, fd, nfd);
 
 	fd = efd_entry(rb_next(&nfd->efd_node));
 	if (fd)
-		ext4_try_merge_freed_extent(sbi, fd, nfd, root);
+		ext4_try_merge_freed_extent(root, fd, nfd);
 
 	return 0;
 }
@@ -5122,7 +5117,7 @@ static int ext4_insert_free_data(struct ext4_sb_info *sbi,
 	BUG_ON(e4b->bd_bitmap_page == NULL);
 	BUG_ON(e4b->bd_buddy_page == NULL);
 
-	if (ext4_insert_free_data(sbi, &db->bb_free_root, nfd)) {
+	if (ext4_insert_free_data(&db->bb_free_root, nfd)) {
 		ext4_grp_locked_error(sb, e4b->bd_group, 0,
 				ext4_group_first_block_no(sb, e4b->bd_group) +
 				EXT4_C2B(sbi, nfd->efd_start_cluster),
@@ -5140,10 +5135,7 @@ static int ext4_insert_free_data(struct ext4_sb_info *sbi,
 		get_page(e4b->bd_bitmap_page);
 	}
 
-	spin_lock(&sbi->s_md_lock);
-	list_add_tail(&nfd->efd_list, &sbi->s_freed_data_list);
-	sbi->s_mb_free_pending += nfd->efd_count;
-	spin_unlock(&sbi->s_md_lock);
+	atomic_add(nfd->efd_count, &sbi->s_mb_free_pending);
 	return 0;
 }
 
-- 
1.8.3.1


* [PATCH V2 7/7] ext4: get discard out of jbd2 commit kthread context
       [not found]         ` <0b7915bc-193a-137b-4e52-8aaef8d6fef3@gmail.com>
  2021-05-26  8:43           ` [PATCH V2 6/7] ext4: use bb_free_root to get the free data entry Wang Jianchao
@ 2021-05-26  8:44           ` Wang Jianchao
  2021-06-22  0:55             ` Josh Triplett
  2023-09-06  0:11             ` Sarthak Kukreti
  1 sibling, 2 replies; 15+ messages in thread
From: Wang Jianchao @ 2021-05-26  8:44 UTC (permalink / raw)
  To: Theodore Ts'o, Andreas Dilger; +Cc: linux-ext4, linux-kernel, lishujin

Right now, discard is issued and waited on in the jbd2 commit
kthread context after the logs are committed. When a large number
of files are deleted and discard is flooding, the jbd2 commit
kthread can be blocked for a long time. Then all of the metadata
operations can be blocked waiting for log space.

One case is the page fault path, which holds mm->mmap_sem for read
and wants to update the file time but has to wait for log space.
When another thread in the task then wants to do mmap, its write
lock on mmap_sem is blocked. Finally all of the subsequent readers
of mmap_sem are blocked as well, even the ps command, which needs
to read /proc/pid/cmdline. Our monitor service, which needs to read
/proc/pid/cmdline, used to be blocked for 5 minutes.

This patch frees the blocks back to the buddy after commit and then
does the discard in an async kworker context in fstrim fashion,
namely,
 - mark blocks to be discarded as used if they have not been allocated
 - do discard
 - mark them free
After this, the jbd2 commit kthread won't be blocked by discard any
more and we won't get ENOSPC even if the discard is slow or
throttled.
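
A minimal stand-alone model of that three-step sequence (plain C,
not the kernel code in this patch; the bitmap and helpers below are
made up for illustration):

#include <stdbool.h>
#include <stdio.h>

#define NBLOCKS 64

static bool in_use[NBLOCKS];	/* models the group buddy bitmap */

/* stands in for the real discard (sb_issue_discard and friends) */
static void issue_discard(int start, int count)
{
	printf("discarding blocks [%d, %d)\n", start, start + count);
}

static void trim_extent(int start, int count)
{
	int i;

	/* 1. mark the free blocks as used so that nobody can allocate
	 *    them while the (possibly slow) discard is in flight */
	for (i = start; i < start + count; i++)
		in_use[i] = true;

	/* 2. do the discard, outside of the allocation fast path */
	issue_discard(start, count);

	/* 3. mark the blocks free again */
	for (i = start; i < start + count; i++)
		in_use[i] = false;
}

int main(void)
{
	trim_extent(8, 16);
	return 0;
}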

Link: https://marc.info/?l=linux-kernel&m=162143690731901&w=2
Suggested-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com>
---
 fs/ext4/ext4.h    |   2 +
 fs/ext4/mballoc.c | 162 +++++++++++++++++++++++++++++++++---------------------
 fs/ext4/mballoc.h |   3 -
 3 files changed, 101 insertions(+), 66 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 5c5c8e4..2e48c88 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1526,6 +1526,7 @@ struct ext4_sb_info {
 	unsigned int *s_mb_maxs;
 	unsigned int s_group_info_size;
 	atomic_t s_mb_free_pending;
+	struct work_struct s_discard_work;
 
 	/* tunables */
 	unsigned long s_stripe;
@@ -3306,6 +3307,7 @@ struct ext4_group_info {
 	unsigned long	bb_check_counter;
 #endif
 	struct rb_root  bb_free_root;
+	struct rb_root  bb_discard_root;
 	ext4_grpblk_t	bb_first_free;	/* first free block */
 	ext4_grpblk_t	bb_free;	/* total free blocks */
 	ext4_grpblk_t	bb_fragments;	/* nr of freespace fragments */
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 15715e7..feea439 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -332,6 +332,7 @@
 static struct kmem_cache *ext4_pspace_cachep;
 static struct kmem_cache *ext4_ac_cachep;
 static struct kmem_cache *ext4_free_data_cachep;
+static struct workqueue_struct *ext4_discard_wq;
 
 /* We create slab caches for groupinfo data structures based on the
  * superblock block size.  There will be one per mounted filesystem for
@@ -355,7 +356,10 @@ static inline struct ext4_free_data *efd_entry(struct rb_node *n)
 	return rb_entry_safe(n, struct ext4_free_data, efd_node);
 }
 static int ext4_insert_free_data(struct rb_root *root, struct ext4_free_data *nfd);
-
+static int ext4_try_to_trim_range(struct super_block *sb,
+		struct ext4_buddy *e4b, ext4_grpblk_t start,
+		ext4_grpblk_t max, ext4_grpblk_t minblocks);
+static void ext4_discard_work(struct work_struct *work);
 /*
  * The algorithm using this percpu seq counter goes below:
  * 1. We sample the percpu discard_pa_seq counter before trying for block
@@ -2657,6 +2661,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 	INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
 	init_rwsem(&meta_group_info[i]->alloc_sem);
 	meta_group_info[i]->bb_free_root = RB_ROOT;
+	meta_group_info[i]->bb_discard_root = RB_ROOT;
 	meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
 
 	mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group);
@@ -2857,6 +2862,7 @@ int ext4_mb_init(struct super_block *sb)
 	spin_lock_init(&sbi->s_md_lock);
 	spin_lock_init(&sbi->s_bal_lock);
 	atomic_set(&sbi->s_mb_free_pending, 0);
+	INIT_WORK(&sbi->s_discard_work, ext4_discard_work);
 
 	sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
 	sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
@@ -2949,6 +2955,15 @@ int ext4_mb_release(struct super_block *sb)
 	struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
 	int count;
 
+	if (test_opt(sb, DISCARD)) {
+		/*
+		 * wait the discard work to drain all of ext4_free_data
+		 */
+		queue_work(ext4_discard_wq, &sbi->s_discard_work);
+		flush_work(&sbi->s_discard_work);
+	}
+
+	queue_work(ext4_discard_wq, &sbi->s_discard_work);
 	if (sbi->s_group_info) {
 		for (i = 0; i < ngroups; i++) {
 			cond_resched();
@@ -3024,30 +3039,77 @@ static inline int ext4_issue_discard(struct super_block *sb,
 		return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
 }
 
-static void ext4_free_data_in_buddy(struct super_block *sb,
-				    struct ext4_free_data *entry)
+static void ext4_discard_work(struct work_struct *work)
 {
+	struct ext4_sb_info *sbi = container_of(work,
+			struct ext4_sb_info, s_discard_work);
+	struct super_block *sb = sbi->s_sb;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
+	struct ext4_group_info *grp;
+	struct ext4_free_data *fd, *nfd;
 	struct ext4_buddy e4b;
-	struct ext4_group_info *db;
-	int err, count = 0, count2 = 0;
+	int i, err;
+
+	for (i = 0; i < ngroups; i++) {
+		grp = ext4_get_group_info(sb, i);
+		if (RB_EMPTY_ROOT(&grp->bb_discard_root))
+			continue;
 
-	mb_debug(sb, "gonna free %u blocks in group %u (0x%p):",
-		 entry->efd_count, entry->efd_group, entry);
+		err = ext4_mb_load_buddy(sb, i, &e4b);
+		if (err) {
+			ext4_warning(sb, "Error %d loading buddy information for %u",
+					err, i);
+			continue;
+		}
 
-	err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
-	/* we expect to find existing buddy because it's pinned */
-	BUG_ON(err != 0);
+		ext4_lock_group(sb, i);
+		rbtree_postorder_for_each_entry_safe(fd, nfd,
+				&grp->bb_discard_root, efd_node) {
+			rb_erase(&fd->efd_node, &grp->bb_discard_root);
+			/*
+			 * If filesystem is umounting, give up the discard
+			 */
+			if (sb->s_flags & SB_ACTIVE)
+				ext4_try_to_trim_range(sb, &e4b, fd->efd_start_cluster,
+						fd->efd_start_cluster + fd->efd_count - 1, 1);
+			kmem_cache_free(ext4_free_data_cachep, fd);
+		}
+		ext4_unlock_group(sb, i);
+		ext4_mb_unload_buddy(&e4b);
+	}
+}
 
-	atomic_sub(entry->efd_count, &EXT4_SB(sb)->s_mb_free_pending);
+static int ext4_free_data_in_buddy(struct super_block *sb,
+		ext4_group_t group, tid_t commit_tid)
+{
+	struct ext4_buddy e4b;
+	struct ext4_group_info *db;
+	struct ext4_free_data *fd, *nfd;
+	int cnt, nr_fd;
 
+	/* we expect to find existing buddy because it's pinned */
+	BUG_ON(ext4_mb_load_buddy(sb, group, &e4b));
 	db = e4b.bd_info;
-	/* there are blocks to put in buddy to make them really free */
-	count += entry->efd_count;
-	count2++;
-	ext4_lock_group(sb, entry->efd_group);
-	/* Take it out of per group rb tree */
-	rb_erase(&entry->efd_node, &(db->bb_free_root));
-	mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
+	cnt = 0;
+	nr_fd = 0;
+	ext4_lock_group(sb, group);
+	rbtree_postorder_for_each_entry_safe(fd, nfd,
+			&db->bb_free_root, efd_node) {
+		if (fd->efd_tid != commit_tid)
+			continue;
+
+		mb_debug(sb, "gonna free %u blocks in group %u (0x%p):",
+			 fd->efd_count, fd->efd_group, fd);
+		atomic_sub(fd->efd_count, &EXT4_SB(sb)->s_mb_free_pending);
+		cnt += fd->efd_count;
+		nr_fd++;
+		rb_erase(&fd->efd_node, &db->bb_free_root);
+		mb_free_blocks(NULL, &e4b, fd->efd_start_cluster, fd->efd_count);
+		if (test_opt(sb, DISCARD))
+			ext4_insert_free_data(&db->bb_discard_root, fd);
+		else
+			kmem_cache_free(ext4_free_data_cachep, fd);
+	}
 
 	/*
 	 * Clear the trimmed flag for the group so that the next
@@ -3055,22 +3117,22 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
 	 * If the volume is mounted with -o discard, online discard
 	 * is supported and the free blocks will be trimmed online.
 	 */
-	if (!test_opt(sb, DISCARD))
+	if (!test_opt(sb, DISCARD) && cnt)
 		EXT4_MB_GRP_CLEAR_TRIMMED(db);
 
-	if (!db->bb_free_root.rb_node) {
+	if (RB_EMPTY_ROOT(&db->bb_free_root) && cnt) {
 		/* No more items in the per group rb tree
 		 * balance refcounts from ext4_mb_free_metadata()
 		 */
 		put_page(e4b.bd_buddy_page);
 		put_page(e4b.bd_bitmap_page);
 	}
-	ext4_unlock_group(sb, entry->efd_group);
-	kmem_cache_free(ext4_free_data_cachep, entry);
+	ext4_unlock_group(sb, group);
 	ext4_mb_unload_buddy(&e4b);
 
-	mb_debug(sb, "freed %d blocks in %d structures\n", count,
-		 count2);
+	mb_debug(sb, "freed %d blocks in %d structures\n", cnt, nr_fd);
+
+	return nr_fd;
 }
 
 /*
@@ -3081,52 +3143,21 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	ext4_group_t ngroups = ext4_get_groups_count(sb);
-	struct ext4_free_data *fd, *nfd;
 	struct ext4_group_info *grp;
-	struct bio *discard_bio = NULL;
-	struct list_head freed_data_list;
-	int err, i;
+	int i, nr;
 
 	if (!atomic_read(&sbi->s_mb_free_pending))
 		return;
 
-	INIT_LIST_HEAD(&freed_data_list);
-	for (i = 0; i < ngroups; i++) {
+	for (i = 0, nr = 0; i < ngroups; i++) {
 		grp = ext4_get_group_info(sb, i);
-		ext4_lock_group(sb, i);
-		rbtree_postorder_for_each_entry_safe(fd, nfd,
-				&grp->bb_free_root, efd_node) {
-			if (fd->efd_tid != commit_tid)
-				continue;
-			INIT_LIST_HEAD(&fd->efd_list);
-			list_add_tail(&fd->efd_list, &freed_data_list);
-		}
-		ext4_unlock_group(sb, i);
-	}
-
-	if (test_opt(sb, DISCARD)) {
-		list_for_each_entry(fd, &freed_data_list, efd_list) {
-			err = ext4_issue_discard(sb, fd->efd_group,
-						 fd->efd_start_cluster,
-						 fd->efd_count,
-						 &discard_bio);
-			if (err && err != -EOPNOTSUPP) {
-				ext4_msg(sb, KERN_WARNING, "discard request in"
-					 " group:%d block:%d count:%d failed"
-					 " with %d", fd->efd_group,
-					 fd->efd_start_cluster, fd->efd_count, err);
-			} else if (err == -EOPNOTSUPP)
-				break;
-		}
-
-		if (discard_bio) {
-			submit_bio_wait(discard_bio);
-			bio_put(discard_bio);
-		}
+		if (RB_EMPTY_ROOT(&grp->bb_free_root))
+			continue;
+		nr += ext4_free_data_in_buddy(sb, i, commit_tid);
 	}
 
-	list_for_each_entry_safe(fd, nfd, &freed_data_list, efd_list)
-		ext4_free_data_in_buddy(sb, fd);
+	if (nr && test_opt(sb, DISCARD))
+		queue_work(ext4_discard_wq, &sbi->s_discard_work);
 }
 
 int __init ext4_init_mballoc(void)
@@ -3146,8 +3177,13 @@ int __init ext4_init_mballoc(void)
 	if (ext4_free_data_cachep == NULL)
 		goto out_ac_free;
 
-	return 0;
+	ext4_discard_wq = alloc_workqueue("ext4discard", WQ_UNBOUND, 0);
+	if (!ext4_discard_wq)
+		goto out_free_data;
 
+	return 0;
+out_free_data:
+	kmem_cache_destroy(ext4_free_data_cachep);
 out_ac_free:
 	kmem_cache_destroy(ext4_ac_cachep);
 out_pa_free:
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index e75b474..d76286b 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -79,9 +79,6 @@
 #define MB_DEFAULT_MAX_INODE_PREALLOC	512
 
 struct ext4_free_data {
-	/* this links the free block information from sb_info */
-	struct list_head		efd_list;
-
 	/* this links the free block information from group_info */
 	struct rb_node			efd_node;
 
-- 
1.8.3.1


* Re: [PATCH V2 1/7] ext4: remove the 'group' parameter of ext4_trim_extent
  2021-05-26  8:42 ` [PATCH V2 1/7] ext4: remove the 'group' parameter of ext4_trim_extent Wang Jianchao
@ 2021-05-27 19:47   ` Andreas Dilger
  2021-06-23 13:13   ` Theodore Ts'o
  1 sibling, 0 replies; 15+ messages in thread
From: Andreas Dilger @ 2021-05-27 19:47 UTC (permalink / raw)
  To: Wang Jianchao
  Cc: Theodore Ts'o, Ext4 Developers List,
	Linux Kernel Mailing List, lishujin



> On May 26, 2021, at 2:42 AM, Wang Jianchao <jianchao.wan9@gmail.com> wrote:
> 
> Get rid of the 'group' parameter of ext4_trim_extent as we can get
> it from the 'e4b'.
> 
> Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com>

Reviewed-by: Andreas Dilger <adilger@dilger.ca>

> ---
> fs/ext4/mballoc.c | 9 ++++-----
> 1 file changed, 4 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index a02fadf..d81f1fd22 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -5650,19 +5650,19 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
>  * @sb:		super block for the file system
>  * @start:	starting block of the free extent in the alloc. group
>  * @count:	number of blocks to TRIM
> - * @group:	alloc. group we are working with
>  * @e4b:	ext4 buddy for the group
>  *
>  * Trim "count" blocks starting at "start" in the "group". To assure that no
>  * one will allocate those blocks, mark it as used in buddy bitmap. This must
>  * be called with under the group lock.
>  */
> -static int ext4_trim_extent(struct super_block *sb, int start, int count,
> -			     ext4_group_t group, struct ext4_buddy *e4b)
> +static int ext4_trim_extent(struct super_block *sb,
> +		int start, int count, struct ext4_buddy *e4b)
> __releases(bitlock)
> __acquires(bitlock)
> {
> 	struct ext4_free_extent ex;
> +	ext4_group_t group = e4b->bd_group;
> 	int ret = 0;
> 
> 	trace_ext4_trim_extent(sb, group, start, count);
> @@ -5738,8 +5738,7 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
> 		next = mb_find_next_bit(bitmap, max + 1, start);
> 
> 		if ((next - start) >= minblocks) {
> -			ret = ext4_trim_extent(sb, start,
> -					       next - start, group, &e4b);
> +			ret = ext4_trim_extent(sb, start, next - start, &e4b);
> 			if (ret && ret != -EOPNOTSUPP)
> 				break;
> 			ret = 0;
> --
> 1.8.3.1


Cheers, Andreas








* Re: [PATCH V2 7/7] ext4: get discard out of jbd2 commit kthread context
  2021-05-26  8:44           ` [PATCH V2 7/7] ext4: get discard out of jbd2 commit kthread context Wang Jianchao
@ 2021-06-22  0:55             ` Josh Triplett
  2023-09-06  0:11             ` Sarthak Kukreti
  1 sibling, 0 replies; 15+ messages in thread
From: Josh Triplett @ 2021-06-22  0:55 UTC (permalink / raw)
  To: Wang Jianchao
  Cc: Theodore Ts'o, Andreas Dilger, linux-ext4, linux-kernel, lishujin

On Wed, May 26, 2021 at 04:44:06PM +0800, Wang Jianchao wrote:
> Right now, discard is issued and waited on in the jbd2 commit
> kthread context after the logs are committed. When a large number
> of files are deleted and discard is flooding, the jbd2 commit
> kthread can be blocked for a long time. Then all of the metadata
> operations can be blocked waiting for log space.
> 
> One case is the page fault path, which holds mm->mmap_sem for read
> and wants to update the file time but has to wait for log space.
> When another thread in the task then wants to do mmap, its write
> lock on mmap_sem is blocked. Finally all of the subsequent readers
> of mmap_sem are blocked as well, even the ps command, which needs
> to read /proc/pid/cmdline. Our monitor service, which needs to read
> /proc/pid/cmdline, used to be blocked for 5 minutes.
> 
> This patch frees the blocks back to the buddy after commit and then
> does the discard in an async kworker context in fstrim fashion,
> namely,
>  - mark blocks to be discarded as used if they have not been allocated
>  - do discard
>  - mark them free
> After this, the jbd2 commit kthread won't be blocked by discard any
> more and we won't get ENOSPC even if the discard is slow or throttled.
> 
> Link: https://marc.info/?l=linux-kernel&m=162143690731901&w=2
> Suggested-by: Theodore Ts'o <tytso@mit.edu>
> Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com>

I tested this patch series atop 5.13.0-rc7 (which took a bit of
rebasing), and it works quite well. I tested ext4 on dm-crypt on a
laptop NVME drive, using this test case:

$ mkdir testdir
$ time python3 -c 'for i in range(1000000): open(f"testdir/{i}", "wb").write(b"test data")'
$ time rm -r testdir

I tried this with and without discard enabled.

Without this patch, if discard is enabled, the rm takes many minutes,
and stalls out the rest of the system.

With this patch, the rm takes the same amount of time (~17s) whether
discard is enabled or disabled, and the rest of the system continues to
function.

Tested-by: Josh Triplett <josh@joshtriplett.org>


* Re: [PATCH V2 5/7] ext4: get buddy cache after insert successfully
  2021-05-26  8:43         ` [PATCH V2 5/7] ext4: get buddy cache after insert successfully Wang Jianchao
@ 2021-06-23  3:06           ` Theodore Ts'o
  0 siblings, 0 replies; 15+ messages in thread
From: Theodore Ts'o @ 2021-06-23  3:06 UTC (permalink / raw)
  To: Wang Jianchao; +Cc: Andreas Dilger, linux-ext4, linux-kernel, lishujin

On Wed, May 26, 2021 at 04:43:43PM +0800, Wang Jianchao wrote:
> The references on bd_bitmap_page and bd_buddy_page should be taken
> only after the ext4_free_data has been inserted successfully.
> Otherwise, nobody can put them.
> 
> Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com>

This looks like it's a bug fix for patch 4/7 --- is that correct?  If
so, maybe we should fold this into the previous patch?

Thanks,

						- Ted


* Re: [PATCH V2 1/7] ext4: remove the 'group' parameter of ext4_trim_extent
  2021-05-26  8:42 ` [PATCH V2 1/7] ext4: remove the 'group' parameter of ext4_trim_extent Wang Jianchao
  2021-05-27 19:47   ` Andreas Dilger
@ 2021-06-23 13:13   ` Theodore Ts'o
  2021-06-23 19:49     ` Theodore Ts'o
  1 sibling, 1 reply; 15+ messages in thread
From: Theodore Ts'o @ 2021-06-23 13:13 UTC (permalink / raw)
  To: Wang Jianchao; +Cc: Andreas Dilger, linux-ext4, linux-kernel, lishujin

Hi Jianchao, 

FYI, this patch series has conflicts with these patches, which landed
in 5.13-rc1:

196e402adf2e - ext4: improve cr 0 / cr 1 group scanning
4b68f6df1059 - ext4: add MB_NUM_ORDERS macro
a6c75eaf1103 - ext4: add mballoc stats proc file
67d251860461 - ext4: drop s_mb_bal_lock and convert protected fields to atomic

The conflicts were relatively minor, but the obvious fix-ups resulted
in a large number of crashes caused by various stress tests, such as
generic/068 and generic/204.  I'm currently investigating to see what
I might have messed up when I tried applying these patches, as well as
running your patch set applied against 5.12 to make sure the problems
weren't introduced by the patch set itself.

					- Ted


* Re: [PATCH V2 1/7] ext4: remove the 'group' parameter of ext4_trim_extent
  2021-06-23 13:13   ` Theodore Ts'o
@ 2021-06-23 19:49     ` Theodore Ts'o
  2021-07-02 10:27       ` Josh Triplett
  2021-07-05 11:28       ` Wang Jianchao
  0 siblings, 2 replies; 15+ messages in thread
From: Theodore Ts'o @ 2021-06-23 19:49 UTC (permalink / raw)
  To: Wang Jianchao
  Cc: Andreas Dilger, linux-ext4, linux-kernel, lishujin, Josh Triplett

CC'ed Josh since he was testing this patchset.

On Wed, Jun 23, 2021 at 09:13:42AM -0400, Theodore Ts'o wrote:
> 
> The conflicts were relatively minor, but the obvious fix-ups resulted
> in a large number of crashes caused by various stress tests, such as
> generic/068 and generic/204.  I'm currently investigating to see what
> I might have messed up when I tried applying these patches, as well as
> running your patch set applied against 5.12 to make sure the problems
> weren't introduced by the patch set itself.

I applied this patch set on top of 5.12, where it applied w/o any
patch conflicts, and I'm still seeing a large number of crashes[1].  The
crashes are in a wide variety of tests, and many of the stack traces
involve the ext4_discard_work workqueue.

[1] I ran the test via "gce-xfstests ltm -c ext4/all -g auto --kernel
gs://<bucket>/kernel.deb", but almost all of the file system configs
were crashing.  So "kvm-xfstests -c ext4/4k -g auto" should be
sufficient to reproduce the problem.  kvm-xfstests can be found at
https://github.com/tytso/xfstests-bld.

I don't have time to debug this more closely right now, but my rough
guess is that the workqueue isn't properly getting shut down during a
file system unmount, and this is causing all sorts of mischief.

       	  	     	      	 	     - Ted


Jun 23 09:04:46 debian kernel: [    0.000000] Linux version 5.12.0-xfstests-00007-g1d72c24b1506 (tytso@cwcc) (gcc (Debian 10.2.1-6) 10.2.1 20210110, GNU ld (GNU Binutils for Debian) 2.35.2) #188 SMP Wed Jun 23 08:59:01 EDT 2021
Jun 23 09:04:46 debian kernel: [    0.000000] Command line: BOOT_IMAGE=/boot/vmlinuz-5.12.0-xfstests-00007-g1d72c24b1506 root=UUID=d7961c9f-8636-4194-bcb7-b7861b5a470f ro console=ttyS0,115200 earlyprintk=ttyS0,115200 elevator=noop console=ttyS0 net.ifnames=0 biosdevname=0 mem=7680M fstestcfg=ext4/4k fstestset=-g,auto fstestexc= fstestopt=aex fstesttyp=ext4 fstestapi=1.5 fsteststr= nfssrv= orig_cmdline=LS1pbnN0YW5jZS1uYW1lIHhmc3Rlc3RzLWx0bS0yMDIxMDYyMzA5MDI0Ny1hYSAtLWdjZS16b25lIHVzLXdlc3QxLWEgLS1ncy1idWNrZXQgZ2NlLXhmc3Rlc3RzIC0taW1hZ2UtcHJvamVjdCBnY2UteGZzdGVzdHMgLS1rZXJuZWwgZ3M6Ly9nY2UteGZzdGVzdHMva2VybmVsLmRlYiAtLWJ1Y2tldC1zdWJkaXIgcmVzdWx0cyAtLW5vLWVtYWlsIC1jIGV4dDQvNGsgZnVsbA==
	...
[  954.782827] run fstests generic/041 at 2021-06-23 09:20:30
Jun 23 09:20:30 xfstests-ltm-20210623090247-aa systemd[1]: Started /usr/bin/bash -c test -w /proc/self/oom_score_adj && echo 250 > /proc/self/oom_score_adj; exec ./tests/generic/041.
Jun 23 09:20:31 xfstests-ltm-20210623090247-aa kernel: [  955.288215] EXT4-fs (dm-0): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
Jun 23 09:20:31 xfstests-ltm-20210623090247-aa kernel: [  955.879898] EXT4-fs (dm-2): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
Jun 23 09:20:50 xfstests-ltm-20210623090247-aa systemd[1]: xt\x2dvdc.mount: Succeeded.
Jun 23 09:20:50 xfstests-ltm-20210623090247-aa systemd[921]: xt\x2dvdc.mount: Succeeded.
Jun 23 09:20:51 xfstests-ltm-20210623090247-aa kernel: [  975.090048] EXT4-fs (dm-2): recovery complete
Jun 23 09:20:51 xfstests-ltm-20210623090247-aa kernel: [  975.090776] EXT4-fs (dm-2): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
Jun 23 09:21:01 xfstests-ltm-20210623090247-aa systemd[1]: xt\x2dvdc.mount: Succeeded.
Jun 23 09:21:01 xfstests-ltm-20210623090247-aa systemd[921]: xt\x2dvdc.mount: Succeeded.
[  985.883114] BUG: kernel NULL pointer dereference, address: 0000000000000040
[  985.890856] #PF: supervisor read access in kernel mode
[  985.896241] #PF: error_code(0x0000) - not-present page
[  985.901665] PGD 0 P4D 0 
[  985.904648] Oops: 0000 [#1] SMP PTI
[  985.908265] CPU: 0 PID: 5738 Comm: kworker/u4:2 Not tainted 5.12.0-xfstests-00007-g1d72c24b1506 #188
[  985.917880] Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
[  985.927582] Workqueue: ext4discard ext4_discard_work
[  985.932662] RIP: 0010:ext4_discard_work+0x31/0x3f0
[  985.937653] Code: 41 56 41 55 41 54 55 53 48 83 ec 50 48 8b af 88 fe ff ff 65 48 8b 04 25 28 00 00 00 48 89 44 24 48 31 c0 48 8b 85 e8 06 00 00 <8b> 50 40 85 d2 48 89 14 24 0f 84 82 02 00 00 31 db eb 2c 41 89 d9
[  985.956879] RSP: 0018:ffffa527052c7dd8 EFLAGS: 00010246
[  985.962235] RAX: 0000000000000000 RBX: ffff8db25dd98000 RCX: 0000000000000001
[  985.969496] RDX: 0000000000000000 RSI: ffffffffa0adde80 RDI: ffff8db25fff3510
[  985.976997] RBP: ffff8db25fc2f000 R08: 0000000000000000 R09: 0000000000040479
[  985.985066] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8db24011b800
[  985.992332] R13: ffff8db240822500 R14: 0000000000000000 R15: 0000000000000000
[  985.999966] FS:  0000000000000000(0000) GS:ffff8db319200000(0000) knlGS:0000000000000000
[  986.008242] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  986.014229] CR2: 0000000000000040 CR3: 000000002c212006 CR4: 00000000003706f0
[  986.021660] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  986.029742] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  986.037346] Call Trace:
[  986.039958]  ? lock_acquire.part.0+0x5c/0x120
[  986.044430]  ? process_one_work+0x1fb/0x590
[  986.048844]  ? lock_is_held_type+0x98/0x100
[  986.053742]  process_one_work+0x27b/0x590
[  986.058146]  worker_thread+0x4e/0x300
[  986.061953]  ? process_one_work+0x590/0x590
[  986.066255]  kthread+0x13a/0x150
[  986.069770]  ? __kthread_bind_mask+0x60/0x60
[  986.074420]  ret_from_fork+0x22/0x30
[  986.078269] CR2: 0000000000000040
[  986.081703] ---[ end trace 01e941885177e812 ]---
[  986.086543] RIP: 0010:ext4_discard_work+0x31/0x3f0
[  986.091558] Code: 41 56 41 55 41 54 55 53 48 83 ec 50 48 8b af 88 fe ff ff 65 48 8b 04 25 28 00 00 00 48 89 44 24 48 31 c0 48 8b 85 e8 06 00 00 <8b> 50 40 85 d2 48 89 14 24 0f 84 82 02 00 00 31 db eb 2c 41 89 d9
[  986.110810] RSP: 0018:ffffa527052c7dd8 EFLAGS: 00010246
[  986.116344] RAX: 0000000000000000 RBX: ffff8db25dd98000 RCX: 0000000000000001
[  986.123850] RDX: 0000000000000000 RSI: ffffffffa0adde80 RDI: ffff8db25fff3510
[  986.131096] RBP: ffff8db25fc2f000 R08: 0000000000000000 R09: 0000000000040479
[  986.138443] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8db24011b800
[  986.145968] R13: ffff8db240822500 R14: 0000000000000000 R15: 0000000000000000
[  986.153412] FS:  0000000000000000(0000) GS:ffff8db319200000(0000) knlGS:0000000000000000
[  986.161707] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  986.167563] CR2: 0000000000000040 CR3: 000000002c212006 CR4: 00000000003706f0
[  986.175089] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  986.182509] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  986.190000] BUG: sleeping function called from invalid context at include/linux/percpu-rwsem.h:49
[  986.199449] in_atomic(): 0, irqs_disabled(): 1, non_block: 0, pid: 5738, name: kworker/u4:2
[  986.208016] INFO: lockdep is turned off.
[  986.212055] irq event stamp: 559784
[  986.215784] hardirqs last  enabled at (559783): [<ffffffff9ff78ec4>] _raw_spin_unlock_irq+0x24/0x30
[  986.225129] hardirqs last disabled at (559784): [<ffffffff9ff69405>] exc_page_fault+0x35/0x240
[  986.233856] softirqs last  enabled at (559776): [<ffffffff9f50b8cd>] wb_workfn+0x18d/0x350
[  986.242258] softirqs last disabled at (559772): [<ffffffff9f45cd8e>] wb_wakeup_delayed+0x2e/0x70
[  986.251281] CPU: 0 PID: 5738 Comm: kworker/u4:2 Tainted: G      D           5.12.0-xfstests-00007-g1d72c24b1506 #188
[  986.261926] Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
[  986.271275] Workqueue: ext4discard ext4_discard_work
[  986.276382] Call Trace:
[  986.279012]  dump_stack+0x6d/0x89
[  986.283240]  ___might_sleep.cold+0xa6/0xb6
[  986.287489]  exit_signals+0x1c/0x2c0
[  986.291417]  do_exit+0xa6/0x3c0
[  986.294672]  rewind_stack_do_exit+0x17/0x20
[  986.299092] RIP: 0000:0x0
[  986.301940] Code: Unable to access opcode bytes at RIP 0xffffffffffffffd6.
[  986.309322] RSP: 0000:0000000000000000 EFLAGS: 00000000 ORIG_RAX: 0000000000000000
[  986.317124] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
[  986.324586] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
[  986.331945] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
[  986.339473] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
[  986.346718] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.883114] BUG: kernel NULL pointer dereference, address: 0000000000000040
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.890856] #PF: supervisor read access in kernel mode
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.896241] #PF: error_code(0x0000) - not-present page
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.901665] PGD 0 P4D 0 
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.904648] Oops: 0000 [#1] SMP PTI
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.908265] CPU: 0 PID: 5738 Comm: kworker/u4:2 Not tainted 5.12.0-xfstests-00007-g1d72c24b1506 #188
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.917880] Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.927582] Workqueue: ext4discard ext4_discard_work
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.932662] RIP: 0010:ext4_discard_work+0x31/0x3f0
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.937653] Code: 41 56 41 55 41 54 55 53 48 83 ec 50 48 8b af 88 fe ff ff 65 48 8b 04 25 28 00 00 00 48 89 44 24 48 31 c0 48 8b 85 e8 06 00 00 <8b> 50 40 85 d2 48 89 14 24 0f 84 82 02 00 00 31 db eb 2c 41 89 d9
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.956879] RSP: 0018:ffffa527052c7dd8 EFLAGS: 00010246
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.962235] RAX: 0000000000000000 RBX: ffff8db25dd98000 RCX: 0000000000000001
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.969496] RDX: 0000000000000000 RSI: ffffffffa0adde80 RDI: ffff8db25fff3510
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.976997] RBP: ffff8db25fc2f000 R08: 0000000000000000 R09: 0000000000040479
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.985066] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8db24011b800
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.992332] R13: ffff8db240822500 R14: 0000000000000000 R15: 0000000000000000
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.999966] FS:  0000000000000000(0000) GS:ffff8db319200000(0000) knlGS:0000000000000000
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.008242] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.014229] CR2: 0000000000000040 CR3: 000000002c212006 CR4: 00000000003706f0
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.021660] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.029742] DR3: 0000000000000000 DR6: 0000[  986.603186] run fstests generic/043 at 2021-06-23 09:21:02
0000fffe0ff0 DR7: 0000000000000400
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.037346] Call Trace:
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.039958]  ? lock_acquire.part.0+0x5c/0x120
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.044430]  ? process_one_work+0x1fb/0x590
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.048844]  ? lock_is_held_type+0x98/0x100
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.053742]  process_one_work+0x27b/0x590
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.058146]  worker_thread+0x4e/0x300
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.061953]  ? process_one_work+0x590/0x590
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.066255]  kthread+0x13a/0x150
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.069770]  ? __kthread_bind_mask+0x60/0x60
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.074420]  ret_from_fork+0x22/0x30
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.078269] CR2: 0000000000000040
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.081703] ---[ end trace 01e941885177e812 ]---
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.086543] RIP: 0010:ext4_discard_work+0x31/0x3f0
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.091558] Code: 41 56 41 55 41 54 55 53 48 83 ec 50 48 8b af 88 fe ff ff 65 48 8b 04 25 28 00 00 00 48 89 44 24 48 31 c0 48 8b 85 e8 06 00 00 <8b> 50 40 85 d2 48 89 14 24 0f 84 82 02 00 00 31 db eb 2c 41 89 d9
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.110810] RSP: 0018:ffffa527052c7dd8 EFLAGS: 00010246
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.116344] RAX: 0000000000000000 RBX: ffff8db25dd98000 RCX: 0000000000000001
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.123850] RDX: 0000000000000000 RSI: ffffffffa0adde80 RDI: ffff8db25fff3510
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.131096] RBP: ffff8db25fc2f000 R08: 0000000000000000 R09: 0000000000040479
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.138443] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8db24011b800
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.145968] R13: ffff8db240822500 R14: 0000000000000000 R15: 0000000000000000
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.153412] FS:  0000000000000000(0000) GS:ffff8db319200000(0000) knlGS:0000000000000000
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.161707] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  986.843267] EXT4-fs (dm-0): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.167563] CR2: 0000000000000040 CR3: 000000002c212006 CR4: 00000000003706f0
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.175089] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.182509] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.190000] BUG: sleeping function called from invalid context at include/linux/percpu-rwsem.h:49
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.199449] in_atomic(): 0, irqs_disabled(): 1, non_block: 0, pid: 5738, name: kworker/u4:2
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.208016] INFO: lockdep is turned off.
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.212055] irq event stamp: 559784
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.215784] hardirqs last  enabled at (559783): [<ffffffff9ff78ec4>] _raw_spin_unlock_irq+0x24/0x30
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.225129] hardirqs last disabled at (559784): [<ffffffff9ff69405>] exc_page_fault+0x35/0x240
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.233856] softirqs last  enabled at (559776): [<ffffffff9f50b8cd>] wb_workfn+0x18d/0x350
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.242258] softirqs last disabled at (559772): [<ffffffff9f45cd8e>] wb_wakeup_delayed+0x2e/0x70
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.251281] CPU: 0 PID: 5738 Comm: kworker/u4:2 Tainted: G      D           5.12.0-xfstests-00007-g1d72c24b1506 #188
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.261926] Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.271275] Workqueue: ext4discard ext4_discard_work
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.276382] Call Trace:
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.279012]  dump_stack+0x6d/0x89
[  987.049261] EXT4-fs (dm-1): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.283240]  ___might_sleep.cold+0xa6/0xb6
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.287489]  exit_signals+0x1c/0x2c0
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.291417]  do_exit+0xa6/0x3c0
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.294672]  rewind_stack_do_exit+0x17/0x20
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.299092] RIP: 0000:0x0
[  987.102134] EXT4-fs (dm-1): shut down requested (1)
[  987.107927] Aborting journal on device dm-1-8.
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.301940] Code: Unable to access opcode bytes at RIP 0xffffffffffffffd6.
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.309322] RSP: 0000:0000000000000000 EFLAGS: 00000000 ORIG_RAX: 0000000000000000
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.317124] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.324586] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.331945] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.339473] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.346718] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa systemd[1]: fstests-generic-041.scope: Succeeded.
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa systemd[921]: xt\x2dvdb.mount: Succeeded.
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa systemd[1]: xt\x2dvdb.mount: Succeeded.
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa systemd[1]: Started /usr/bin/bash -c test -w /proc/self/oom_score_adj && echo 250 > /proc/self/oom_score_adj; exec ./tests/generic/043.
Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.843267] EXT4-fs (dm-0): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
Jun 23 09:21:03 xfstests-ltm-20210623090247-aa kernel: [  987.049261] EXT4-fs (dm-1): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
Jun 23 09:21:03 xfstests-ltm-20210623090247-aa kernel: [  987.102134] EXT4-fs (dm-1): shut down requested (1)
[  987.284882] EXT4-fs (dm-1): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
Jun 23 09:21:03 xfstests-ltm-20210623090247-aa kernel: [  987.107927] Aborting journal on device dm-1-8.
Jun 23 09:21:03 xfstests-ltm-20210623090247-aa systemd[1]: xt\x2dvdc.mount: Succeeded.
Jun 23 09:21:03 xfstests-ltm-20210623090247-aa systemd[921]: xt\x2dvdc.mount: Succeeded.
Jun 23 09:21:03 xfstests-ltm-20210623090247-aa kernel: [  987.284882] EXT4-fs (dm-1): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
[  999.135257] EXT4-fs (dm-1): shut down requested (2)
[  999.140764] Aborting journal on device dm-1-8.
Jun 23 09:21:15 xfstests-ltm-20210623090247-aa kernel: [  999.135257] EXT4-fs (dm-1): shut down requested (2)
Jun 23 09:21:15 xfstests-ltm-20210623090247-aa kernel: [  999.140764] Aborting journal on device dm-1-8.
Jun 23 09:21:15 xfstests-ltm-20210623090247-aa systemd[1]: xt\x2dvdc.mount: Succeeded.
Jun 23 09:21:15 xfstests-ltm-20210623090247-aa systemd[921]: xt\x2dvdc.mount: Succeeded.
[  999.206122] EXT4-fs (dm-1): recovery complete

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH V2 1/7] ext4: remove the 'group' parameter of ext4_trim_extent
  2021-06-23 19:49     ` Theodore Ts'o
@ 2021-07-02 10:27       ` Josh Triplett
  2021-07-05 11:28       ` Wang Jianchao
  1 sibling, 0 replies; 15+ messages in thread
From: Josh Triplett @ 2021-07-02 10:27 UTC (permalink / raw)
  To: Theodore Ts'o
  Cc: Wang Jianchao, Andreas Dilger, linux-ext4, linux-kernel, lishujin

On Wed, Jun 23, 2021 at 03:49:52PM -0400, Theodore Ts'o wrote:
> CC'ed Josh since he was testing this patchset.
> 
> On Wed, Jun 23, 2021 at 09:13:42AM -0400, Theodore Ts'o wrote:
> > 
> > The conflicts were relatively minor, but the obvious fix-ups resulted
> > in a large number of crashes caused by various stress tests, such as
> > generic/068 and generic/204.  I'm currently investigating to see what
> > I might have messed up when I tried applying these patches, as well as
> > running your patch set applied against 5.12 to make sure the problems
> > weren't introduced by the patch set itself.
> 
> I applied this patch set on top of 5.12, where it applied w/o any
> patch conflicts, and I'm still seeing a large number of crashes[1].  The
> crashes are in a wide variety of tests, and many of the stack traces
> involve the ext4_discard_work workqueue.

I applied this on top of a96bfed64c8986d6404e553f18203cae1f5ac7e6, which
required some fixups. I didn't try to run any kind of intensive
stress-tests, but I've run the deletion performance test I mentioned, as
well as various software compiles and other miscellaneous work. I
haven't run into any apparent issues.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH V2 1/7] ext4: remove the 'group' parameter of ext4_trim_extent
  2021-06-23 19:49     ` Theodore Ts'o
  2021-07-02 10:27       ` Josh Triplett
@ 2021-07-05 11:28       ` Wang Jianchao
  1 sibling, 0 replies; 15+ messages in thread
From: Wang Jianchao @ 2021-07-05 11:28 UTC (permalink / raw)
  To: Theodore Ts'o
  Cc: Andreas Dilger, linux-ext4, linux-kernel, lishujin, Josh Triplett

Hi Ted

Thanks so much for your testing.
I've recently been busy with some other work, so I'm sorry for missing the mails.
I will debug and rework this patch set once the project on hand is finished.

Best regards
Jianchao

On 2021/6/24 3:49 AM, Theodore Ts'o wrote:
> CC'ed Josh since he was testing this patchset.
> 
> On Wed, Jun 23, 2021 at 09:13:42AM -0400, Theodore Ts'o wrote:
>>
>> The conflicts were relatively minor, but the obvious fix-ups resulted
>> in a large number of crashes caused by various stress tests, such as
>> generic/068 and generic/204.  I'm currently investigating to see what
>> I might have messed up when I tried applying these patches, as well as
>> running your patch set applied against 5.12 to make sure the problems
>> weren't introduced by the patch set itself.
> 
> I applied this patch set on top of 5.12, where it applied w/o any
> patch conflicts, and I'm still seeing a large number of crashes[1].  The
> crashes are in a wide variety of tests, and many of the stack traces
> involve the ext4_discard_work workqueue.
> 
> [1] I ran the test via "gce-xfstests ltm -c ext4/all -g auto --kernel
> gs://<bucket>/kernel.deb", but almost all of the file system configs
> were crashing.  So "kvm-xfstests -c ext4/4k -g auto" should be
> sufficient to reproduce the problem.  kvm-xfstests can be found at
> https://github.com/tytso/xfstests-bld.
> 
> I don't have time to debug this more closely right now, but my rough
> guess is that the workqueue isn't properly getting shut down during a
> file system unmount, and this is causing all sorts of mischief.
> 
>        	  	     	      	 	     - Ted
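
One thing I will double-check when I rework this: in the ext4_mb_release()
hunk of patch 7/7, the discard work gets queued once more *after* the
flush_work().  A rough sketch of the teardown ordering I would expect,
assuming the work function must not run once the group info is being
torn down (just a sketch, not a tested fix):

	if (test_opt(sb, DISCARD)) {
		/* push any remaining entries through the worker and wait */
		queue_work(ext4_discard_wq, &sbi->s_discard_work);
		flush_work(&sbi->s_discard_work);
	}
	/*
	 * No re-queue here: re-arming the work after the flush could let
	 * it run concurrently with the teardown of sbi->s_group_info below.
	 */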
> 
> 
> Jun 23 09:04:46 debian kernel: [    0.000000] Linux version 5.12.0-xfstests-00007-g1d72c24b1506 (tytso@cwcc) (gcc (Debian 10.2.1-6) 10.2.1 20210110, GNU ld (GNU Binutils for Debian) 2.35.2) #188 SMP Wed Jun 23 08:59:01 EDT 2021
> Jun 23 09:04:46 debian kernel: [    0.000000] Command line: BOOT_IMAGE=/boot/vmlinuz-5.12.0-xfstests-00007-g1d72c24b1506 root=UUID=d7961c9f-8636-4194-bcb7-b7861b5a470f ro console=ttyS0,115200 earlyprintk=ttyS0,115200 elevator=noop console=ttyS0 net.ifnames=0 biosdevname=0 mem=7680M fstestcfg=ext4/4k fstestset=-g,auto fstestexc= fstestopt=aex fstesttyp=ext4 fstestapi=1.5 fsteststr= nfssrv= orig_cmdline=LS1pbnN0YW5jZS1uYW1lIHhmc3Rlc3RzLWx0bS0yMDIxMDYyMzA5MDI0Ny1hYSAtLWdjZS16b25lIHVzLXdlc3QxLWEgLS1ncy1idWNrZXQgZ2NlLXhmc3Rlc3RzIC0taW1hZ2UtcHJvamVjdCBnY2UteGZzdGVzdHMgLS1rZXJuZWwgZ3M6Ly9nY2UteGZzdGVzdHMva2VybmVsLmRlYiAtLWJ1Y2tldC1zdWJkaXIgcmVzdWx0cyAtLW5vLWVtYWlsIC1jIGV4dDQvNGsgZnVsbA==
> 	...
> [  954.782827] run fstests generic/041 at 2021-06-23 09:20:30
> Jun 23 09:20:30 xfstests-ltm-20210623090247-aa systemd[1]: Started /usr/bin/bash -c test -w /proc/self/oom_score_adj && echo 250 > /proc/self/oom_score_adj; exec ./tests/generic/041.
> Jun 23 09:20:31 xfstests-ltm-20210623090247-aa kernel: [  955.288215] EXT4-fs (dm-0): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
> Jun 23 09:20:31 xfstests-ltm-20210623090247-aa kernel: [  955.879898] EXT4-fs (dm-2): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
> Jun 23 09:20:50 xfstests-ltm-20210623090247-aa systemd[1]: xt\x2dvdc.mount: Succeeded.
> Jun 23 09:20:50 xfstests-ltm-20210623090247-aa systemd[921]: xt\x2dvdc.mount: Succeeded.
> Jun 23 09:20:51 xfstests-ltm-20210623090247-aa kernel: [  975.090048] EXT4-fs (dm-2): recovery complete
> Jun 23 09:20:51 xfstests-ltm-20210623090247-aa kernel: [  975.090776] EXT4-fs (dm-2): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
> Jun 23 09:21:01 xfstests-ltm-20210623090247-aa systemd[1]: xt\x2dvdc.mount: Succeeded.
> Jun 23 09:21:01 xfstests-ltm-20210623090247-aa systemd[921]: xt\x2dvdc.mount: Succeeded.
> [  985.883114] BUG: kernel NULL pointer dereference, address: 0000000000000040
> [  985.890856] #PF: supervisor read access in kernel mode
> [  985.896241] #PF: error_code(0x0000) - not-present page
> [  985.901665] PGD 0 P4D 0 
> [  985.904648] Oops: 0000 [#1] SMP PTI
> [  985.908265] CPU: 0 PID: 5738 Comm: kworker/u4:2 Not tainted 5.12.0-xfstests-00007-g1d72c24b1506 #188
> [  985.917880] Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
> [  985.927582] Workqueue: ext4discard ext4_discard_work
> [  985.932662] RIP: 0010:ext4_discard_work+0x31/0x3f0
> [  985.937653] Code: 41 56 41 55 41 54 55 53 48 83 ec 50 48 8b af 88 fe ff ff 65 48 8b 04 25 28 00 00 00 48 89 44 24 48 31 c0 48 8b 85 e8 06 00 00 <8b> 50 40 85 d2 48 89 14 24 0f 84 82 02 00 00 31 db eb 2c 41 89 d9
> [  985.956879] RSP: 0018:ffffa527052c7dd8 EFLAGS: 00010246
> [  985.962235] RAX: 0000000000000000 RBX: ffff8db25dd98000 RCX: 0000000000000001
> [  985.969496] RDX: 0000000000000000 RSI: ffffffffa0adde80 RDI: ffff8db25fff3510
> [  985.976997] RBP: ffff8db25fc2f000 R08: 0000000000000000 R09: 0000000000040479
> [  985.985066] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8db24011b800
> [  985.992332] R13: ffff8db240822500 R14: 0000000000000000 R15: 0000000000000000
> [  985.999966] FS:  0000000000000000(0000) GS:ffff8db319200000(0000) knlGS:0000000000000000
> [  986.008242] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  986.014229] CR2: 0000000000000040 CR3: 000000002c212006 CR4: 00000000003706f0
> [  986.021660] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [  986.029742] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [  986.037346] Call Trace:
> [  986.039958]  ? lock_acquire.part.0+0x5c/0x120
> [  986.044430]  ? process_one_work+0x1fb/0x590
> [  986.048844]  ? lock_is_held_type+0x98/0x100
> [  986.053742]  process_one_work+0x27b/0x590
> [  986.058146]  worker_thread+0x4e/0x300
> [  986.061953]  ? process_one_work+0x590/0x590
> [  986.066255]  kthread+0x13a/0x150
> [  986.069770]  ? __kthread_bind_mask+0x60/0x60
> [  986.074420]  ret_from_fork+0x22/0x30
> [  986.078269] CR2: 0000000000000040
> [  986.081703] ---[ end trace 01e941885177e812 ]---
> [  986.086543] RIP: 0010:ext4_discard_work+0x31/0x3f0
> [  986.091558] Code: 41 56 41 55 41 54 55 53 48 83 ec 50 48 8b af 88 fe ff ff 65 48 8b 04 25 28 00 00 00 48 89 44 24 48 31 c0 48 8b 85 e8 06 00 00 <8b> 50 40 85 d2 48 89 14 24 0f 84 82 02 00 00 31 db eb 2c 41 89 d9
> [  986.110810] RSP: 0018:ffffa527052c7dd8 EFLAGS: 00010246
> [  986.116344] RAX: 0000000000000000 RBX: ffff8db25dd98000 RCX: 0000000000000001
> [  986.123850] RDX: 0000000000000000 RSI: ffffffffa0adde80 RDI: ffff8db25fff3510
> [  986.131096] RBP: ffff8db25fc2f000 R08: 0000000000000000 R09: 0000000000040479
> [  986.138443] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8db24011b800
> [  986.145968] R13: ffff8db240822500 R14: 0000000000000000 R15: 0000000000000000
> [  986.153412] FS:  0000000000000000(0000) GS:ffff8db319200000(0000) knlGS:0000000000000000
> [  986.161707] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  986.167563] CR2: 0000000000000040 CR3: 000000002c212006 CR4: 00000000003706f0
> [  986.175089] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [  986.182509] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [  986.190000] BUG: sleeping function called from invalid context at include/linux/percpu-rwsem.h:49
> [  986.199449] in_atomic(): 0, irqs_disabled(): 1, non_block: 0, pid: 5738, name: kworker/u4:2
> [  986.208016] INFO: lockdep is turned off.
> [  986.212055] irq event stamp: 559784
> [  986.215784] hardirqs last  enabled at (559783): [<ffffffff9ff78ec4>] _raw_spin_unlock_irq+0x24/0x30
> [  986.225129] hardirqs last disabled at (559784): [<ffffffff9ff69405>] exc_page_fault+0x35/0x240
> [  986.233856] softirqs last  enabled at (559776): [<ffffffff9f50b8cd>] wb_workfn+0x18d/0x350
> [  986.242258] softirqs last disabled at (559772): [<ffffffff9f45cd8e>] wb_wakeup_delayed+0x2e/0x70
> [  986.251281] CPU: 0 PID: 5738 Comm: kworker/u4:2 Tainted: G      D           5.12.0-xfstests-00007-g1d72c24b1506 #188
> [  986.261926] Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
> [  986.271275] Workqueue: ext4discard ext4_discard_work
> [  986.276382] Call Trace:
> [  986.279012]  dump_stack+0x6d/0x89
> [  986.283240]  ___might_sleep.cold+0xa6/0xb6
> [  986.287489]  exit_signals+0x1c/0x2c0
> [  986.291417]  do_exit+0xa6/0x3c0
> [  986.294672]  rewind_stack_do_exit+0x17/0x20
> [  986.299092] RIP: 0000:0x0
> [  986.301940] Code: Unable to access opcode bytes at RIP 0xffffffffffffffd6.
> [  986.309322] RSP: 0000:0000000000000000 EFLAGS: 00000000 ORIG_RAX: 0000000000000000
> [  986.317124] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
> [  986.324586] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
> [  986.331945] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
> [  986.339473] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
> [  986.346718] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.883114] BUG: kernel NULL pointer dereference, address: 0000000000000040
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.890856] #PF: supervisor read access in kernel mode
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.896241] #PF: error_code(0x0000) - not-present page
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.901665] PGD 0 P4D 0 
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.904648] Oops: 0000 [#1] SMP PTI
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.908265] CPU: 0 PID: 5738 Comm: kworker/u4:2 Not tainted 5.12.0-xfstests-00007-g1d72c24b1506 #188
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.917880] Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.927582] Workqueue: ext4discard ext4_discard_work
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.932662] RIP: 0010:ext4_discard_work+0x31/0x3f0
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.937653] Code: 41 56 41 55 41 54 55 53 48 83 ec 50 48 8b af 88 fe ff ff 65 48 8b 04 25 28 00 00 00 48 89 44 24 48 31 c0 48 8b 85 e8 06 00 00 <8b> 50 40 85 d2 48 89 14 24 0f 84 82 02 00 00 31 db eb 2c 41 89 d9
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.956879] RSP: 0018:ffffa527052c7dd8 EFLAGS: 00010246
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.962235] RAX: 0000000000000000 RBX: ffff8db25dd98000 RCX: 0000000000000001
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.969496] RDX: 0000000000000000 RSI: ffffffffa0adde80 RDI: ffff8db25fff3510
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.976997] RBP: ffff8db25fc2f000 R08: 0000000000000000 R09: 0000000000040479
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.985066] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8db24011b800
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.992332] R13: ffff8db240822500 R14: 0000000000000000 R15: 0000000000000000
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  985.999966] FS:  0000000000000000(0000) GS:ffff8db319200000(0000) knlGS:0000000000000000
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.008242] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.014229] CR2: 0000000000000040 CR3: 000000002c212006 CR4: 00000000003706f0
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.021660] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.029742] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [  986.603186] run fstests generic/043 at 2021-06-23 09:21:02
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.037346] Call Trace:
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.039958]  ? lock_acquire.part.0+0x5c/0x120
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.044430]  ? process_one_work+0x1fb/0x590
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.048844]  ? lock_is_held_type+0x98/0x100
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.053742]  process_one_work+0x27b/0x590
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.058146]  worker_thread+0x4e/0x300
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.061953]  ? process_one_work+0x590/0x590
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.066255]  kthread+0x13a/0x150
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.069770]  ? __kthread_bind_mask+0x60/0x60
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.074420]  ret_from_fork+0x22/0x30
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.078269] CR2: 0000000000000040
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.081703] ---[ end trace 01e941885177e812 ]---
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.086543] RIP: 0010:ext4_discard_work+0x31/0x3f0
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.091558] Code: 41 56 41 55 41 54 55 53 48 83 ec 50 48 8b af 88 fe ff ff 65 48 8b 04 25 28 00 00 00 48 89 44 24 48 31 c0 48 8b 85 e8 06 00 00 <8b> 50 40 85 d2 48 89 14 24 0f 84 82 02 00 00 31 db eb 2c 41 89 d9
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.110810] RSP: 0018:ffffa527052c7dd8 EFLAGS: 00010246
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.116344] RAX: 0000000000000000 RBX: ffff8db25dd98000 RCX: 0000000000000001
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.123850] RDX: 0000000000000000 RSI: ffffffffa0adde80 RDI: ffff8db25fff3510
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.131096] RBP: ffff8db25fc2f000 R08: 0000000000000000 R09: 0000000000040479
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.138443] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8db24011b800
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.145968] R13: ffff8db240822500 R14: 0000000000000000 R15: 0000000000000000
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.153412] FS:  0000000000000000(0000) GS:ffff8db319200000(0000) knlGS:0000000000000000
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.161707] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  986.843267] EXT4-fs (dm-0): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.167563] CR2: 0000000000000040 CR3: 000000002c212006 CR4: 00000000003706f0
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.175089] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.182509] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.190000] BUG: sleeping function called from invalid context at include/linux/percpu-rwsem.h:49
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.199449] in_atomic(): 0, irqs_disabled(): 1, non_block: 0, pid: 5738, name: kworker/u4:2
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.208016] INFO: lockdep is turned off.
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.212055] irq event stamp: 559784
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.215784] hardirqs last  enabled at (559783): [<ffffffff9ff78ec4>] _raw_spin_unlock_irq+0x24/0x30
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.225129] hardirqs last disabled at (559784): [<ffffffff9ff69405>] exc_page_fault+0x35/0x240
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.233856] softirqs last  enabled at (559776): [<ffffffff9f50b8cd>] wb_workfn+0x18d/0x350
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.242258] softirqs last disabled at (559772): [<ffffffff9f45cd8e>] wb_wakeup_delayed+0x2e/0x70
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.251281] CPU: 0 PID: 5738 Comm: kworker/u4:2 Tainted: G      D           5.12.0-xfstests-00007-g1d72c24b1506 #188
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.261926] Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.271275] Workqueue: ext4discard ext4_discard_work
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.276382] Call Trace:
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.279012]  dump_stack+0x6d/0x89
> [  987.049261] EXT4-fs (dm-1): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.283240]  ___might_sleep.cold+0xa6/0xb6
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.287489]  exit_signals+0x1c/0x2c0
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.291417]  do_exit+0xa6/0x3c0
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.294672]  rewind_stack_do_exit+0x17/0x20
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.299092] RIP: 0000:0x0
> [  987.102134] EXT4-fs (dm-1): shut down requested (1)
> [  987.107927] Aborting journal on device dm-1-8.
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.301940] Code: Unable to access opcode bytes at RIP 0xffffffffffffffd6.
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.309322] RSP: 0000:0000000000000000 EFLAGS: 00000000 ORIG_RAX: 0000000000000000
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.317124] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.324586] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.331945] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.339473] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.346718] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa systemd[1]: fstests-generic-041.scope: Succeeded.
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa systemd[921]: xt\x2dvdb.mount: Succeeded.
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa systemd[1]: xt\x2dvdb.mount: Succeeded.
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa systemd[1]: Started /usr/bin/bash -c test -w /proc/self/oom_score_adj && echo 250 > /proc/self/oom_score_adj; exec ./tests/generic/043.
> Jun 23 09:21:02 xfstests-ltm-20210623090247-aa kernel: [  986.843267] EXT4-fs (dm-0): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
> Jun 23 09:21:03 xfstests-ltm-20210623090247-aa kernel: [  987.049261] EXT4-fs (dm-1): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
> Jun 23 09:21:03 xfstests-ltm-20210623090247-aa kernel: [  987.102134] EXT4-fs (dm-1): shut down requested (1)
> [  987.284882] EXT4-fs (dm-1): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
> Jun 23 09:21:03 xfstests-ltm-20210623090247-aa kernel: [  987.107927] Aborting journal on device dm-1-8.
> Jun 23 09:21:03 xfstests-ltm-20210623090247-aa systemd[1]: xt\x2dvdc.mount: Succeeded.
> Jun 23 09:21:03 xfstests-ltm-20210623090247-aa systemd[921]: xt\x2dvdc.mount: Succeeded.
> Jun 23 09:21:03 xfstests-ltm-20210623090247-aa kernel: [  987.284882] EXT4-fs (dm-1): mounted filesystem with ordered data mode. Opts: acl,user_xattr,block_validity. Quota mode: none.
> [  999.135257] EXT4-fs (dm-1): shut down requested (2)
> [  999.140764] Aborting journal on device dm-1-8.
> Jun 23 09:21:15 xfstests-ltm-20210623090247-aa kernel: [  999.135257] EXT4-fs (dm-1): shut down requested (2)
> Jun 23 09:21:15 xfstests-ltm-20210623090247-aa kernel: [  999.140764] Aborting journal on device dm-1-8.
> Jun 23 09:21:15 xfstests-ltm-20210623090247-aa systemd[1]: xt\x2dvdc.mount: Succeeded.
> Jun 23 09:21:15 xfstests-ltm-20210623090247-aa systemd[921]: xt\x2dvdc.mount: Succeeded.
> [  999.206122] EXT4-fs (dm-1): recovery complete
> 

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH V2 7/7] ext4: get discard out of jbd2 commit kthread contex
  2021-05-26  8:44           ` [PATCH V2 7/7] ext4: get discard out of jbd2 commit kthread contex Wang Jianchao
  2021-06-22  0:55             ` Josh Triplett
@ 2023-09-06  0:11             ` Sarthak Kukreti
  1 sibling, 0 replies; 15+ messages in thread
From: Sarthak Kukreti @ 2023-09-06  0:11 UTC (permalink / raw)
  To: Wang Jianchao
  Cc: Theodore Ts'o, Andreas Dilger, linux-ext4, linux-kernel, lishujin

Hi,

I noticed a "regression" with this patch series on ChromiumOS: on a
ext4 filesystem (mounted with -o discard) on a thinly provisioned
block device (dm-thin/loopback on a sparse file), deletion of a large
file (O(10GB) or more) immediately followed by an unmount results in
the discards to the file's extents getting dropped with umount(). This
prevents space from getting reclaimed by the underlying
thin-provisioning layer (dm-thinpool/lower filesystem), even on
subsequent mounts of the filesystem (eg. after a reboot).

Currently, the only way to 'reclaim' this space into a thinpool is to
run fstrim on dm-thin filesystems on every mount: should the dropped
discard requests for newly freed blocks be requeued on subsequent
mounts of the filesystem?
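
For reference, the path I am describing is in ext4_discard_work() in the
patch quoted below: once the superblock is no longer SB_ACTIVE, a pending
entry is freed without its discard ever being issued (trimmed excerpt
from the patch):

	/*
	 * If filesystem is umounting, give up the discard
	 */
	if (sb->s_flags & SB_ACTIVE)
		ext4_try_to_trim_range(sb, &e4b, fd->efd_start_cluster,
				fd->efd_start_cluster + fd->efd_count - 1, 1);
	kmem_cache_free(ext4_free_data_cachep, fd);

Since bb_discard_root only lives in memory, nothing about these dropped
ranges survives the unmount, so there is nothing for a later mount to
pick up without re-scanning the groups (i.e. fstrim).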

Best
Sarthak

On Wed, May 26, 2021 at 1:44 AM Wang Jianchao <jianchao.wan9@gmail.com> wrote:
>
> Right now, discard is issued and waited on in the jbd2 commit kthread
> context after the logs are committed. When a large number of files
> are deleted and discard is flooding, the jbd2 commit kthread can be
> blocked for a long time. Then all of the metadata operations can be
> blocked waiting for log space.
>
> One case is the page fault path, which holds mm->mmap_sem for read
> and wants to update the file time but has to wait for log space.
> When another thread in the task wants to mmap, the write lock on
> mmap_sem is blocked. Finally all of the subsequent readers of
> mmap_sem are blocked as well, even the ps command, which needs to
> read /proc/pid/cmdline. Our monitoring service, which needs to read
> /proc/pid/cmdline, used to be blocked for 5 minutes.
>
> This patch frees the blocks back to the buddy after commit and then
> does the discard in an async kworker context in fstrim fashion,
> namely:
>  - mark the blocks to be discarded as used if they have not been
>    allocated
>  - do the discard
>  - mark them free again
> After this, the jbd2 commit kthread won't be blocked by discard any
> more and we won't get NOSPC even if the discard is slow or throttled.
>
> Link: https://marc.info/?l=linux-kernel&m=162143690731901&w=2
> Suggested-by: Theodore Ts'o <tytso@mit.edu>
> Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com>
> ---
>  fs/ext4/ext4.h    |   2 +
>  fs/ext4/mballoc.c | 162 +++++++++++++++++++++++++++++++++---------------------
>  fs/ext4/mballoc.h |   3 -
>  3 files changed, 101 insertions(+), 66 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 5c5c8e4..2e48c88 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1526,6 +1526,7 @@ struct ext4_sb_info {
>         unsigned int *s_mb_maxs;
>         unsigned int s_group_info_size;
>         atomic_t s_mb_free_pending;
> +       struct work_struct s_discard_work;
>
>         /* tunables */
>         unsigned long s_stripe;
> @@ -3306,6 +3307,7 @@ struct ext4_group_info {
>         unsigned long   bb_check_counter;
>  #endif
>         struct rb_root  bb_free_root;
> +       struct rb_root  bb_discard_root;
>         ext4_grpblk_t   bb_first_free;  /* first free block */
>         ext4_grpblk_t   bb_free;        /* total free blocks */
>         ext4_grpblk_t   bb_fragments;   /* nr of freespace fragments */
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 15715e7..feea439 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -332,6 +332,7 @@
>  static struct kmem_cache *ext4_pspace_cachep;
>  static struct kmem_cache *ext4_ac_cachep;
>  static struct kmem_cache *ext4_free_data_cachep;
> +static struct workqueue_struct *ext4_discard_wq;
>
>  /* We create slab caches for groupinfo data structures based on the
>   * superblock block size.  There will be one per mounted filesystem for
> @@ -355,7 +356,10 @@ static inline struct ext4_free_data *efd_entry(struct rb_node *n)
>         return rb_entry_safe(n, struct ext4_free_data, efd_node);
>  }
>  static int ext4_insert_free_data(struct rb_root *root, struct ext4_free_data *nfd);
> -
> +static int ext4_try_to_trim_range(struct super_block *sb,
> +               struct ext4_buddy *e4b, ext4_grpblk_t start,
> +               ext4_grpblk_t max, ext4_grpblk_t minblocks);
> +static void ext4_discard_work(struct work_struct *work);
>  /*
>   * The algorithm using this percpu seq counter goes below:
>   * 1. We sample the percpu discard_pa_seq counter before trying for block
> @@ -2657,6 +2661,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
>         INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
>         init_rwsem(&meta_group_info[i]->alloc_sem);
>         meta_group_info[i]->bb_free_root = RB_ROOT;
> +       meta_group_info[i]->bb_discard_root = RB_ROOT;
>         meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
>
>         mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group);
> @@ -2857,6 +2862,7 @@ int ext4_mb_init(struct super_block *sb)
>         spin_lock_init(&sbi->s_md_lock);
>         spin_lock_init(&sbi->s_bal_lock);
>         atomic_set(&sbi->s_mb_free_pending, 0);
> +       INIT_WORK(&sbi->s_discard_work, ext4_discard_work);
>
>         sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
>         sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
> @@ -2949,6 +2955,15 @@ int ext4_mb_release(struct super_block *sb)
>         struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
>         int count;
>
> +       if (test_opt(sb, DISCARD)) {
> +               /*
> +                * wait the discard work to drain all of ext4_free_data
> +                */
> +               queue_work(ext4_discard_wq, &sbi->s_discard_work);
> +               flush_work(&sbi->s_discard_work);
> +       }
> +
> +       queue_work(ext4_discard_wq, &sbi->s_discard_work);
>         if (sbi->s_group_info) {
>                 for (i = 0; i < ngroups; i++) {
>                         cond_resched();
> @@ -3024,30 +3039,77 @@ static inline int ext4_issue_discard(struct super_block *sb,
>                 return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
>  }
>
> -static void ext4_free_data_in_buddy(struct super_block *sb,
> -                                   struct ext4_free_data *entry)
> +static void ext4_discard_work(struct work_struct *work)
>  {
> +       struct ext4_sb_info *sbi = container_of(work,
> +                       struct ext4_sb_info, s_discard_work);
> +       struct super_block *sb = sbi->s_sb;
> +       ext4_group_t ngroups = ext4_get_groups_count(sb);
> +       struct ext4_group_info *grp;
> +       struct ext4_free_data *fd, *nfd;
>         struct ext4_buddy e4b;
> -       struct ext4_group_info *db;
> -       int err, count = 0, count2 = 0;
> +       int i, err;
> +
> +       for (i = 0; i < ngroups; i++) {
> +               grp = ext4_get_group_info(sb, i);
> +               if (RB_EMPTY_ROOT(&grp->bb_discard_root))
> +                       continue;
>
> -       mb_debug(sb, "gonna free %u blocks in group %u (0x%p):",
> -                entry->efd_count, entry->efd_group, entry);
> +               err = ext4_mb_load_buddy(sb, i, &e4b);
> +               if (err) {
> +                       ext4_warning(sb, "Error %d loading buddy information for %u",
> +                                       err, i);
> +                       continue;
> +               }
>
> -       err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
> -       /* we expect to find existing buddy because it's pinned */
> -       BUG_ON(err != 0);
> +               ext4_lock_group(sb, i);
> +               rbtree_postorder_for_each_entry_safe(fd, nfd,
> +                               &grp->bb_discard_root, efd_node) {
> +                       rb_erase(&fd->efd_node, &grp->bb_discard_root);
> +                       /*
> +                        * If filesystem is umounting, give up the discard
> +                        */
> +                       if (sb->s_flags & SB_ACTIVE)
> +                               ext4_try_to_trim_range(sb, &e4b, fd->efd_start_cluster,
> +                                               fd->efd_start_cluster + fd->efd_count - 1, 1);
> +                       kmem_cache_free(ext4_free_data_cachep, fd);
> +               }
> +               ext4_unlock_group(sb, i);
> +               ext4_mb_unload_buddy(&e4b);
> +       }
> +}
>
> -       atomic_sub(entry->efd_count, &EXT4_SB(sb)->s_mb_free_pending);
> +static int ext4_free_data_in_buddy(struct super_block *sb,
> +               ext4_group_t group, tid_t commit_tid)
> +{
> +       struct ext4_buddy e4b;
> +       struct ext4_group_info *db;
> +       struct ext4_free_data *fd, *nfd;
> +       int cnt, nr_fd;
>
> +       /* we expect to find existing buddy because it's pinned */
> +       BUG_ON(ext4_mb_load_buddy(sb, group, &e4b));
>         db = e4b.bd_info;
> -       /* there are blocks to put in buddy to make them really free */
> -       count += entry->efd_count;
> -       count2++;
> -       ext4_lock_group(sb, entry->efd_group);
> -       /* Take it out of per group rb tree */
> -       rb_erase(&entry->efd_node, &(db->bb_free_root));
> -       mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
> +       cnt = 0;
> +       nr_fd = 0;
> +       ext4_lock_group(sb, group);
> +       rbtree_postorder_for_each_entry_safe(fd, nfd,
> +                       &db->bb_free_root, efd_node) {
> +               if (fd->efd_tid != commit_tid)
> +                       continue;
> +
> +               mb_debug(sb, "gonna free %u blocks in group %u (0x%p):",
> +                        fd->efd_count, fd->efd_group, fd);
> +               atomic_sub(fd->efd_count, &EXT4_SB(sb)->s_mb_free_pending);
> +               cnt += fd->efd_count;
> +               nr_fd++;
> +               rb_erase(&fd->efd_node, &db->bb_free_root);
> +               mb_free_blocks(NULL, &e4b, fd->efd_start_cluster, fd->efd_count);
> +               if (test_opt(sb, DISCARD))
> +                       ext4_insert_free_data(&db->bb_discard_root, fd);
> +               else
> +                       kmem_cache_free(ext4_free_data_cachep, fd);
> +       }
>
>         /*
>          * Clear the trimmed flag for the group so that the next
> @@ -3055,22 +3117,22 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
>          * If the volume is mounted with -o discard, online discard
>          * is supported and the free blocks will be trimmed online.
>          */
> -       if (!test_opt(sb, DISCARD))
> +       if (!test_opt(sb, DISCARD) && cnt)
>                 EXT4_MB_GRP_CLEAR_TRIMMED(db);
>
> -       if (!db->bb_free_root.rb_node) {
> +       if (RB_EMPTY_ROOT(&db->bb_free_root) && cnt) {
>                 /* No more items in the per group rb tree
>                  * balance refcounts from ext4_mb_free_metadata()
>                  */
>                 put_page(e4b.bd_buddy_page);
>                 put_page(e4b.bd_bitmap_page);
>         }
> -       ext4_unlock_group(sb, entry->efd_group);
> -       kmem_cache_free(ext4_free_data_cachep, entry);
> +       ext4_unlock_group(sb, group);
>         ext4_mb_unload_buddy(&e4b);
>
> -       mb_debug(sb, "freed %d blocks in %d structures\n", count,
> -                count2);
> +       mb_debug(sb, "freed %d blocks in %d structures\n", cnt, nr_fd);
> +
> +       return nr_fd;
>  }
>
>  /*
> @@ -3081,52 +3143,21 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
>  {
>         struct ext4_sb_info *sbi = EXT4_SB(sb);
>         ext4_group_t ngroups = ext4_get_groups_count(sb);
> -       struct ext4_free_data *fd, *nfd;
>         struct ext4_group_info *grp;
> -       struct bio *discard_bio = NULL;
> -       struct list_head freed_data_list;
> -       int err, i;
> +       int i, nr;
>
>         if (!atomic_read(&sbi->s_mb_free_pending))
>                 return;
>
> -       INIT_LIST_HEAD(&freed_data_list);
> -       for (i = 0; i < ngroups; i++) {
> +       for (i = 0, nr = 0; i < ngroups; i++) {
>                 grp = ext4_get_group_info(sb, i);
> -               ext4_lock_group(sb, i);
> -               rbtree_postorder_for_each_entry_safe(fd, nfd,
> -                               &grp->bb_free_root, efd_node) {
> -                       if (fd->efd_tid != commit_tid)
> -                               continue;
> -                       INIT_LIST_HEAD(&fd->efd_list);
> -                       list_add_tail(&fd->efd_list, &freed_data_list);
> -               }
> -               ext4_unlock_group(sb, i);
> -       }
> -
> -       if (test_opt(sb, DISCARD)) {
> -               list_for_each_entry(fd, &freed_data_list, efd_list) {
> -                       err = ext4_issue_discard(sb, fd->efd_group,
> -                                                fd->efd_start_cluster,
> -                                                fd->efd_count,
> -                                                &discard_bio);
> -                       if (err && err != -EOPNOTSUPP) {
> -                               ext4_msg(sb, KERN_WARNING, "discard request in"
> -                                        " group:%d block:%d count:%d failed"
> -                                        " with %d", fd->efd_group,
> -                                        fd->efd_start_cluster, fd->efd_count, err);
> -                       } else if (err == -EOPNOTSUPP)
> -                               break;
> -               }
> -
> -               if (discard_bio) {
> -                       submit_bio_wait(discard_bio);
> -                       bio_put(discard_bio);
> -               }
> +               if (RB_EMPTY_ROOT(&grp->bb_free_root))
> +                       continue;
> +               nr += ext4_free_data_in_buddy(sb, i, commit_tid);
>         }
>
> -       list_for_each_entry_safe(fd, nfd, &freed_data_list, efd_list)
> -               ext4_free_data_in_buddy(sb, fd);
> +       if (nr && test_opt(sb, DISCARD))
> +               queue_work(ext4_discard_wq, &sbi->s_discard_work);
>  }
>
>  int __init ext4_init_mballoc(void)
> @@ -3146,8 +3177,13 @@ int __init ext4_init_mballoc(void)
>         if (ext4_free_data_cachep == NULL)
>                 goto out_ac_free;
>
> -       return 0;
> +       ext4_discard_wq = alloc_workqueue("ext4discard", WQ_UNBOUND, 0);
> +       if (!ext4_discard_wq)
> +               goto out_free_data;
>
> +       return 0;
> +out_free_data:
> +       kmem_cache_destroy(ext4_free_data_cachep);
>  out_ac_free:
>         kmem_cache_destroy(ext4_ac_cachep);
>  out_pa_free:
> diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
> index e75b474..d76286b 100644
> --- a/fs/ext4/mballoc.h
> +++ b/fs/ext4/mballoc.h
> @@ -79,9 +79,6 @@
>  #define MB_DEFAULT_MAX_INODE_PREALLOC  512
>
>  struct ext4_free_data {
> -       /* this links the free block information from sb_info */
> -       struct list_head                efd_list;
> -
>         /* this links the free block information from group_info */
>         struct rb_node                  efd_node;
>
> --
> 1.8.3.1

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2023-09-06  0:11 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <164ffa3b-c4d5-6967-feba-b972995a6dfb@gmail.com>
2021-05-26  8:42 ` [PATCH V2 1/7] ext4: remove the 'group' parameter of ext4_trim_extent Wang Jianchao
2021-05-27 19:47   ` Andreas Dilger
2021-06-23 13:13   ` Theodore Ts'o
2021-06-23 19:49     ` Theodore Ts'o
2021-07-02 10:27       ` Josh Triplett
2021-07-05 11:28       ` Wang Jianchao
     [not found] ` <a602a6ba-2073-8384-4c8f-d669ee25c065@gmail.com>
2021-05-26  8:43   ` [PATCH V2 2/7] ext4: add new helper interface ext4_try_to_trim_range() Wang Jianchao
     [not found]   ` <49382052-6238-f1fb-40d1-b6b801b39ff7@gmail.com>
2021-05-26  8:43     ` [PATCH V2 3/7] ext4: remove the repeated comment of ext4_trim_all_free Wang Jianchao
     [not found]     ` <48e33dea-d15e-f211-0191-e01bd3eb17b3@gmail.com>
2021-05-26  8:43       ` [PATCH V2 4/7] ext4: add new helper interface ext4_insert_free_data Wang Jianchao
     [not found]       ` <67eeb65a-d413-c4f9-c06f-d5dcceca0e4f@gmail.com>
2021-05-26  8:43         ` [PATCH V2 5/7] ext4: get buddy cache after insert successfully Wang Jianchao
2021-06-23  3:06           ` Theodore Ts'o
     [not found]         ` <0b7915bc-193a-137b-4e52-8aaef8d6fef3@gmail.com>
2021-05-26  8:43           ` [PATCH V2 6/7] ext4: use bb_free_root to get the free data entry Wang Jianchao
2021-05-26  8:44           ` [PATCH V2 7/7] ext4: get discard out of jbd2 commit kthread contex Wang Jianchao
2021-06-22  0:55             ` Josh Triplett
2023-09-06  0:11             ` Sarthak Kukreti
