linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] ext4: get discard out of jbd2 commit kthread
@ 2021-05-17  3:57 Wang Jianchao
  2021-05-17 20:52 ` Theodore Y. Ts'o
  0 siblings, 1 reply; 7+ messages in thread
From: Wang Jianchao @ 2021-05-17  3:57 UTC (permalink / raw)
  To: Theodore Ts'o, Andreas Dilger; +Cc: linux-ext4, linux-kernel

Right now, discard is issued and waited to be completed in jbd2
commit kthread context after the logs are committed. When large
amount of files are deleted and discard is flooding, jbd2 commit
kthread can be blocked for long time. Then all of the metadata
operations can be blocked to wait the log space.

One case is the page fault path with read mm->mmap_sem held, which
wants to update the file time but has to wait for the log space.
When other threads in the task wants to do mmap, then write mmap_sem
is blocked. Finally all of the following read mmap_sem requirements
are blocked, even the ps command which need to read the /proc/pid/
-cmdline. Our monitor service which needs to read /proc/pid/cmdline
used to be blocked for 5 mins.

This patch moves the discard out of jbd2 kthread context and do it
in kworker. And drain the kwork when cannot get space in mb buddy.
This is done out of jbd2 handle and won't block the commit process.
After that, we could use blk-wbt or other method to throttle the
discard and needn't to worry it block the jbd2 commit kthread any
more.

Signed-off-by: Wang Jianchao <wangjianchao@kuaishou.com>
---
 fs/ext4/balloc.c  |   1 +
 fs/ext4/ext4.h    |   4 ++
 fs/ext4/mballoc.c | 130 ++++++++++++++++++++++++++++++++++++++++++------------
 fs/ext4/mballoc.h |   5 +++
 fs/ext4/super.c   |   3 ++
 5 files changed, 116 insertions(+), 27 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 9dc6e74b..e8676a8 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -661,6 +661,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 	 */
 	jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
 	(void) jbd2_journal_force_commit_nested(sbi->s_journal);
+	flush_work(&sbi->s_afd_work);
 	return 1;
 }
 
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 37002663..5c4d4bf 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1531,6 +1531,10 @@ struct ext4_sb_info {
 	struct list_head *s_mb_largest_free_orders;
 	rwlock_t *s_mb_largest_free_orders_locks;
 
+	struct list_head s_afd_list;
+	spinlock_t s_afd_lock;
+	struct work_struct s_afd_work;
+
 	/* tunables */
 	unsigned long s_stripe;
 	unsigned int s_mb_max_linear_groups;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 3239e66..b463daa 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -386,6 +386,7 @@
 static struct kmem_cache *ext4_pspace_cachep;
 static struct kmem_cache *ext4_ac_cachep;
 static struct kmem_cache *ext4_free_data_cachep;
+static struct workqueue_struct *ext4_discard_wq;
 
 /* We create slab caches for groupinfo data structures based on the
  * superblock block size.  There will be one per mounted filesystem for
@@ -408,6 +409,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
 static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
 			       ext4_group_t group, int cr);
 
+static void ext4_async_discard_work(struct work_struct *work);
 /*
  * The algorithm using this percpu seq counter goes below:
  * 1. We sample the percpu discard_pa_seq counter before trying for block
@@ -3375,6 +3377,9 @@ int ext4_mb_init(struct super_block *sb)
 	spin_lock_init(&sbi->s_md_lock);
 	sbi->s_mb_free_pending = 0;
 	INIT_LIST_HEAD(&sbi->s_freed_data_list);
+	INIT_LIST_HEAD(&sbi->s_afd_list);
+	spin_lock_init(&sbi->s_afd_lock);
+	INIT_WORK(&sbi->s_afd_work, ext4_async_discard_work);
 
 	sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
 	sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
@@ -3602,6 +3607,94 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
 		 count2);
 }
 
+static void do_discard_and_free_data(struct super_block *sb,
+		struct list_head *free_list)
+{
+	struct ext4_free_data *entry, *nxt;
+	struct bio *discard_bio = NULL;
+	struct blk_plug plug;
+	int err;
+
+	if (!test_opt(sb, DISCARD))
+		goto free_data;
+
+	blk_start_plug(&plug);
+	list_for_each_entry(entry, free_list, efd_list) {
+		err = ext4_issue_discard(sb, entry->efd_group,
+				entry->efd_start_cluster,
+				entry->efd_count,
+				&discard_bio);
+		if (err && err != -EOPNOTSUPP)
+			ext4_msg(sb, KERN_WARNING,
+				"discard request in group:%d block:%d count:%d failed with %d",
+				entry->efd_group,
+				entry->efd_start_cluster,
+				entry->efd_count, err);
+		else if (err == -EOPNOTSUPP)
+			break;
+	}
+	blk_finish_plug(&plug);
+
+	if (discard_bio) {
+		submit_bio_wait(discard_bio);
+		bio_put(discard_bio);
+	}
+
+free_data:
+	list_for_each_entry_safe(entry, nxt, free_list, efd_list)
+		ext4_free_data_in_buddy(sb, entry);
+}
+
+static void ext4_async_discard_work(struct work_struct *work)
+{
+	struct ext4_sb_info *sbi = container_of(work,
+			struct ext4_sb_info, s_afd_work);
+	struct ext4_async_free_data *afd, *nxt;
+	LIST_HEAD(afd_list);
+
+	/*
+	 * Don't worry about the lifecycle of sb, kill_sb would
+	 * invoke sync_fs who waits to drain all of async discard
+	 * work.
+	 */
+	spin_lock(&sbi->s_afd_lock);
+	list_splice_init(&sbi->s_afd_list, &afd_list);
+	spin_unlock(&sbi->s_afd_lock);
+
+	list_for_each_entry_safe(afd, nxt, &afd_list, list_node) {
+		list_del(&afd->list_node);
+		do_discard_and_free_data(sbi->s_sb, &afd->free_list);
+		kfree(afd);
+	}
+}
+
+/*
+ * Try to do discard out of jbd2 kthread, otherwise it may
+ * block the process of transaction commit.
+ */
+static bool ext4_do_async_discard(struct super_block *sb,
+		struct list_head *free_list)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_async_free_data *afd;
+
+	afd = kmalloc(sizeof(*afd), GFP_NOFS);
+	if (!afd)
+		return false;
+
+	INIT_LIST_HEAD(&afd->list_node);
+	INIT_LIST_HEAD(&afd->free_list);
+
+	list_splice(free_list, &afd->free_list);
+	spin_lock(&sbi->s_afd_lock);
+	if (list_empty(&sbi->s_afd_list))
+		queue_work(ext4_discard_wq, &sbi->s_afd_work);
+	list_add_tail(&afd->list_node, &sbi->s_afd_list);
+	spin_unlock(&sbi->s_afd_lock);
+
+	return true;
+}
+
 /*
  * This function is called by the jbd2 layer once the commit has finished,
  * so we know we can free the blocks that were released with that commit.
@@ -3609,11 +3702,9 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
 void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct ext4_free_data *entry, *tmp;
-	struct bio *discard_bio = NULL;
+	struct ext4_free_data *entry;
 	struct list_head freed_data_list;
 	struct list_head *cut_pos = NULL;
-	int err;
 
 	INIT_LIST_HEAD(&freed_data_list);
 
@@ -3628,30 +3719,9 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
 				  cut_pos);
 	spin_unlock(&sbi->s_md_lock);
 
-	if (test_opt(sb, DISCARD)) {
-		list_for_each_entry(entry, &freed_data_list, efd_list) {
-			err = ext4_issue_discard(sb, entry->efd_group,
-						 entry->efd_start_cluster,
-						 entry->efd_count,
-						 &discard_bio);
-			if (err && err != -EOPNOTSUPP) {
-				ext4_msg(sb, KERN_WARNING, "discard request in"
-					 " group:%d block:%d count:%d failed"
-					 " with %d", entry->efd_group,
-					 entry->efd_start_cluster,
-					 entry->efd_count, err);
-			} else if (err == -EOPNOTSUPP)
-				break;
-		}
-
-		if (discard_bio) {
-			submit_bio_wait(discard_bio);
-			bio_put(discard_bio);
-		}
-	}
-
-	list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
-		ext4_free_data_in_buddy(sb, entry);
+	if (!test_opt(sb, DISCARD) ||
+	    !ext4_do_async_discard(sb, &freed_data_list))
+		do_discard_and_free_data(sb, &freed_data_list);
 }
 
 int __init ext4_init_mballoc(void)
@@ -3671,8 +3741,14 @@ int __init ext4_init_mballoc(void)
 	if (ext4_free_data_cachep == NULL)
 		goto out_ac_free;
 
+	ext4_discard_wq = alloc_workqueue("ext4discard", WQ_UNBOUND, 0);
+	if (!ext4_discard_wq)
+		goto out_free_data;
+
 	return 0;
 
+out_free_data:
+	kmem_cache_destroy(ext4_free_data_cachep);
 out_ac_free:
 	kmem_cache_destroy(ext4_ac_cachep);
 out_pa_free:
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 39da92c..b762727 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -212,6 +212,11 @@ struct ext4_buddy {
 	ext4_group_t bd_group;
 };
 
+struct ext4_async_free_data {
+	struct list_head list_node;
+	struct list_head free_list;
+};
+
 static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
 					struct ext4_free_extent *fex)
 {
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7dc94f3..545a02c 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5752,6 +5752,9 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
 			ret = err;
 	}
 
+	if (test_opt(sb, DISCARD))
+		flush_work(&sbi->s_afd_work);
+
 	return ret;
 }
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] ext4: get discard out of jbd2 commit kthread
  2021-05-17  3:57 [PATCH] ext4: get discard out of jbd2 commit kthread Wang Jianchao
@ 2021-05-17 20:52 ` Theodore Y. Ts'o
  2021-05-18  1:19   ` Wang Jianchao
  0 siblings, 1 reply; 7+ messages in thread
From: Theodore Y. Ts'o @ 2021-05-17 20:52 UTC (permalink / raw)
  To: Wang Jianchao; +Cc: Andreas Dilger, linux-ext4, linux-kernel

On Mon, May 17, 2021 at 11:57:09AM +0800, Wang Jianchao wrote:
> Right now, discard is issued and waited to be completed in jbd2
> commit kthread context after the logs are committed. When large
> amount of files are deleted and discard is flooding, jbd2 commit
> kthread can be blocked for long time. Then all of the metadata
> operations can be blocked to wait the log space.
> 
> One case is the page fault path with read mm->mmap_sem held, which
> wants to update the file time but has to wait for the log space.
> When other threads in the task wants to do mmap, then write mmap_sem
> is blocked. Finally all of the following read mmap_sem requirements
> are blocked, even the ps command which need to read the /proc/pid/
> -cmdline. Our monitor service which needs to read /proc/pid/cmdline
> used to be blocked for 5 mins.
> 
> This patch moves the discard out of jbd2 kthread context and do it
> in kworker. And drain the kwork when cannot get space in mb buddy.
> This is done out of jbd2 handle and won't block the commit process.
> After that, we could use blk-wbt or other method to throttle the
> discard and needn't to worry it block the jbd2 commit kthread any
> more.

Wouldn't be much simpler to do something like this?

		if (discard_bio) {
-			submit_bio_wait(discard_bio);
-			bio_put(discard_bio);
+			submit_bio(discard_bio);
		}


We're throwing away the return value from submit_bio_wait(), so
there's no real need to wait for I/O to complete so we can fetch the
I/O status.

That way we don't need to move all of this to a kworker context.

     	    	       	       	   - Ted

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] ext4: get discard out of jbd2 commit kthread
  2021-05-17 20:52 ` Theodore Y. Ts'o
@ 2021-05-18  1:19   ` Wang Jianchao
  2021-05-18 14:57     ` Theodore Y. Ts'o
  0 siblings, 1 reply; 7+ messages in thread
From: Wang Jianchao @ 2021-05-18  1:19 UTC (permalink / raw)
  To: Theodore Y. Ts'o; +Cc: Andreas Dilger, linux-ext4, linux-kernel



On 2021/5/18 4:52 AM, Theodore Y. Ts'o wrote:
> On Mon, May 17, 2021 at 11:57:09AM +0800, Wang Jianchao wrote:
>> Right now, discard is issued and waited to be completed in jbd2
>> commit kthread context after the logs are committed. When large
>> amount of files are deleted and discard is flooding, jbd2 commit
>> kthread can be blocked for long time. Then all of the metadata
>> operations can be blocked to wait the log space.
>>
>> One case is the page fault path with read mm->mmap_sem held, which
>> wants to update the file time but has to wait for the log space.
>> When other threads in the task wants to do mmap, then write mmap_sem
>> is blocked. Finally all of the following read mmap_sem requirements
>> are blocked, even the ps command which need to read the /proc/pid/
>> -cmdline. Our monitor service which needs to read /proc/pid/cmdline
>> used to be blocked for 5 mins.
>>
>> This patch moves the discard out of jbd2 kthread context and do it
>> in kworker. And drain the kwork when cannot get space in mb buddy.
>> This is done out of jbd2 handle and won't block the commit process.
>> After that, we could use blk-wbt or other method to throttle the
>> discard and needn't to worry it block the jbd2 commit kthread any
>> more.
> 
> Wouldn't be much simpler to do something like this?
> 
> 		if (discard_bio) {
> -			submit_bio_wait(discard_bio);
> -			bio_put(discard_bio);
> +			submit_bio(discard_bio);
> 		}
> 
> 
> We're throwing away the return value from submit_bio_wait(), so
> there's no real need to wait for I/O to complete so we can fetch the
> I/O status.
> 
> That way we don't need to move all of this to a kworker context.

The submit_bio also needs to be out of jbd2 commit kthread as it may be
blocked due to blk-wbt or no enough request tag. ;)

Best Regards
Jianchao


> 
>      	    	       	       	   - Ted
> 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] ext4: get discard out of jbd2 commit kthread
  2021-05-18  1:19   ` Wang Jianchao
@ 2021-05-18 14:57     ` Theodore Y. Ts'o
  2021-05-19  1:27       ` Wang Jianchao
  0 siblings, 1 reply; 7+ messages in thread
From: Theodore Y. Ts'o @ 2021-05-18 14:57 UTC (permalink / raw)
  To: Wang Jianchao; +Cc: Andreas Dilger, linux-ext4, linux-kernel

On Tue, May 18, 2021 at 09:19:13AM +0800, Wang Jianchao wrote:
> > That way we don't need to move all of this to a kworker context.
> 
> The submit_bio also needs to be out of jbd2 commit kthread as it may be
> blocked due to blk-wbt or no enough request tag. ;)

Actually, there's a bigger deal that I hadn't realized, about why we
is why are currently using submit_bio_wait().  We *must* wait until
discard has completed before we call ext4_free_data_in_buddy(), which
is what allows those blocks to be reused by the block allocator.

If the discard happens after we reallocate the block, there is a good
chance that we will end up corrupting a data or metadata block,
leading to user data loss.

There's another corollary to this; if you use blk-wbt, and you are
doing lots of deletes, and we move this all to a writeback thread,
this *significantly* increases the chance that the user will see
ENOSPC errors in the case where they are with a very full (close to
100% used) file system.

I'd argue that this is a *really* good reason why using mount -o
discard is Just A Bad Idea if you are running with blk-wbt.  If
discards are slow, using fstrim is a much better choice.  It's also
the case that for most SSD's and workloads, doing frequent discards
doesn't actually help that much.  The write endurance of the device is
not compromised that much if you only run fs-trim and discard unused
blocks once a day, or even once a week --- I only recommend use of
mount -o discard in cases where the discard operation is effectively
free.  (e.g., in cases where the FTL is implemented on the Host OS, or
you are running with super-fast flash which is PCIe or NVMe attached.)

Cheers,

					- Ted

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] ext4: get discard out of jbd2 commit kthread
  2021-05-18 14:57     ` Theodore Y. Ts'o
@ 2021-05-19  1:27       ` Wang Jianchao
  2021-05-19 15:08         ` Theodore Y. Ts'o
  0 siblings, 1 reply; 7+ messages in thread
From: Wang Jianchao @ 2021-05-19  1:27 UTC (permalink / raw)
  To: Theodore Y. Ts'o; +Cc: Andreas Dilger, linux-ext4, linux-kernel



On 2021/5/18 10:57 PM, Theodore Y. Ts'o wrote:
> On Tue, May 18, 2021 at 09:19:13AM +0800, Wang Jianchao wrote:
>>> That way we don't need to move all of this to a kworker context.
>>
>> The submit_bio also needs to be out of jbd2 commit kthread as it may be
>> blocked due to blk-wbt or no enough request tag. ;)
> 
> Actually, there's a bigger deal that I hadn't realized, about why we
> is why are currently using submit_bio_wait().  We *must* wait until
> discard has completed before we call ext4_free_data_in_buddy(), which
> is what allows those blocks to be reused by the block allocator.
> 
> If the discard happens after we reallocate the block, there is a good
> chance that we will end up corrupting a data or metadata block,
> leading to user data loss.

Yes

> 
> There's another corollary to this; if you use blk-wbt, and you are
> doing lots of deletes, and we move this all to a writeback thread,
> this *significantly* increases the chance that the user will see
> ENOSPC errors in the case where they are with a very full (close to
> 100% used) file system.

We would flush the kwork that's doing discard in this patch.
That's done in ext4_should_retry_alloc()

> 
> I'd argue that this is a *really* good reason why using mount -o
> discard is Just A Bad Idea if you are running with blk-wbt.  If
> discards are slow, using fstrim is a much better choice.  It's also
> the case that for most SSD's and workloads, doing frequent discards
> doesn't actually help that much.  The write endurance of the device is
> not compromised that much if you only run fs-trim and discard unused
> blocks once a day, or even once a week --- I only recommend use of
> mount -o discard in cases where the discard operation is effectively
> free.  (e.g., in cases where the FTL is implemented on the Host OS, or
> you are running with super-fast flash which is PCIe or NVMe attached.)

We're running ext4 with discard on a nbd device whose backend is storage
cluster. The discard can help to free the unused space to storage pool.

And sometimes application delete a lot of data and discard is flooding. 
Then we see the jbd2 commit kthread is blocked for a long time. Even
move the discard out of jbd2, we still see the write IO of jbd2 log
could be blocked. blk-wbt could help to relieve this. Finally the delay
is shift to allocation path. But this is better than blocking the page
fault path which holds the read mm->mmap_sem.

Best regards
Jianchao

> 
> Cheers,
> 
> 					- Ted
> 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] ext4: get discard out of jbd2 commit kthread
  2021-05-19  1:27       ` Wang Jianchao
@ 2021-05-19 15:08         ` Theodore Y. Ts'o
  2021-05-20  1:20           ` Wang Jianchao
  0 siblings, 1 reply; 7+ messages in thread
From: Theodore Y. Ts'o @ 2021-05-19 15:08 UTC (permalink / raw)
  To: Wang Jianchao; +Cc: Andreas Dilger, linux-ext4, linux-kernel

On Wed, May 19, 2021 at 09:27:56AM +0800, Wang Jianchao wrote:
> 
> We're running ext4 with discard on a nbd device whose backend is storage
> cluster. The discard can help to free the unused space to storage pool.
> 
> And sometimes application delete a lot of data and discard is flooding. 
> Then we see the jbd2 commit kthread is blocked for a long time. Even
> move the discard out of jbd2, we still see the write IO of jbd2 log
> could be blocked. blk-wbt could help to relieve this. Finally the delay
> is shift to allocation path. But this is better than blocking the page
> fault path which holds the read mm->mmap_sem.

I'm assuming that the problem is when the application deletes a lot of
data, the discard flood is causing performance problems on your nbd
server.  Is that the high level problem that you are trying to solve?

So if that's the case, I'd suggest a different approach.  First, move
kmem_cache_free(ext4_free_data_cachep, entry) out of
ext4_free_data_in_buddy() to its caller, ext4_process_data.  Then if
discard is enabled, after calling ext4_free_data_in_buddy(), the
ext4_free_data struct will be detached from rbtree rooted in
ext4_group_info.bb_free_root, and then we can attach it to a new
rbtree rooted in ext4_group_info.bb_discard_root.

This allows the block to be reused as soon the commit is finished
(allowing for potentially more efficient block allocations), but we
can now keep track of which blocks would be useful for discarding and
decouple that from when we release the blocks to be reused.  We can
now use the pre-existing fstrim kernel thread infrastructure to lock a
block group, and we can now iterate over the rbtree, and take into
account which blocks have since become allocated --- since if a block
has been allocated, there's no need to send a discard for it.

I think this will be more efficient, and will allow us to share more
of the code for fstrim and the discard-at-runtime model used by "mount
-o discard".  We can also fine-tune how quickly we issue discards; it
might be that if user has executed "rm -rf" it might actually better
to wait until the deletes have completed, even if it takes several
commit intervals, since it might allow us to combine discards if the
blocks 100-199 and 400-500 are released in one commit, and blocks
200-399 are released two or three commits later.

Something else I'd urge you to consider is whether it's possible to
enhance the nbd protocol to add some kind of back-channel notification
when the shared storage is getting low on space.  In that case, when
the nbd client code a request from the nbd server indicating, "please
issue discards if possible", it could either trigger an upcall to
userspace, which could then issue the fstrim ioctl, which in the case
where "mount -o discard" is enabled, would accelerate when discards
took place.

We could then make the fstrim thread normally work on a much slower
pace, but when there is a signal from the shared storage that space is
needed, clients could accelerate when they issue discards to free up
shared space.

Cheers,

						- Ted

P.S.  One other potential thought; if we have established a new
bb_discard_root rbtree, it *might* actually be beneficial to consider
using that information in the block allocator.  One of the best way to
tell an SSD that block is no longer needed is to simply overwrite that
block.  If we do that, we don't need to send a discard to that block
any more.

Of course, we still want to keep blocks contiguous since even though
seeks are free for SSD's, we want to keep large reads contiguous as
much as possible, and we want to keep the extent tree as compact as
possible.  But if we have just released a 12k file, and we are writing
a new 12k file, and don't really care *where* in the block group we
are writing that file, reusing blocks that had just been freed might
actually be a good strategy.

That's not something you need to implement in this patch series, but
it might be an interesting optimization.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] ext4: get discard out of jbd2 commit kthread
  2021-05-19 15:08         ` Theodore Y. Ts'o
@ 2021-05-20  1:20           ` Wang Jianchao
  0 siblings, 0 replies; 7+ messages in thread
From: Wang Jianchao @ 2021-05-20  1:20 UTC (permalink / raw)
  To: Theodore Y. Ts'o; +Cc: Andreas Dilger, linux-ext4, linux-kernel



On 2021/5/19 11:08 PM, Theodore Y. Ts'o wrote:
> On Wed, May 19, 2021 at 09:27:56AM +0800, Wang Jianchao wrote:
>>
>> We're running ext4 with discard on a nbd device whose backend is storage
>> cluster. The discard can help to free the unused space to storage pool.
>>
>> And sometimes application delete a lot of data and discard is flooding. 
>> Then we see the jbd2 commit kthread is blocked for a long time. Even
>> move the discard out of jbd2, we still see the write IO of jbd2 log
>> could be blocked. blk-wbt could help to relieve this. Finally the delay
>> is shift to allocation path. But this is better than blocking the page
>> fault path which holds the read mm->mmap_sem.
> 
> I'm assuming that the problem is when the application deletes a lot of
> data, the discard flood is causing performance problems on your nbd
> server.  Is that the high level problem that you are trying to solve?
> 

Yes, not only the discard sometimes could be very slow, but also it
would degrade the performance of normal write IO

> So if that's the case, I'd suggest a different approach.  First, move
> kmem_cache_free(ext4_free_data_cachep, entry) out of
> ext4_free_data_in_buddy() to its caller, ext4_process_data.  Then if
> discard is enabled, after calling ext4_free_data_in_buddy(), the
> ext4_free_data struct will be detached from rbtree rooted in
> ext4_group_info.bb_free_root, and then we can attach it to a new
> rbtree rooted in ext4_group_info.bb_discard_root.
> 
> This allows the block to be reused as soon the commit is finished
> (allowing for potentially more efficient block allocations), but we
> can now keep track of which blocks would be useful for discarding and
> decouple that from when we release the blocks to be reused.  We can
> now use the pre-existing fstrim kernel thread infrastructure to lock a
> block group, and we can now iterate over the rbtree, and take into
> account which blocks have since become allocated --- since if a block
> has been allocated, there's no need to send a discard for it.
> 
> I think this will be more efficient, and will allow us to share more
> of the code for fstrim and the discard-at-runtime model used by "mount
> -o discard".  We can also fine-tune how quickly we issue discards; it
> might be that if user has executed "rm -rf" it might actually better
> to wait until the deletes have completed, even if it takes several
> commit intervals, since it might allow us to combine discards if the
> blocks 100-199 and 400-500 are released in one commit, and blocks
> 200-399 are released two or three commits later.


Yes, this is more efficient and fair. I will cook the next version path
based on the suggestion above.

> 
> Something else I'd urge you to consider is whether it's possible to
> enhance the nbd protocol to add some kind of back-channel notification
> when the shared storage is getting low on space.  In that case, when
> the nbd client code a request from the nbd server indicating, "please
> issue discards if possible", it could either trigger an upcall to
> userspace, which could then issue the fstrim ioctl, which in the case
> where "mount -o discard" is enabled, would accelerate when discards
> took place.
> 
> We could then make the fstrim thread normally work on a much slower
> pace, but when there is a signal from the shared storage that space is
> needed, clients could accelerate when they issue discards to free up
> shared space.

This sounds great !!!
I will share this with my colleagues to see how to implement it.

> 
> Cheers,
> 
> 						- Ted
> 
> P.S.  One other potential thought; if we have established a new
> bb_discard_root rbtree, it *might* actually be beneficial to consider
> using that information in the block allocator.  One of the best way to
> tell an SSD that block is no longer needed is to simply overwrite that
> block.  If we do that, we don't need to send a discard to that block
> any more.

This seems also true for the storage server. If the nbd client reuse the
blocks that's just freed, the server needn't to do new allocation.

> 
> Of course, we still want to keep blocks contiguous since even though
> seeks are free for SSD's, we want to keep large reads contiguous as
> much as possible, and we want to keep the extent tree as compact as
> possible.  But if we have just released a 12k file, and we are writing
> a new 12k file, and don't really care *where* in the block group we
> are writing that file, reusing blocks that had just been freed might
> actually be a good strategy.
> 
> That's not something you need to implement in this patch series, but
> it might be an interesting optimization.
> 

And thanks a million for your suggestions

Best Regards
Jianchao 

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2021-05-20  1:20 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-05-17  3:57 [PATCH] ext4: get discard out of jbd2 commit kthread Wang Jianchao
2021-05-17 20:52 ` Theodore Y. Ts'o
2021-05-18  1:19   ` Wang Jianchao
2021-05-18 14:57     ` Theodore Y. Ts'o
2021-05-19  1:27       ` Wang Jianchao
2021-05-19 15:08         ` Theodore Y. Ts'o
2021-05-20  1:20           ` Wang Jianchao

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).