* [PATCH 1/5] Btrfs: fix repeated delalloc work allocation
@ 2013-01-22 10:49 Miao Xie
  2013-01-22 14:24 ` Liu Bo
From: Miao Xie @ 2013-01-22 10:49 UTC
  To: Linux Btrfs; +Cc: Alex Lyakas

btrfs_start_delalloc_inodes() locks the delalloc_inodes list, fetches the
first inode, unlocks the list, triggers btrfs_alloc_delalloc_work/
btrfs_queue_worker for this inode, and then locks the list again and checks
the head of the list. But because the first inode is never deleted from the
list, the function fetches the same inode again and again. As a result, it
allocates a huge number of btrfs_delalloc_work structures until OOM happens.

Fix this problem by splicing the delalloc list onto a local list.
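
A simplified sketch of the old loop shape (condensed for illustration, not
the verbatim code removed below):

	spin_lock(&root->fs_info->delalloc_lock);
	while (!list_empty(head)) {
		/* head->next is the same inode on every iteration ... */
		binode = list_entry(head->next, struct btrfs_inode,
				    delalloc_inodes);
		inode = igrab(&binode->vfs_inode);
		spin_unlock(&root->fs_info->delalloc_lock);
		/* ... so a new work struct is allocated for it each time */
		work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
		btrfs_queue_worker(&root->fs_info->flush_workers, &work->work);
		spin_lock(&root->fs_info->delalloc_lock);
	}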

Reported-by: Alex Lyakas <alex.btrfs@zadarastorage.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/inode.c |   55 ++++++++++++++++++++++++++++++++++++++++-------------
 1 files changed, 41 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 67ed24a..86f1d25 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7545,41 +7545,61 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
  */
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 {
-	struct list_head *head = &root->fs_info->delalloc_inodes;
 	struct btrfs_inode *binode;
 	struct inode *inode;
 	struct btrfs_delalloc_work *work, *next;
 	struct list_head works;
+	struct list_head splice;
 	int ret = 0;
 
 	if (root->fs_info->sb->s_flags & MS_RDONLY)
 		return -EROFS;
 
 	INIT_LIST_HEAD(&works);
-
+	INIT_LIST_HEAD(&splice);
+again:
 	spin_lock(&root->fs_info->delalloc_lock);
-	while (!list_empty(head)) {
-		binode = list_entry(head->next, struct btrfs_inode,
+	list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+	while (!list_empty(&splice)) {
+		binode = list_entry(splice.next, struct btrfs_inode,
 				    delalloc_inodes);
+
+		list_del_init(&binode->delalloc_inodes);
+
 		inode = igrab(&binode->vfs_inode);
 		if (!inode)
-			list_del_init(&binode->delalloc_inodes);
+			continue;
+
+		list_add_tail(&binode->delalloc_inodes,
+			      &root->fs_info->delalloc_inodes);
 		spin_unlock(&root->fs_info->delalloc_lock);
-		if (inode) {
-			work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
-			if (!work) {
-				ret = -ENOMEM;
-				goto out;
-			}
-			list_add_tail(&work->list, &works);
-			btrfs_queue_worker(&root->fs_info->flush_workers,
-					   &work->work);
+
+		work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
+		if (unlikely(!work)) {
+			ret = -ENOMEM;
+			goto out;
 		}
+		list_add_tail(&work->list, &works);
+		btrfs_queue_worker(&root->fs_info->flush_workers,
+				   &work->work);
+
 		cond_resched();
 		spin_lock(&root->fs_info->delalloc_lock);
 	}
 	spin_unlock(&root->fs_info->delalloc_lock);
 
+	list_for_each_entry_safe(work, next, &works, list) {
+		list_del_init(&work->list);
+		btrfs_wait_and_free_delalloc_work(work);
+	}
+
+	spin_lock(&root->fs_info->delalloc_lock);
+	if (!list_empty(&root->fs_info->delalloc_inodes)) {
+		spin_unlock(&root->fs_info->delalloc_lock);
+		goto again;
+	}
+	spin_unlock(&root->fs_info->delalloc_lock);
+
 	/* the filemap_flush will queue IO into the worker threads, but
 	 * we have to make sure the IO is actually started and that
 	 * ordered extents get created before we return
@@ -7592,11 +7612,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 		    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
 	}
 	atomic_dec(&root->fs_info->async_submit_draining);
+	return 0;
 out:
 	list_for_each_entry_safe(work, next, &works, list) {
 		list_del_init(&work->list);
 		btrfs_wait_and_free_delalloc_work(work);
 	}
+
+	if (!list_empty_careful(&splice)) {
+		spin_lock(&root->fs_info->delalloc_lock);
+		list_splice_tail(&splice, &root->fs_info->delalloc_inodes);
+		spin_unlock(&root->fs_info->delalloc_lock);
+	}
 	return ret;
 }
 
-- 
1.6.5.2


* Re: [PATCH 1/5] Btrfs: fix repeated delalloc work allocation
  2013-01-22 10:49 [PATCH 1/5] Btrfs: fix repeated delalloc work allocation Miao Xie
@ 2013-01-22 14:24 ` Liu Bo
  2013-01-23  2:54   ` Miao Xie
From: Liu Bo @ 2013-01-22 14:24 UTC
  To: Miao Xie; +Cc: Linux Btrfs, Alex Lyakas

On Tue, Jan 22, 2013 at 06:49:00PM +0800, Miao Xie wrote:
> btrfs_start_delalloc_inodes() locks the delalloc_inodes list, fetches the
> first inode, unlocks the list, triggers btrfs_alloc_delalloc_work/
> btrfs_queue_worker for this inode, and then locks the list again and checks
> the head of the list. But because the first inode is never deleted from the
> list, the function fetches the same inode again and again. As a result, it
> allocates a huge number of btrfs_delalloc_work structures until OOM happens.
> 
> Fix this problem by splicing the delalloc list onto a local list.
> 
> Reported-by: Alex Lyakas <alex.btrfs@zadarastorage.com>
> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
> ---
>  fs/btrfs/inode.c |   55 ++++++++++++++++++++++++++++++++++++++++-------------
>  1 files changed, 41 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 67ed24a..86f1d25 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -7545,41 +7545,61 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
>   */
>  int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
>  {
> -	struct list_head *head = &root->fs_info->delalloc_inodes;
>  	struct btrfs_inode *binode;
>  	struct inode *inode;
>  	struct btrfs_delalloc_work *work, *next;
>  	struct list_head works;
> +	struct list_head splice;
>  	int ret = 0;
>  
>  	if (root->fs_info->sb->s_flags & MS_RDONLY)
>  		return -EROFS;
>  
>  	INIT_LIST_HEAD(&works);
> -
> +	INIT_LIST_HEAD(&splice);
> +again:
>  	spin_lock(&root->fs_info->delalloc_lock);
> -	while (!list_empty(head)) {
> -		binode = list_entry(head->next, struct btrfs_inode,
> +	list_splice_init(&root->fs_info->delalloc_inodes, &splice);
> +	while (!list_empty(&splice)) {
> +		binode = list_entry(splice.next, struct btrfs_inode,
>  				    delalloc_inodes);
> +
> +		list_del_init(&binode->delalloc_inodes);
> +

I believe this patch can work well, but it's a little complex.

How about adding a flag to the runtime_flags set instead?

We can use the flag instead of the 'delalloc_inodes' list to tell whether
we have cleared the delalloc bytes, and most importantly it won't touch
the original code logic too much.

thanks,
liubo

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 67ed24a..692ed0e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1555,8 +1555,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
 		BTRFS_I(inode)->delalloc_bytes -= len;
 
 		if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
-		    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
-			list_del_init(&BTRFS_I(inode)->delalloc_inodes);
+		    test_bit(BTRFS_INODE_FLUSH, &BTRFS_I(inode)->runtime_flags)) {
+			clear_bit(BTRFS_INODE_FLUSH, &BTRFS_I(inode)->runtime_flags);
 		}
 		spin_unlock(&root->fs_info->delalloc_lock);
 	}
@@ -7562,8 +7562,9 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 		binode = list_entry(head->next, struct btrfs_inode,
 				    delalloc_inodes);
 		inode = igrab(&binode->vfs_inode);
-		if (!inode)
-			list_del_init(&binode->delalloc_inodes);
+
+		list_del_init(&binode->delalloc_inodes);
+
 		spin_unlock(&root->fs_info->delalloc_lock);
 		if (inode) {
 			work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
@@ -7572,6 +7573,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 				goto out;
 			}
 			list_add_tail(&work->list, &works);
+			set_bit(BTRFS_INODE_FLUSH, &binode->runtime_flags);
 			btrfs_queue_worker(&root->fs_info->flush_workers,
 					   &work->work);
 		}
@@ -7580,6 +7582,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 	}
 	spin_unlock(&root->fs_info->delalloc_lock);
 
+	/* make sure we clear all delalloc bytes we have scheduled */
+	while (!list_empty(&works)) {
+		work = list_entry(works.next, struct btrfs_delalloc_work,
+				  list);
+		binode = btrfs_ino(work->inode);
+		if (!test_bit(BTRFS_INODE_FLUSH, &binode->runtime_flags)) {
+			list_del_init(&work->list);
+			btrfs_wait_and_free_delalloc_work(work);
+		}
+		cond_resched();
+	}
+
 	/* the filemap_flush will queue IO into the worker threads, but
 	 * we have to make sure the IO is actually started and that
 	 * ordered extents get created before we return



>  		inode = igrab(&binode->vfs_inode);
>  		if (!inode)
> -			list_del_init(&binode->delalloc_inodes);
> +			continue;
> +
> +		list_add_tail(&binode->delalloc_inodes,
> +			      &root->fs_info->delalloc_inodes);
>  		spin_unlock(&root->fs_info->delalloc_lock);
> -		if (inode) {
> -			work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
> -			if (!work) {
> -				ret = -ENOMEM;
> -				goto out;
> -			}
> -			list_add_tail(&work->list, &works);
> -			btrfs_queue_worker(&root->fs_info->flush_workers,
> -					   &work->work);
> +
> +		work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
> +		if (unlikely(!work)) {
> +			ret = -ENOMEM;
> +			goto out;
>  		}
> +		list_add_tail(&work->list, &works);
> +		btrfs_queue_worker(&root->fs_info->flush_workers,
> +				   &work->work);
> +
>  		cond_resched();
>  		spin_lock(&root->fs_info->delalloc_lock);
>  	}
>  	spin_unlock(&root->fs_info->delalloc_lock);
>  
> +	list_for_each_entry_safe(work, next, &works, list) {
> +		list_del_init(&work->list);
> +		btrfs_wait_and_free_delalloc_work(work);
> +	}
> +
> +	spin_lock(&root->fs_info->delalloc_lock);
> +	if (!list_empty(&root->fs_info->delalloc_inodes)) {
> +		spin_unlock(&root->fs_info->delalloc_lock);
> +		goto again;
> +	}
> +	spin_unlock(&root->fs_info->delalloc_lock);
> +
>  	/* the filemap_flush will queue IO into the worker threads, but
>  	 * we have to make sure the IO is actually started and that
>  	 * ordered extents get created before we return
> @@ -7592,11 +7612,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
>  		    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
>  	}
>  	atomic_dec(&root->fs_info->async_submit_draining);
> +	return 0;
>  out:
>  	list_for_each_entry_safe(work, next, &works, list) {
>  		list_del_init(&work->list);
>  		btrfs_wait_and_free_delalloc_work(work);
>  	}
> +
> +	if (!list_empty_careful(&splice)) {
> +		spin_lock(&root->fs_info->delalloc_lock);
> +		list_splice_tail(&splice, &root->fs_info->delalloc_inodes);
> +		spin_unlock(&root->fs_info->delalloc_lock);
> +	}
>  	return ret;
>  }
>  
> -- 
> 1.6.5.2


* Re: [PATCH 1/5] Btrfs: fix repeated delalloc work allocation
  2013-01-22 14:24 ` Liu Bo
@ 2013-01-23  2:54   ` Miao Xie
  2013-01-23  3:56     ` Liu Bo
From: Miao Xie @ 2013-01-23  2:54 UTC
  To: bo.li.liu; +Cc: Linux Btrfs, Alex Lyakas

On Tue, 22 Jan 2013 22:24:15 +0800, Liu Bo wrote:
> On Tue, Jan 22, 2013 at 06:49:00PM +0800, Miao Xie wrote:
>> btrfs_start_delalloc_inodes() locks the delalloc_inodes list, fetches the
>> first inode, unlocks the list, triggers btrfs_alloc_delalloc_work/
>> btrfs_queue_worker for this inode, and then locks the list again and checks
>> the head of the list. But because the first inode is never deleted from the
>> list, the function fetches the same inode again and again. As a result, it
>> allocates a huge number of btrfs_delalloc_work structures until OOM happens.
>>
>> Fix this problem by splicing the delalloc list onto a local list.
>>
>> Reported-by: Alex Lyakas <alex.btrfs@zadarastorage.com>
>> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
>> ---
>>  fs/btrfs/inode.c |   55 ++++++++++++++++++++++++++++++++++++++++-------------
>>  1 files changed, 41 insertions(+), 14 deletions(-)
>>
>> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
>> index 67ed24a..86f1d25 100644
>> --- a/fs/btrfs/inode.c
>> +++ b/fs/btrfs/inode.c
>> @@ -7545,41 +7545,61 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
>>   */
>>  int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
>>  {
>> -	struct list_head *head = &root->fs_info->delalloc_inodes;
>>  	struct btrfs_inode *binode;
>>  	struct inode *inode;
>>  	struct btrfs_delalloc_work *work, *next;
>>  	struct list_head works;
>> +	struct list_head splice;
>>  	int ret = 0;
>>  
>>  	if (root->fs_info->sb->s_flags & MS_RDONLY)
>>  		return -EROFS;
>>  
>>  	INIT_LIST_HEAD(&works);
>> -
>> +	INIT_LIST_HEAD(&splice);
>> +again:
>>  	spin_lock(&root->fs_info->delalloc_lock);
>> -	while (!list_empty(head)) {
>> -		binode = list_entry(head->next, struct btrfs_inode,
>> +	list_splice_init(&root->fs_info->delalloc_inodes, &splice);
>> +	while (!list_empty(&splice)) {
>> +		binode = list_entry(splice.next, struct btrfs_inode,
>>  				    delalloc_inodes);
>> +
>> +		list_del_init(&binode->delalloc_inodes);
>> +
> 
> I believe this patch can work well, but it's a little complex.
> 
> How about adding a flag to the runtime_flags set instead?

I have tried adding a flag in runtime_flags, but I found it is not a good
way, because:
- it cannot avoid traversing the delalloc list repeatedly when someone
  writes data into the file endlessly. In fact, that is unnecessary, because
  we can treat such data as data written after the flush is done.
- bit operations need to lock the bus, but we already have a spin lock
  protecting all the related variables, so the atomic operation is redundant
  (see the sketch below).
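
As a minimal illustration of that second point (a sketch only - the
BTRFS_INODE_FLUSH flag is the one proposed above, and the exact cost of the
bus lock depends on the architecture):

	/* atomic read-modify-write (e.g. a LOCK-prefixed instruction
	 * on x86), even though delalloc_lock is already held: */
	set_bit(BTRFS_INODE_FLUSH, &BTRFS_I(inode)->runtime_flags);

	/* plain list manipulation, already serialized by
	 * delalloc_lock, needs no extra atomicity: */
	list_del_init(&BTRFS_I(inode)->delalloc_inodes);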

Besides that, there is something wrong with the following patch.

> We can use the flag instead of the 'delalloc_inodes' list to tell whether
> we have cleared the delalloc bytes, and most importantly it won't touch
> the original code logic too much.
> 
> thanks,
> liubo
> 
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 67ed24a..692ed0e 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -1555,8 +1555,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
>  		BTRFS_I(inode)->delalloc_bytes -= len;
>  
>  		if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
> -		    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
> -			list_del_init(&BTRFS_I(inode)->delalloc_inodes);
> +		    test_bit(BTRFS_INODE_FLUSH, &BTRFS_I(inode)->runtime_flags)) {
> +			clear_bit(BTRFS_INODE_FLUSH, &BTRFS_I(inode)->runtime_flags);
>  		}

We cannot remove list_del_init(), because the delalloc file can be flushed not
only by btrfs_start_delalloc_inodes(), but also by the flusher thread or a task
that invokes btrfs_sync_file().

>  		spin_unlock(&root->fs_info->delalloc_lock);
>  	}
> @@ -7562,8 +7562,9 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
>  		binode = list_entry(head->next, struct btrfs_inode,
>  				    delalloc_inodes);
>  		inode = igrab(&binode->vfs_inode);
> -		if (!inode)
> -			list_del_init(&binode->delalloc_inodes);
> +
> +		list_del_init(&binode->delalloc_inodes);
> +
>  		spin_unlock(&root->fs_info->delalloc_lock);
>  		if (inode) {
>  			work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
> @@ -7572,6 +7573,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
>  				goto out;
>  			}
>  			list_add_tail(&work->list, &works);
> +			set_bit(BTRFS_INODE_FLUSH, &binode->runtime_flags);

If someone flushes the file before set_bit(), no one will clear the bit.
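
For example, one possible interleaving (a hypothetical timeline, simplified):

	other flusher / fsync task         btrfs_start_delalloc_inodes()
	--------------------------         -----------------------------
	writes the delalloc pages back
	delalloc_bytes drops to 0
	test_bit(BTRFS_INODE_FLUSH) is
	false, so nothing is cleared
	                                   set_bit(BTRFS_INODE_FLUSH)

and the bit is now set but will never be cleared.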

>  			btrfs_queue_worker(&root->fs_info->flush_workers,
>  					   &work->work);
>  		}
> @@ -7580,6 +7582,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
>  	}
>  	spin_unlock(&root->fs_info->delalloc_lock);
>  
> +	/* make sure we clear all delalloc bytes we have scheduled */
> +	while (!list_empty(&works)) {
> +		work = list_entry(works.next, struct btrfs_delalloc_work,
> +				  list);
> +		binode = btrfs_ino(work->inode);

			^^^^^^BTRFS_I(), not btrfs_ino()

> +		if (!test_bit(BTRFS_INODE_FLUSH, &binode->runtime_flags)) {
> +			list_del_init(&work->list);
> +			btrfs_wait_and_free_delalloc_work(work);

We must wait for and free all the delalloc work here, or a memory leak will happen.

Thanks
Miao

> +		}
> +		cond_resched();
> +	}
> +
>  	/* the filemap_flush will queue IO into the worker threads, but
>  	 * we have to make sure the IO is actually started and that
>  	 * ordered extents get created before we return
> 
> 
> 
>>  		inode = igrab(&binode->vfs_inode);
>>  		if (!inode)
>> -			list_del_init(&binode->delalloc_inodes);
>> +			continue;
>> +
>> +		list_add_tail(&binode->delalloc_inodes,
>> +			      &root->fs_info->delalloc_inodes);
>>  		spin_unlock(&root->fs_info->delalloc_lock);
>> -		if (inode) {
>> -			work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
>> -			if (!work) {
>> -				ret = -ENOMEM;
>> -				goto out;
>> -			}
>> -			list_add_tail(&work->list, &works);
>> -			btrfs_queue_worker(&root->fs_info->flush_workers,
>> -					   &work->work);
>> +
>> +		work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
>> +		if (unlikely(!work)) {
>> +			ret = -ENOMEM;
>> +			goto out;
>>  		}
>> +		list_add_tail(&work->list, &works);
>> +		btrfs_queue_worker(&root->fs_info->flush_workers,
>> +				   &work->work);
>> +
>>  		cond_resched();
>>  		spin_lock(&root->fs_info->delalloc_lock);
>>  	}
>>  	spin_unlock(&root->fs_info->delalloc_lock);
>>  
>> +	list_for_each_entry_safe(work, next, &works, list) {
>> +		list_del_init(&work->list);
>> +		btrfs_wait_and_free_delalloc_work(work);
>> +	}
>> +
>> +	spin_lock(&root->fs_info->delalloc_lock);
>> +	if (!list_empty(&root->fs_info->delalloc_inodes)) {
>> +		spin_unlock(&root->fs_info->delalloc_lock);
>> +		goto again;
>> +	}
>> +	spin_unlock(&root->fs_info->delalloc_lock);
>> +
>>  	/* the filemap_flush will queue IO into the worker threads, but
>>  	 * we have to make sure the IO is actually started and that
>>  	 * ordered extents get created before we return
>> @@ -7592,11 +7612,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
>>  		    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
>>  	}
>>  	atomic_dec(&root->fs_info->async_submit_draining);
>> +	return 0;
>>  out:
>>  	list_for_each_entry_safe(work, next, &works, list) {
>>  		list_del_init(&work->list);
>>  		btrfs_wait_and_free_delalloc_work(work);
>>  	}
>> +
>> +	if (!list_empty_careful(&splice)) {
>> +		spin_lock(&root->fs_info->delalloc_lock);
>> +		list_splice_tail(&splice, &root->fs_info->delalloc_inodes);
>> +		spin_unlock(&root->fs_info->delalloc_lock);
>> +	}
>>  	return ret;
>>  }
>>  
>> -- 
>> 1.6.5.2
> 



* Re: [PATCH 1/5] Btrfs: fix repeated delalloc work allocation
  2013-01-23  2:54   ` Miao Xie
@ 2013-01-23  3:56     ` Liu Bo
  2013-01-23  4:44       ` Miao Xie
From: Liu Bo @ 2013-01-23  3:56 UTC
  To: Miao Xie; +Cc: Linux Btrfs, Alex Lyakas

On Wed, Jan 23, 2013 at 10:54:39AM +0800, Miao Xie wrote:
> On Tue, 22 Jan 2013 22:24:15 +0800, Liu Bo wrote:
> > On Tue, Jan 22, 2013 at 06:49:00PM +0800, Miao Xie wrote:
> >> btrfs_start_delalloc_inodes() locks the delalloc_inodes list, fetches the
> >> first inode, unlocks the list, triggers btrfs_alloc_delalloc_work/
> >> btrfs_queue_worker for this inode, and then locks the list again and checks
> >> the head of the list. But because the first inode is never deleted from the
> >> list, the function fetches the same inode again and again. As a result, it
> >> allocates a huge number of btrfs_delalloc_work structures until OOM happens.
> >>
> >> Fix this problem by splicing the delalloc list onto a local list.
> >>
> >> Reported-by: Alex Lyakas <alex.btrfs@zadarastorage.com>
> >> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
> >> ---
> >>  fs/btrfs/inode.c |   55 ++++++++++++++++++++++++++++++++++++++++-------------
> >>  1 files changed, 41 insertions(+), 14 deletions(-)
> >>
> >> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> >> index 67ed24a..86f1d25 100644
> >> --- a/fs/btrfs/inode.c
> >> +++ b/fs/btrfs/inode.c
> >> @@ -7545,41 +7545,61 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
> >>   */
> >>  int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
> >>  {
> >> -	struct list_head *head = &root->fs_info->delalloc_inodes;
> >>  	struct btrfs_inode *binode;
> >>  	struct inode *inode;
> >>  	struct btrfs_delalloc_work *work, *next;
> >>  	struct list_head works;
> >> +	struct list_head splice;
> >>  	int ret = 0;
> >>  
> >>  	if (root->fs_info->sb->s_flags & MS_RDONLY)
> >>  		return -EROFS;
> >>  
> >>  	INIT_LIST_HEAD(&works);
> >> -
> >> +	INIT_LIST_HEAD(&splice);
> >> +again:
> >>  	spin_lock(&root->fs_info->delalloc_lock);
> >> -	while (!list_empty(head)) {
> >> -		binode = list_entry(head->next, struct btrfs_inode,
> >> +	list_splice_init(&root->fs_info->delalloc_inodes, &splice);
> >> +	while (!list_empty(&splice)) {
> >> +		binode = list_entry(splice.next, struct btrfs_inode,
> >>  				    delalloc_inodes);
> >> +
> >> +		list_del_init(&binode->delalloc_inodes);
> >> +
> > 
> > I believe this patch can work well, but it's a little complex.
> > 
> > How about adding a flag to the runtime_flags set instead?
> 
> I have tried adding a flag in runtime_flags, but I found it is not a good
> way, because:
> - it cannot avoid traversing the delalloc list repeatedly when someone
>   writes data into the file endlessly. In fact, that is unnecessary, because
>   we can treat such data as data written after the flush is done.
> - bit operations need to lock the bus, but we already have a spin lock
>   protecting all the related variables, so the atomic operation is redundant.
> 
> Besides that, there is something wrong with the following patch.

Okay, I see the problem.

But with [PATCH 4/5], I think maybe we can merge these two patches and
simplify things as follows?

Just flush them once:

	spin_lock(&root->fs_info->delalloc_lock);
	list_splice_init(&root->fs_info->delalloc_inodes, &splice);
	spin_unlock(&root->fs_info->delalloc_lock);

	while (!list_empty(&splice)) {
		...
	}

thanks,
liubo

> 
> > We can use the flag instead of the 'delalloc_inodes' list to tell whether
> > we have cleared the delalloc bytes, and most importantly it won't touch
> > the original code logic too much.
> > 
> > thanks,
> > liubo
> > 
> > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> > index 67ed24a..692ed0e 100644
> > --- a/fs/btrfs/inode.c
> > +++ b/fs/btrfs/inode.c
> > @@ -1555,8 +1555,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
> >  		BTRFS_I(inode)->delalloc_bytes -= len;
> >  
> >  		if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
> > -		    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
> > -			list_del_init(&BTRFS_I(inode)->delalloc_inodes);
> > +		    test_bit(BTRFS_INODE_FLUSH, &BTRFS_I(inode)->runtime_flags)) {
> > +			clear_bit(BTRFS_INODE_FLUSH, &BTRFS_I(inode)->runtime_flags);
> >  		}
> 
> We cannot remove list_del_init(), because the delalloc file can be flushed not
> only by btrfs_start_delalloc_inodes(), but also by the flusher thread or a task
> that invokes btrfs_sync_file().
> 
> >  		spin_unlock(&root->fs_info->delalloc_lock);
> >  	}
> > @@ -7562,8 +7562,9 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
> >  		binode = list_entry(head->next, struct btrfs_inode,
> >  				    delalloc_inodes);
> >  		inode = igrab(&binode->vfs_inode);
> > -		if (!inode)
> > -			list_del_init(&binode->delalloc_inodes);
> > +
> > +		list_del_init(&binode->delalloc_inodes);
> > +
> >  		spin_unlock(&root->fs_info->delalloc_lock);
> >  		if (inode) {
> >  			work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
> > @@ -7572,6 +7573,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
> >  				goto out;
> >  			}
> >  			list_add_tail(&work->list, &works);
> > +			set_bit(BTRFS_INODE_FLUSH, &binode->runtime_flags);
> 
> If someone flushes the file before set_bit(), no one will clear the bit.
> 
> >  			btrfs_queue_worker(&root->fs_info->flush_workers,
> >  					   &work->work);
> >  		}
> > @@ -7580,6 +7582,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
> >  	}
> >  	spin_unlock(&root->fs_info->delalloc_lock);
> >  
> > +	/* make sure we clear all delalloc bytes we have scheduled */
> > +	while (!list_empty(&works)) {
> > +		work = list_entry(works.next, struct btrfs_delalloc_work,
> > +				  list);
> > +		binode = btrfs_ino(work->inode);
> 
> 			^^^^^^BTRFS_I(), not btrfs_ino()
> 
> > +		if (!test_bit(BTRFS_INODE_FLUSH, &binode->runtime_flags)) {
> > +			list_del_init(&work->list);
> > +			btrfs_wait_and_free_delalloc_work(work);
> 
> We must wait for and free all the delalloc work here, or a memory leak will happen.
> 
> Thanks
> Miao
> 
> > +		}
> > +		cond_resched();
> > +	}
> > +
> >  	/* the filemap_flush will queue IO into the worker threads, but
> >  	 * we have to make sure the IO is actually started and that
> >  	 * ordered extents get created before we return
> > 
> > 
> > 
> >>  		inode = igrab(&binode->vfs_inode);
> >>  		if (!inode)
> >> -			list_del_init(&binode->delalloc_inodes);
> >> +			continue;
> >> +
> >> +		list_add_tail(&binode->delalloc_inodes,
> >> +			      &root->fs_info->delalloc_inodes);
> >>  		spin_unlock(&root->fs_info->delalloc_lock);
> >> -		if (inode) {
> >> -			work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
> >> -			if (!work) {
> >> -				ret = -ENOMEM;
> >> -				goto out;
> >> -			}
> >> -			list_add_tail(&work->list, &works);
> >> -			btrfs_queue_worker(&root->fs_info->flush_workers,
> >> -					   &work->work);
> >> +
> >> +		work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
> >> +		if (unlikely(!work)) {
> >> +			ret = -ENOMEM;
> >> +			goto out;
> >>  		}
> >> +		list_add_tail(&work->list, &works);
> >> +		btrfs_queue_worker(&root->fs_info->flush_workers,
> >> +				   &work->work);
> >> +
> >>  		cond_resched();
> >>  		spin_lock(&root->fs_info->delalloc_lock);
> >>  	}
> >>  	spin_unlock(&root->fs_info->delalloc_lock);
> >>  
> >> +	list_for_each_entry_safe(work, next, &works, list) {
> >> +		list_del_init(&work->list);
> >> +		btrfs_wait_and_free_delalloc_work(work);
> >> +	}
> >> +
> >> +	spin_lock(&root->fs_info->delalloc_lock);
> >> +	if (!list_empty(&root->fs_info->delalloc_inodes)) {
> >> +		spin_unlock(&root->fs_info->delalloc_lock);
> >> +		goto again;
> >> +	}
> >> +	spin_unlock(&root->fs_info->delalloc_lock);
> >> +
> >>  	/* the filemap_flush will queue IO into the worker threads, but
> >>  	 * we have to make sure the IO is actually started and that
> >>  	 * ordered extents get created before we return
> >> @@ -7592,11 +7612,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
> >>  		    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
> >>  	}
> >>  	atomic_dec(&root->fs_info->async_submit_draining);
> >> +	return 0;
> >>  out:
> >>  	list_for_each_entry_safe(work, next, &works, list) {
> >>  		list_del_init(&work->list);
> >>  		btrfs_wait_and_free_delalloc_work(work);
> >>  	}
> >> +
> >> +	if (!list_empty_careful(&splice)) {
> >> +		spin_lock(&root->fs_info->delalloc_lock);
> >> +		list_splice_tail(&splice, &root->fs_info->delalloc_inodes);
> >> +		spin_unlock(&root->fs_info->delalloc_lock);
> >> +	}
> >>  	return ret;
> >>  }
> >>  
> >> -- 
> >> 1.6.5.2
> > 
> 


* Re: [PATCH 1/5] Btrfs: fix repeated delalloc work allocation
  2013-01-23  3:56     ` Liu Bo
@ 2013-01-23  4:44       ` Miao Xie
  2013-01-23  6:06         ` Liu Bo
From: Miao Xie @ 2013-01-23  4:44 UTC
  To: bo.li.liu; +Cc: Linux Btrfs, Alex Lyakas

On Wed, 23 Jan 2013 11:56:55 +0800, Liu Bo wrote:
> On Wed, Jan 23, 2013 at 10:54:39AM +0800, Miao Xie wrote:
>> On Tue, 22 Jan 2013 22:24:15 +0800, Liu Bo wrote:
>>> On Tue, Jan 22, 2013 at 06:49:00PM +0800, Miao Xie wrote:
>>>> btrfs_start_delalloc_inodes() locks the delalloc_inodes list, fetches the
>>>> first inode, unlocks the list, triggers btrfs_alloc_delalloc_work/
>>>> btrfs_queue_worker for this inode, and then locks the list again and checks
>>>> the head of the list. But because the first inode is never deleted from the
>>>> list, the function fetches the same inode again and again. As a result, it
>>>> allocates a huge number of btrfs_delalloc_work structures until OOM happens.
>>>>
>>>> Fix this problem by splicing the delalloc list onto a local list.
>>>>
>>>> Reported-by: Alex Lyakas <alex.btrfs@zadarastorage.com>
>>>> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
>>>> ---
>>>>  fs/btrfs/inode.c |   55 ++++++++++++++++++++++++++++++++++++++++-------------
>>>>  1 files changed, 41 insertions(+), 14 deletions(-)
>>>>
>>>> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
>>>> index 67ed24a..86f1d25 100644
>>>> --- a/fs/btrfs/inode.c
>>>> +++ b/fs/btrfs/inode.c
>>>> @@ -7545,41 +7545,61 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
>>>>   */
>>>>  int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
>>>>  {
>>>> -	struct list_head *head = &root->fs_info->delalloc_inodes;
>>>>  	struct btrfs_inode *binode;
>>>>  	struct inode *inode;
>>>>  	struct btrfs_delalloc_work *work, *next;
>>>>  	struct list_head works;
>>>> +	struct list_head splice;
>>>>  	int ret = 0;
>>>>  
>>>>  	if (root->fs_info->sb->s_flags & MS_RDONLY)
>>>>  		return -EROFS;
>>>>  
>>>>  	INIT_LIST_HEAD(&works);
>>>> -
>>>> +	INIT_LIST_HEAD(&splice);
>>>> +again:
>>>>  	spin_lock(&root->fs_info->delalloc_lock);
>>>> -	while (!list_empty(head)) {
>>>> -		binode = list_entry(head->next, struct btrfs_inode,
>>>> +	list_splice_init(&root->fs_info->delalloc_inodes, &splice);
>>>> +	while (!list_empty(&splice)) {
>>>> +		binode = list_entry(splice.next, struct btrfs_inode,
>>>>  				    delalloc_inodes);
>>>> +
>>>> +		list_del_init(&binode->delalloc_inodes);
>>>> +
>>>
>>> I believe this patch can work well, but it's a little complex.
>>>
>>> How about adding a flag to the runtime_flags set instead?
>>
>> I have tried adding a flag in runtime_flags, but I found it is not a good
>> way, because:
>> - it cannot avoid traversing the delalloc list repeatedly when someone
>>   writes data into the file endlessly. In fact, that is unnecessary, because
>>   we can treat such data as data written after the flush is done.
>> - bit operations need to lock the bus, but we already have a spin lock
>>   protecting all the related variables, so the atomic operation is redundant.
>>
>> Besides that, there is something wrong with the following patch.
> 
> Okay, I see the problem.
> 
> But with [PATCH 4/5], I think maybe we can merge these two patches and
> simplify things as follows?
> 
> Just flush them once:
> 
> 	spin_lock(&root->fs_info->delalloc_lock);
> 	list_splice_init(&root->fs_info->delalloc_inodes, &splice);
> 	spin_unlock(&root->fs_info->delalloc_lock);
> 
> 	while (!list_empty(&splice)) {
> 		...
> 	}

No, we can't. Other tasks that flush the delalloc data may remove the inode
from the delalloc list/splice list. If we release the lock, we will hit a race
between the list traversal and list_del().
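
A hypothetical interleaving (simplified) showing that race:

	lock-dropping traverser            another flushing task
	-----------------------            ---------------------
	binode = first entry on splice
	                                   finishes writeback of binode,
	                                   list_del_init(&binode->delalloc_inodes)
	next = binode->delalloc_inodes.next
	/* list_del_init() makes the entry point to itself, so 'next' is
	 * binode again and the rest of the splice list is lost */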

Thanks
Miao

> 
> thanks,
> liubo
> 
>>
>>> We can use the flag instead of the 'delalloc_inodes' list to tell whether
>>> we have cleared the delalloc bytes, and most importantly it won't touch
>>> the original code logic too much.
>>>
>>> thanks,
>>> liubo
>>>
>>> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
>>> index 67ed24a..692ed0e 100644
>>> --- a/fs/btrfs/inode.c
>>> +++ b/fs/btrfs/inode.c
>>> @@ -1555,8 +1555,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
>>>  		BTRFS_I(inode)->delalloc_bytes -= len;
>>>  
>>>  		if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
>>> -		    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
>>> -			list_del_init(&BTRFS_I(inode)->delalloc_inodes);
>>> +		    test_bit(BTRFS_INODE_FLUSH, &BTRFS_I(inode)->runtime_flags)) {
>>> +			clear_bit(BTRFS_INODE_FLUSH, &BTRFS_I(inode)->runtime_flags);
>>>  		}
>>
>> We cannot remove list_del_init(), because the delalloc file can be flushed not
>> only by btrfs_start_delalloc_inodes(), but also by the flusher thread or a task
>> that invokes btrfs_sync_file().
>>
>>>  		spin_unlock(&root->fs_info->delalloc_lock);
>>>  	}
>>> @@ -7562,8 +7562,9 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
>>>  		binode = list_entry(head->next, struct btrfs_inode,
>>>  				    delalloc_inodes);
>>>  		inode = igrab(&binode->vfs_inode);
>>> -		if (!inode)
>>> -			list_del_init(&binode->delalloc_inodes);
>>> +
>>> +		list_del_init(&binode->delalloc_inodes);
>>> +
>>>  		spin_unlock(&root->fs_info->delalloc_lock);
>>>  		if (inode) {
>>>  			work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
>>> @@ -7572,6 +7573,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
>>>  				goto out;
>>>  			}
>>>  			list_add_tail(&work->list, &works);
>>> +			set_bit(BTRFS_INODE_FLUSH, &binode->runtime_flags);
>>
>> If someone flushes the file before set_bit(), no one will clear the bit.
>>
>>>  			btrfs_queue_worker(&root->fs_info->flush_workers,
>>>  					   &work->work);
>>>  		}
>>> @@ -7580,6 +7582,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
>>>  	}
>>>  	spin_unlock(&root->fs_info->delalloc_lock);
>>>  
>>> +	/* make sure we clear all delalloc bytes we have scheduled */
>>> +	while (!list_empty(&works)) {
>>> +		work = list_entry(works.next, struct btrfs_delalloc_work,
>>> +				  list);
>>> +		binode = btrfs_ino(work->inode);
>>
>> 			^^^^^^BTRFS_I(), not btrfs_ino()
>>
>>> +		if (!test_bit(BTRFS_INODE_FLUSH, &binode->runtime_flags)) {
>>> +			list_del_init(&work->list);
>>> +			btrfs_wait_and_free_delalloc_work(work);
>>
>> We must wait for and free all the delalloc work here, or a memory leak will happen.
>>
>> Thanks
>> Miao
>>
>>> +		}
>>> +		cond_resched();
>>> +	}
>>> +
>>>  	/* the filemap_flush will queue IO into the worker threads, but
>>>  	 * we have to make sure the IO is actually started and that
>>>  	 * ordered extents get created before we return
>>>
>>>
>>>
>>>>  		inode = igrab(&binode->vfs_inode);
>>>>  		if (!inode)
>>>> -			list_del_init(&binode->delalloc_inodes);
>>>> +			continue;
>>>> +
>>>> +		list_add_tail(&binode->delalloc_inodes,
>>>> +			      &root->fs_info->delalloc_inodes);
>>>>  		spin_unlock(&root->fs_info->delalloc_lock);
>>>> -		if (inode) {
>>>> -			work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
>>>> -			if (!work) {
>>>> -				ret = -ENOMEM;
>>>> -				goto out;
>>>> -			}
>>>> -			list_add_tail(&work->list, &works);
>>>> -			btrfs_queue_worker(&root->fs_info->flush_workers,
>>>> -					   &work->work);
>>>> +
>>>> +		work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
>>>> +		if (unlikely(!work)) {
>>>> +			ret = -ENOMEM;
>>>> +			goto out;
>>>>  		}
>>>> +		list_add_tail(&work->list, &works);
>>>> +		btrfs_queue_worker(&root->fs_info->flush_workers,
>>>> +				   &work->work);
>>>> +
>>>>  		cond_resched();
>>>>  		spin_lock(&root->fs_info->delalloc_lock);
>>>>  	}
>>>>  	spin_unlock(&root->fs_info->delalloc_lock);
>>>>  
>>>> +	list_for_each_entry_safe(work, next, &works, list) {
>>>> +		list_del_init(&work->list);
>>>> +		btrfs_wait_and_free_delalloc_work(work);
>>>> +	}
>>>> +
>>>> +	spin_lock(&root->fs_info->delalloc_lock);
>>>> +	if (!list_empty(&root->fs_info->delalloc_inodes)) {
>>>> +		spin_unlock(&root->fs_info->delalloc_lock);
>>>> +		goto again;
>>>> +	}
>>>> +	spin_unlock(&root->fs_info->delalloc_lock);
>>>> +
>>>>  	/* the filemap_flush will queue IO into the worker threads, but
>>>>  	 * we have to make sure the IO is actually started and that
>>>>  	 * ordered extents get created before we return
>>>> @@ -7592,11 +7612,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
>>>>  		    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
>>>>  	}
>>>>  	atomic_dec(&root->fs_info->async_submit_draining);
>>>> +	return 0;
>>>>  out:
>>>>  	list_for_each_entry_safe(work, next, &works, list) {
>>>>  		list_del_init(&work->list);
>>>>  		btrfs_wait_and_free_delalloc_work(work);
>>>>  	}
>>>> +
>>>> +	if (!list_empty_careful(&splice)) {
>>>> +		spin_lock(&root->fs_info->delalloc_lock);
>>>> +		list_splice_tail(&splice, &root->fs_info->delalloc_inodes);
>>>> +		spin_unlock(&root->fs_info->delalloc_lock);
>>>> +	}
>>>>  	return ret;
>>>>  }
>>>>  
>>>> -- 
>>>> 1.6.5.2
>>>
>>
> 



* Re: [PATCH 1/5] Btrfs: fix repeated delalloc work allocation
  2013-01-23  4:44       ` Miao Xie
@ 2013-01-23  6:06         ` Liu Bo
  2013-01-23  6:33           ` Miao Xie
From: Liu Bo @ 2013-01-23  6:06 UTC
  To: Miao Xie; +Cc: Linux Btrfs, Alex Lyakas

On Wed, Jan 23, 2013 at 12:44:49PM +0800, Miao Xie wrote:
> No, we can't. Other tasks that flush the delalloc data may remove the inode
> from the delalloc list/splice list. If we release the lock, we will hit a race
> between the list traversal and list_del().

OK, then please merge patches 1 and 4 so that we have at least one less
patch to backport.

thanks,
liubo


* Re: [PATCH 1/5] Btrfs: fix repeated delalloc work allocation
  2013-01-23  6:06         ` Liu Bo
@ 2013-01-23  6:33           ` Miao Xie
  2013-01-23  8:17             ` Liu Bo
From: Miao Xie @ 2013-01-23  6:33 UTC
  To: bo.li.liu; +Cc: Linux Btrfs, Alex Lyakas

On Wed, 23 Jan 2013 14:06:21 +0800, Liu Bo wrote:
> On Wed, Jan 23, 2013 at 12:44:49PM +0800, Miao Xie wrote:
>> No, we can't. Other tasks that flush the delalloc data may remove the inode
>> from the delalloc list/splice list. If we release the lock, we will hit a race
>> between the list traversal and list_del().
> 
> OK, then please merge patches 1 and 4 so that we have at least one less
> patch to backport.

I don't think we should merge these two patches, because they do two different things - one
is a bug fix, and the other is just an improvement, and that improvement changes the logic
of the code and might be controversial for some developers. So two patches are better than
one, I think.

Thanks
Miao



* Re: [PATCH 1/5] Btrfs: fix repeated delalloc work allocation
  2013-01-23  6:33           ` Miao Xie
@ 2013-01-23  8:17             ` Liu Bo
  2013-01-23  8:58               ` Miao Xie
From: Liu Bo @ 2013-01-23  8:17 UTC
  To: Miao Xie; +Cc: Linux Btrfs, Alex Lyakas

On Wed, Jan 23, 2013 at 02:33:27PM +0800, Miao Xie wrote:
> On Wed, 23 Jan 2013 14:06:21 +0800, Liu Bo wrote:
> > On Wed, Jan 23, 2013 at 12:44:49PM +0800, Miao Xie wrote:
> >> No, we can't. Other tasks that flush the delalloc data may remove the inode
> >> from the delalloc list/splice list. If we release the lock, we will hit a race
> >> between the list traversal and list_del().
> > 
> > OK, then please merge patches 1 and 4 so that we have at least one less
> > patch to backport.
> 
> I don't think we should merge these two patches, because they do two different things - one
> is a bug fix, and the other is just an improvement, and that improvement changes the logic
> of the code and might be controversial for some developers. So two patches are better than
> one, I think.

Sorry, this is right only when patch 1 really fixes the problem Alex reported.

But the fact is

1)  patch 1 is not enough to fix the bug; it just fixes the OOM of
allocating 'struct btrfs_delalloc_work' while the original OOM of allocating
requests remains.  We can still get the same inode over and over again
and then get stuck in btrfs_start_delalloc_inodes(), because we 'goto again'
to make sure we flush all inodes listed in fs_info->delalloc_inodes.

2)  patch 4 fixes 1)'s problems by removing 'goto again'.

Am I missing something?

thanks,
liubo


* Re: [PATCH 1/5] Btrfs: fix repeated delalloc work allocation
  2013-01-23  8:17             ` Liu Bo
@ 2013-01-23  8:58               ` Miao Xie
  2013-01-23  9:21                 ` Alex Lyakas
  2013-01-23  9:52                 ` Liu Bo
From: Miao Xie @ 2013-01-23  8:58 UTC
  To: bo.li.liu; +Cc: Linux Btrfs, Alex Lyakas

On Wed, 23 Jan 2013 16:17:53 +0800, Liu Bo wrote:
> On Wed, Jan 23, 2013 at 02:33:27PM +0800, Miao Xie wrote:
>> On Wed, 23 Jan 2013 14:06:21 +0800, Liu Bo wrote:
>>> On Wed, Jan 23, 2013 at 12:44:49PM +0800, Miao Xie wrote:
>>>> No, we can't. Other tasks that flush the delalloc data may remove the inode
>>>> from the delalloc list/splice list. If we release the lock, we will hit a race
>>>> between the list traversal and list_del().
>>>
>>> OK, then please merge patches 1 and 4 so that we have at least one less
>>> patch to backport.
>>
>> I don't think we should merge these two patches, because they do two different things - one
>> is a bug fix, and the other is just an improvement, and that improvement changes the logic
>> of the code and might be controversial for some developers. So two patches are better than
>> one, I think.
> 
> Sorry, this is right only when patch 1 really fixes the problem Alex reported.
> 
> But the fact is
> 
> 1)  patch 1 is not enough to fix the bug; it just fixes the OOM of
> allocating 'struct btrfs_delalloc_work' while the original OOM of allocating
> requests remains.  We can still get the same inode over and over again
> and then get stuck in btrfs_start_delalloc_inodes(), because we 'goto again'
> to make sure we flush all inodes listed in fs_info->delalloc_inodes.
> 
> 2)  patch 4 fixes 1)'s problems by removing 'goto again'.
> 
> Am I missing something?

In fact, there are two different problems.
One is the OOM bug. It is a serious problem and also a regression, so we should fix it
as soon as possible.
The other one is that we may fetch the same inode again and again if someone writes data
into it endlessly. This problem is not so urgent to deal with, and perhaps some developers
think it is not a problem at all, since we should flush that inode as long as there are
dirty pages in it. So we need to split it from the patch for the 1st problem.

Thanks
Miao


* Re: [PATCH 1/5] Btrfs: fix repeated delalloc work allocation
  2013-01-23  8:58               ` Miao Xie
@ 2013-01-23  9:21                 ` Alex Lyakas
  2013-01-23  9:52                 ` Liu Bo
From: Alex Lyakas @ 2013-01-23  9:21 UTC
  To: miaox; +Cc: bo.li.liu, Linux Btrfs

Hi Miao,
thank you for addressing the issue. I will try out this 5-patch set
and let you know what I see. I will apply it on top of the latest
for-linus branch.

Thanks,
Alex.


On Wed, Jan 23, 2013 at 10:58 AM, Miao Xie <miaox@cn.fujitsu.com> wrote:
> On Wed, 23 Jan 2013 16:17:53 +0800, Liu Bo wrote:
>> On Wed, Jan 23, 2013 at 02:33:27PM +0800, Miao Xie wrote:
>>> On Wed, 23 Jan 2013 14:06:21 +0800, Liu Bo wrote:
>>>> On Wed, Jan 23, 2013 at 12:44:49PM +0800, Miao Xie wrote:
>>>>> No, we can't. Other tasks that flush the delalloc data may remove the inode
>>>>> from the delalloc list/splice list. If we release the lock, we will hit a race
>>>>> between the list traversal and list_del().
>>>>
>>>> OK, then please merge patches 1 and 4 so that we have at least one less
>>>> patch to backport.
>>>
>>> I don't think we should merge these two patches, because they do two different things - one
>>> is a bug fix, and the other is just an improvement, and that improvement changes the logic
>>> of the code and might be controversial for some developers. So two patches are better than
>>> one, I think.
>>
>> Sorry, this is right only when patch 1 really fixes the problem Alex reported.
>>
>> But the fact is
>>
>> 1)  patch 1 is not enough to fix the bug; it just fixes the OOM of
>> allocating 'struct btrfs_delalloc_work' while the original OOM of allocating
>> requests remains.  We can still get the same inode over and over again
>> and then get stuck in btrfs_start_delalloc_inodes(), because we 'goto again'
>> to make sure we flush all inodes listed in fs_info->delalloc_inodes.
>>
>> 2)  patch 4 fixes 1)'s problems by removing 'goto again'.
>>
>> Am I missing something?
>
> In fact, there are two different problems.
> One is the OOM bug. It is a serious problem and also a regression, so we should fix it
> as soon as possible.
> The other one is that we may fetch the same inode again and again if someone writes data
> into it endlessly. This problem is not so urgent to deal with, and perhaps some developers
> think it is not a problem at all, since we should flush that inode as long as there are
> dirty pages in it. So we need to split it from the patch for the 1st problem.
>
> Thanks
> Miao


* Re: [PATCH 1/5] Btrfs: fix repeated delalloc work allocation
  2013-01-23  8:58               ` Miao Xie
  2013-01-23  9:21                 ` Alex Lyakas
@ 2013-01-23  9:52                 ` Liu Bo
  2013-01-23 10:20                   ` Miao Xie
From: Liu Bo @ 2013-01-23  9:52 UTC
  To: Miao Xie; +Cc: Linux Btrfs, Alex Lyakas

On Wed, Jan 23, 2013 at 04:58:27PM +0800, Miao Xie wrote:
> On Wed, 23 Jan 2013 16:17:53 +0800, Liu Bo wrote:
> > On Wed, Jan 23, 2013 at 02:33:27PM +0800, Miao Xie wrote:
> >> On Wed, 23 Jan 2013 14:06:21 +0800, Liu Bo wrote:
> >>> On Wed, Jan 23, 2013 at 12:44:49PM +0800, Miao Xie wrote:
> >>>> No, we can't. Other tasks that flush the delalloc data may remove the inode
> >>>> from the delalloc list/splice list. If we release the lock, we will hit a race
> >>>> between the list traversal and list_del().
> >>>
> >>> OK, then please merge patches 1 and 4 so that we have at least one less
> >>> patch to backport.
> >>
> >> I don't think we should merge these two patches, because they do two different things - one
> >> is a bug fix, and the other is just an improvement, and that improvement changes the logic
> >> of the code and might be controversial for some developers. So two patches are better than
> >> one, I think.
> > 
> > Sorry, this is right only when patch 1 really fixes the problem Alex reported.
> > 
> > But the fact is
> > 
> > 1)  patch 1 is not enough to fix the bug; it just fixes the OOM of
> > allocating 'struct btrfs_delalloc_work' while the original OOM of allocating
> > requests remains.  We can still get the same inode over and over again
> > and then get stuck in btrfs_start_delalloc_inodes(), because we 'goto again'
> > to make sure we flush all inodes listed in fs_info->delalloc_inodes.
> > 
> > 2)  patch 4 fixes 1)'s problems by removing 'goto again'.
> > 
> > Am I missing something?
> 
> In fact, there are two different problems.
> One is the OOM bug. It is a serious problem and also a regression, so we should fix it
> as soon as possible.
> The other one is that we may fetch the same inode again and again if someone writes data
> into it endlessly. This problem is not so urgent to deal with, and perhaps some developers
> think it is not a problem at all, since we should flush that inode as long as there are
> dirty pages in it. So we need to split it from the patch for the 1st problem.

All right, I'm ok with this.

But the TWO different problems are both due to fetching the same inode more
than once, and the solutions are indeed the same, "fetch every inode on
the list just once", and they are in the same code.

It is definitely a bug/problem if any user complains about their box
getting stuck.  It is the KERNEL that should be blamed.

thanks,
liubo


* Re: [PATCH 1/5] Btrfs: fix repeated delalloc work allocation
  2013-01-23  9:52                 ` Liu Bo
@ 2013-01-23 10:20                   ` Miao Xie
  2013-01-23 17:02                     ` Alex Lyakas
From: Miao Xie @ 2013-01-23 10:20 UTC
  To: bo.li.liu; +Cc: Linux Btrfs, Alex Lyakas

On Wed, 23 Jan 2013 17:52:14 +0800, Liu Bo wrote:
> On Wed, Jan 23, 2013 at 04:58:27PM +0800, Miao Xie wrote:
>> On Wed, 23 Jan 2013 16:17:53 +0800, Liu Bo wrote:
>>> On Wed, Jan 23, 2013 at 02:33:27PM +0800, Miao Xie wrote:
>>>> On Wed, 23 Jan 2013 14:06:21 +0800, Liu Bo wrote:
>>>>> On Wed, Jan 23, 2013 at 12:44:49PM +0800, Miao Xie wrote:
>>>>>> No, we can't. Other tasks that flush the delalloc data may remove the inode
>>>>>> from the delalloc list/splice list. If we release the lock, we will hit a race
>>>>>> between the list traversal and list_del().
>>>>>
>>>>> OK, then please merge patches 1 and 4 so that we have at least one less
>>>>> patch to backport.
>>>>
>>>> I don't think we should merge these two patches, because they do two different things - one
>>>> is a bug fix, and the other is just an improvement, and that improvement changes the logic
>>>> of the code and might be controversial for some developers. So two patches are better than
>>>> one, I think.
>>>
>>> Sorry, this is right only when patch 1 really fixes the problem Alex reported.
>>>
>>> But the fact is
>>>
>>> 1)  patch 1 is not enough to fix the bug; it just fixes the OOM of
>>> allocating 'struct btrfs_delalloc_work' while the original OOM of allocating
>>> requests remains.  We can still get the same inode over and over again
>>> and then get stuck in btrfs_start_delalloc_inodes(), because we 'goto again'
>>> to make sure we flush all inodes listed in fs_info->delalloc_inodes.
>>>
>>> 2)  patch 4 fixes 1)'s problems by removing 'goto again'.
>>>
>>> Am I missing something?
>>
>> In fact, there are two different problems.
>> One is the OOM bug. It is a serious problem and also a regression, so we should fix it
>> as soon as possible.
>> The other one is that we may fetch the same inode again and again if someone writes data
>> into it endlessly. This problem is not so urgent to deal with, and perhaps some developers
>> think it is not a problem at all, since we should flush that inode as long as there are
>> dirty pages in it. So we need to split it from the patch for the 1st problem.
> 
> All right, I'm ok with this.
> 
> But the TWO different problems are both due to fetching the same inode more
> than once, and the solutions are indeed the same, "fetch every inode on
> the list just once", and they are in the same code.

I forgot to say that the first problem can happen even if no task writes data into
the file after we start to flush the delalloc inodes.

Thanks
Miao

> 
> It is definitely a bug/problem if any user complains about their box
> getting stuck.  It is the KERNEL that should be blamed.
> 
> thanks,
> liubo
> 



* Re: [PATCH 1/5] Btrfs: fix repeated delalloc work allocation
  2013-01-23 10:20                   ` Miao Xie
@ 2013-01-23 17:02                     ` Alex Lyakas
  2013-01-24  2:14                       ` Miao Xie
From: Alex Lyakas @ 2013-01-23 17:02 UTC
  To: miaox; +Cc: bo.li.liu, Linux Btrfs

Hi Miao,
I have tested your patch, and the two discussed issues (OOM and
handling the same inode more than once) are solved with it.

However, snap creation under IO still takes 5-10 minutes for me.
Basically, the situation is now similar to kernel 3.6, before your
change that pushed the work to the flush workers. I see that the
flush worker is stuck in one of two stacks like these:

[<ffffffff81301f1d>] get_request+0x14d/0x330
[<ffffffff813030a4>] blk_queue_bio+0x84/0x3b0
[<ffffffff812ff9a7>] generic_make_request.part.55+0x77/0xb0
[<ffffffff812fff48>] generic_make_request+0x68/0x70
[<ffffffff812fffcb>] submit_bio+0x7b/0x160
[<ffffffffa02fe76e>] submit_stripe_bio+0x5e/0x80 [btrfs]
[<ffffffffa03050db>] btrfs_map_bio+0x1cb/0x420 [btrfs]
[<ffffffffa02e1b14>] btrfs_submit_bio_hook+0xb4/0x1d0 [btrfs]
[<ffffffffa02f5b1a>] submit_one_bio+0x6a/0xa0 [btrfs]
[<ffffffffa02f99c4>] submit_extent_page.isra.38+0xe4/0x200 [btrfs]
[<ffffffffa02fa83c>] __extent_writepage+0x5dc/0x750 [btrfs]
[<ffffffffa02fac6a>] extent_write_cache_pages.isra.29.constprop.42+0x2ba/0x410 [btrfs]
[<ffffffffa02fb00e>] extent_writepages+0x4e/0x70 [btrfs]
[<ffffffffa02e0898>] btrfs_writepages+0x28/0x30 [btrfs]
[<ffffffff81136510>] do_writepages+0x20/0x40
[<ffffffff8112c341>] __filemap_fdatawrite_range+0x51/0x60
[<ffffffff8112cc3c>] filemap_flush+0x1c/0x20
[<ffffffffa02e416d>] btrfs_run_delalloc_work+0xad/0xe0 [btrfs]
[<ffffffffa0308c1f>] worker_loop+0x16f/0x5d0 [btrfs]
[<ffffffff8107ba50>] kthread+0xc0/0xd0
[<ffffffff8169d66c>] ret_from_fork+0x7c/0xb0
[<ffffffffffffffff>] 0xffffffffffffffff

or

[<ffffffff8112a58e>] sleep_on_page+0xe/0x20
[<ffffffff8112a577>] __lock_page+0x67/0x70
[<ffffffffa02fabe9>] extent_write_cache_pages.isra.29.constprop.42+0x239/0x410 [btrfs]
[<ffffffffa02fb00e>] extent_writepages+0x4e/0x70 [btrfs]
[<ffffffffa02e0898>] btrfs_writepages+0x28/0x30 [btrfs]
[<ffffffff81136510>] do_writepages+0x20/0x40
[<ffffffff8112c341>] __filemap_fdatawrite_range+0x51/0x60
[<ffffffff8112cc3c>] filemap_flush+0x1c/0x20
[<ffffffffa02e416d>] btrfs_run_delalloc_work+0xad/0xe0 [btrfs]
[<ffffffffa0308c1f>] worker_loop+0x16f/0x5d0 [btrfs]
[<ffffffff8107ba50>] kthread+0xc0/0xd0
[<ffffffff8169d66c>] ret_from_fork+0x7c/0xb0
[<ffffffffffffffff>] 0xffffffffffffffff

while btrfs_start_delalloc_inodes() waits for it to complete in the
commit thread. Also, for some reason, I have only one "flush_workers"
thread, so switching to another thread does not help in my case.

Another operation that takes time (up to one minute) is
btrfs_wait_ordered_extents(), which does a similar thing by switching
the work to the flush worker. In this case,
btrfs_start_ordered_extent() is stuck in a stack like the following:
[<ffffffffa04e6fc5>] btrfs_start_ordered_extent+0x85/0x130 [btrfs]
[<ffffffffa04e70e1>] btrfs_run_ordered_extent_work+0x71/0xd0 [btrfs]
[<ffffffffa04facef>] worker_loop+0x16f/0x5d0 [btrfs]
[<ffffffff8107ba50>] kthread+0xc0/0xd0
[<ffffffff8169d66c>] ret_from_fork+0x7c/0xb0
[<ffffffffffffffff>] 0xffffffffffffffff
where it waits for BTRFS_ORDERED_COMPLETE.

I have several questions on how to possibly address this issue:

- I see that btrfs_flush_all_pending_stuffs() is invoked at least
twice during each transaction commit. It may be invoked more than
twice if the do { } while loop that waits for writes performs more
than one iteration (see the sketch below). For me, each invocation
takes a lot of time because of btrfs_start_delalloc_inodes() and
btrfs_wait_ordered_extents(). Given the fixes you have made (handling
each inode only once), is it still necessary to call these functions
several times during the same commit?
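
For reference, a paraphrased sketch of the commit-time loop I mean
(reconstructed from memory of btrfs_commit_transaction() in this era,
so it may not match the exact source):

	do {
		joined = cur_trans->num_joined;
		/* each iteration flushes delalloc and ordered extents again */
		ret = btrfs_flush_all_pending_stuffs(trans, root);
		if (ret)
			goto cleanup_transaction;

		/* wait for concurrent writers before re-checking */
		prepare_to_wait(&cur_trans->writer_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		if (atomic_read(&cur_trans->num_writers) > 1)
			schedule_timeout(MAX_SCHEDULE_TIMEOUT);
		else if (should_grow)
			schedule_timeout(1);
		finish_wait(&cur_trans->writer_wait, &wait);
	} while (atomic_read(&cur_trans->num_writers) > 1 ||
		 (should_grow && cur_trans->num_joined != joined));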

- I see that during a commit without pending snapshots, these two
functions are not invoked at all:
	if (flush_on_commit || snap_pending) {
		ret = btrfs_start_delalloc_inodes(root, 1);
		if (ret)
			return ret;
		btrfs_wait_ordered_extents(root, 1);
	}
The FLUSHONCOMMIT mount option is normally *not set*; I see in the
wiki that it is "Not needed with recent kernels, as it's the normal
mode of operation."
Can you please explain why the delalloc flush is needed when there is
a pending snap, but not for a non-snap commit? Or please point me to
the code where I can better understand what a delalloc inode is and
how it is related to FLUSHONCOMMIT or snapshots. Can you please
explain what the FLUSHONCOMMIT mount option is about?

I understand from your explanation that without flushing the delalloc,
the data in the snap may be different from the subvolume data. But
this may happen anyway, since the IO to the subvolume is not blocked
during the transaction commit (at least it does not look like it is
blocked).

- Is there something that user-space can do to avoid flushing the
delalloc during snap-commit? For example, if user-space stops the
IO and does a normal commit, this will not call
btrfs_start_delalloc_inodes(), but it should ensure that *some* data
is safe on disk. Then user-space triggers snap creation (which is
another commit), and then resumes the IO? Because without the ongoing
IO, snap creation is very fast.

Thanks for your help,
Alex.

P.S.: As I am writing this email, snap creation has been stuck for 30
minutes, calling btrfs_flush_all_pending_stuffs() again and again...




On Wed, Jan 23, 2013 at 12:20 PM, Miao Xie <miaox@cn.fujitsu.com> wrote:
> On Wed, 23 Jan 2013 17:52:14 +0800, Liu Bo wrote:
>> On Wed, Jan 23, 2013 at 04:58:27PM +0800, Miao Xie wrote:
>>> On wed, 23 Jan 2013 16:17:53 +0800, Liu Bo wrote:
>>>> On Wed, Jan 23, 2013 at 02:33:27PM +0800, Miao Xie wrote:
>>>>> On wed, 23 Jan 2013 14:06:21 +0800, Liu Bo wrote:
>>>>>> On Wed, Jan 23, 2013 at 12:44:49PM +0800, Miao Xie wrote:
>>>>>>> No, we can't. The other tasks which flush the delalloc data may remove the inode
>>>>>>> from the delalloc list/splice list. If we release the lock, we will meet the race
>>>>>>> between list traversing and list_del().
>>>>>>
>>>>>> OK, then please merge patch 1 and 4 so that we can backport 1 less patch
>>>>>> at least.
>>>>>
>>>>> I don't think we should merge these two patch because they do two different things - one
>>>>> is bug fix, and the other is just a improvement, and this improvement changes the logic
>>>>> of the code and might be argumentative for some developers. So 2 patches is  better than one,
>>>>> I think.
>>>>
>>>> Sorry, this is right only when patch 1 really fixes the problem Alex reported.
>>>>
>>>> But the fact is
>>>>
>>>> 1)  patch 1 is not enough to fix the bug, it just fixes the OOM of
>>>> allocating 'struct btrfs_delalloc_work' while the original OOM of allocating
>>>> requests remains.  We can still get the same inode over and over again
>>>> and then stuck in btrfs_start_delalloc_inodes() because we 'goto again' to
>>>> make sure we flush all inodes listed in fs_info->delalloc_inodes.
>>>>
>>>> 2)  patch 4 fixes 1)'s problems by removing 'goto again'.
>>>>
>>>> Am I missing something?
>>>
>>> In fact, there are two different problems.
>>> One is OOM bug. it is a serious problem and also is an regression, so we should fix it
>>> as soon as possible.
>>> The other one is that we may fetch the same inode again and again if someone write data
>>> into it endlessly. This problem is not so urgent to deal with. and perhaps some developers
>>> think it is not a problem, we should flush that inode since there are dirty pages in it.
>>> So we need split it from the patch of the 1st problem.
>>
>> All right, I'm ok with this.
>>
>> But the TWO different problems are both due to fetching the same inode more
>> than once, and the solutions are indeed same, "fetch every inode on
>> the list just once", and they are in the same code.
>
> I forgot to say that the first problem can happen even though no task writes data into
> the file after we start to flush the delalloc inodes.
>
> Thanks
> Miao
>
>>
>> It is definitely a bug/problem if any users complains about their box
>> getting stuck.  It is KERNEL that should be blamed.
>>
>> thanks,
>> liubo
>>
>


* Re: [PATCH 1/5] Btrfs: fix repeated delalloc work allocation
  2013-01-23 17:02                     ` Alex Lyakas
@ 2013-01-24  2:14                       ` Miao Xie
  2013-01-24 16:20                         ` Alex Lyakas
  0 siblings, 1 reply; 17+ messages in thread
From: Miao Xie @ 2013-01-24  2:14 UTC (permalink / raw)
  To: Alex Lyakas; +Cc: bo.li.liu, Linux Btrfs

On Wed, 23 Jan 2013 19:02:30 +0200, Alex Lyakas wrote:
> Hi Miao,
> I have tested your patch, and the two discussed issues  (OOM and
> handling the same inode more than once) are solved with it.
> 
> However, snap creation under IO still takes 5-10 minutes for me.
> Basically, now the situation is similar to kernel 3.6, before your
> change to push the work to the flush workers. I see that flush worker
> is stuck in one of two stacks like this:
> 
> [<ffffffff81301f1d>] get_request+0x14d/0x330
> [<ffffffff813030a4>] blk_queue_bio+0x84/0x3b0
> [<ffffffff812ff9a7>] generic_make_request.part.55+0x77/0xb0
> [<ffffffff812fff48>] generic_make_request+0x68/0x70
> [<ffffffff812fffcb>] submit_bio+0x7b/0x160
> [<ffffffffa02fe76e>] submit_stripe_bio+0x5e/0x80 [btrfs]
> [<ffffffffa03050db>] btrfs_map_bio+0x1cb/0x420 [btrfs]
> [<ffffffffa02e1b14>] btrfs_submit_bio_hook+0xb4/0x1d0 [btrfs]
> [<ffffffffa02f5b1a>] submit_one_bio+0x6a/0xa0 [btrfs]
> [<ffffffffa02f99c4>] submit_extent_page.isra.38+0xe4/0x200 [btrfs]
> [<ffffffffa02fa83c>] __extent_writepage+0x5dc/0x750 [btrfs]
> [<ffffffffa02fac6a>]
> extent_write_cache_pages.isra.29.constprop.42+0x2ba/0x410 [btrfs]
> [<ffffffffa02fb00e>] extent_writepages+0x4e/0x70 [btrfs]
> [<ffffffffa02e0898>] btrfs_writepages+0x28/0x30 [btrfs]
> [<ffffffff81136510>] do_writepages+0x20/0x40
> [<ffffffff8112c341>] __filemap_fdatawrite_range+0x51/0x60
> [<ffffffff8112cc3c>] filemap_flush+0x1c/0x20
> [<ffffffffa02e416d>] btrfs_run_delalloc_work+0xad/0xe0 [btrfs]
> [<ffffffffa0308c1f>] worker_loop+0x16f/0x5d0 [btrfs]
> [<ffffffff8107ba50>] kthread+0xc0/0xd0
> [<ffffffff8169d66c>] ret_from_fork+0x7c/0xb0
> [<ffffffffffffffff>] 0xffffffffffffffff
> 
> or
> 
> [<ffffffff8112a58e>] sleep_on_page+0xe/0x20
> [<ffffffff8112a577>] __lock_page+0x67/0x70
> [<ffffffffa02fabe9>]
> extent_write_cache_pages.isra.29.constprop.42+0x239/0x410 [btrfs]
> [<ffffffffa02fb00e>] extent_writepages+0x4e/0x70 [btrfs]
> [<ffffffffa02e0898>] btrfs_writepages+0x28/0x30 [btrfs]
> [<ffffffff81136510>] do_writepages+0x20/0x40
> [<ffffffff8112c341>] __filemap_fdatawrite_range+0x51/0x60
> [<ffffffff8112cc3c>] filemap_flush+0x1c/0x20
> [<ffffffffa02e416d>] btrfs_run_delalloc_work+0xad/0xe0 [btrfs]
> [<ffffffffa0308c1f>] worker_loop+0x16f/0x5d0 [btrfs]
> [<ffffffff8107ba50>] kthread+0xc0/0xd0
> [<ffffffff8169d66c>] ret_from_fork+0x7c/0xb0
> [<ffffffffffffffff>] 0xffffffffffffffff
> 
> while btrfs_start_delalloc_inodes() waits for it to complete in the
> commit thread. Also for some reason, I have only one "flush_workers"
> thread, so switching to another thread does not improve for me.

It is because the number of inodes is very small, so the worker doesn't
create new workers to deal with the new work. Maybe we can change
flush_workers->idle_thresh to improve this.
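
Something like the following in open_ctree(), perhaps (an untested
sketch, not part of the patchset; the value 4 is only a guess and would
need tuning):

/* treat the pool as busy sooner, so helper threads are spawned
 * even when only a few inodes are queued */
btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc",
		   fs_info->thread_pool_size,
		   &fs_info->generic_worker);
fs_info->flush_workers.idle_thresh = 4;	/* guessed value */
ret = btrfs_start_workers(&fs_info->flush_workers);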

> Another operation that takes time (up to one minute) is
> btrfs_wait_ordered_extents(), which does similar thing by switching
> the work to the flush worker. In this case, the
> btrfs_start_ordered_extent() is stuck in stacks like follows:
> [<ffffffffa04e6fc5>] btrfs_start_ordered_extent+0x85/0x130 [btrfs]
> [<ffffffffa04e70e1>] btrfs_run_ordered_extent_work+0x71/0xd0 [btrfs]
> [<ffffffffa04facef>] worker_loop+0x16f/0x5d0 [btrfs]
> [<ffffffff8107ba50>] kthread+0xc0/0xd0
> [<ffffffff8169d66c>] ret_from_fork+0x7c/0xb0
> [<ffffffffffffffff>] 0xffffffffffffffff
> where it waits for BTRFS_ORDERED_COMPLETE.
> 
> I have several questions, on how to possibly address this issue:
> 
> - I see that btrfs_flush_all_pending_stuffs() is invoked at least
> twice during each transaction commit. It may be invoked more than
> twice if the do{ } while loop that waits for writes, performs more
> than one iteration. For me, each invocation takes a lot of time
> because of btrfs_start_delalloc_inodes() and
> btrfs_wait_ordered_extents(). Given the fixes you have made (handling
> each inode only once), is it still needed to call these functions
> several times during the same commit?

We call it in the while loop because we don't know how long we need to
wait for the other transaction handles to end, so we flush the delalloc
inodes instead of just waiting.

Maybe we need not call btrfs_wait_ordered_extents() in the while loop,
and can just call btrfs_start_delalloc_inodes() to start the flush.

> - I see that during a commit without pending snapshots, these two
> functions are not invoked at all.
> 	if (flush_on_commit || snap_pending) {
> 		ret = btrfs_start_delalloc_inodes(root, 1);
> 		if (ret)
> 			return ret;
> 		btrfs_wait_ordered_extents(root, 1);
> 	}
> The FLUSHONCOMMIT mount option is normally *not set*, I see in the
> wiki that it is "Not needed with recent kernels, as it's the normal
> mode of operation. "
> Can you pls explain why the delalloc is needed when there is a pending
> snap, but not with a non-snap commit? Or pls point me to the code,
> where I can better understand what delalloc inode is and how it is
> related to FLUSHONCOMMIT or snapshot? Can you pls explain what the
> FLUSHONCOMMIT mount option is about?
> 
> I understand from your explanation that without flushing the delalloc,
> the data in the snap may be different from the subvolume data. But
> this may happen anyways, since the IO to the subvolume is not blocked
> during transaction commit (at least it doesn't look that it's
> blocked).

There are two cases in which the data in the snap may differ from the
subvolume data if we don't flush the delalloc.

1st:
	User0
	write(fd, buf, len)
	create snap

2nd:
	User0				User1
	create snap
	  commit_transaction
					write(fd, buf, len)
	    create_pending_snapshot

In the 1st case, the user doesn't expect the data to differ between the
source subvolume and the snapshot.

In the 2nd case, it doesn't matter.

So in order to avoid the 1st case, we need to flush the delalloc before
snapshot creation.

> - Is there something that user-space can do to avoid flushing the
> delalloc during snap-commit? For example, if the user-space stops the
> IO and does a normal commit, this will not call
> btrfs_start_delalloc_inodes(), but this should ensure that *some* data
> is safe on disk. Then the user-space triggers snap creation (which is
> another commit), and then resume the IO? Because without the ongoing
> IO, snap creation is very fast.

We can sync the fs before creating the snapshot; in this way, we needn't
flush the delalloc while committing the transaction. But I don't think
it is a good way, because for users the page cache is transparent.
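
If user-space wants to do it anyway, a minimal sketch could look like
this (assuming the usual ioctl definitions from linux/btrfs.h or
btrfs-progs' ioctl.h; syncfs() flushes and waits outside the commit, so
the snap-commit itself sees almost no delalloc; error handling trimmed):

#define _GNU_SOURCE
#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/btrfs.h>	/* BTRFS_IOC_SNAP_CREATE_V2 */

int snap_after_sync(const char *subvol, const char *destdir,
		    const char *name)
{
	struct btrfs_ioctl_vol_args_v2 args;
	int src = open(subvol, O_RDONLY);
	int dst = open(destdir, O_RDONLY);
	int ret;

	syncfs(src);	/* flush the delalloc before the snap-commit */

	memset(&args, 0, sizeof(args));
	args.fd = src;
	strncpy(args.name, name, BTRFS_SUBVOL_NAME_MAX);
	ret = ioctl(dst, BTRFS_IOC_SNAP_CREATE_V2, &args);

	close(src);
	close(dst);
	return ret;
}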

I have an idea to improve this (a rough sketch follows the list):
- introduce a per-subvolume delalloc inode list
- flush the delalloc inode list of the source subvolume in create_snapshot(),
  not in commit_transaction(). In this way, we flush just once.
- don't do the delalloc flush in commit_transaction() if there are pending snapshots
- if FLUSHONCOMMIT is set, just call btrfs_start_delalloc_inodes() before the while loop,
  and call btrfs_wait_ordered_extents() after the while loop.
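
An untested sketch of the per-subvolume flush (the per-root lock and
list are hypothetical names; the loop mirrors the splice logic of
patch 1, with delay_iput handling omitted for brevity):

int btrfs_flush_root_delalloc(struct btrfs_root *root)
{
	struct btrfs_inode *binode;
	struct inode *inode;
	LIST_HEAD(splice);

	spin_lock(&root->delalloc_lock);	/* new per-root lock */
	list_splice_init(&root->delalloc_inodes, &splice);
	while (!list_empty(&splice)) {
		binode = list_entry(splice.next, struct btrfs_inode,
				    delalloc_inodes);
		list_del_init(&binode->delalloc_inodes);
		inode = igrab(&binode->vfs_inode);
		if (!inode)
			continue;
		list_add_tail(&binode->delalloc_inodes,
			      &root->delalloc_inodes);
		spin_unlock(&root->delalloc_lock);

		filemap_flush(inode->i_mapping);	/* start writeback */
		iput(inode);

		spin_lock(&root->delalloc_lock);
	}
	spin_unlock(&root->delalloc_lock);
	return 0;
}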

What do you think about this idea?

Thanks
Miao




* Re: [PATCH 1/5] Btrfs: fix repeated delalloc work allocation
  2013-01-24  2:14                       ` Miao Xie
@ 2013-01-24 16:20                         ` Alex Lyakas
  2013-01-25  6:09                           ` Miao Xie
  0 siblings, 1 reply; 17+ messages in thread
From: Alex Lyakas @ 2013-01-24 16:20 UTC (permalink / raw)
  To: miaox; +Cc: bo.li.liu, Linux Btrfs

Hi Miao,

On Thu, Jan 24, 2013 at 4:14 AM, Miao Xie <miaox@cn.fujitsu.com> wrote:
> On wed, 23 Jan 2013 19:02:30 +0200, Alex Lyakas wrote:
>> Hi Miao,
>> I have tested your patch, and the two discussed issues  (OOM and
>> handling the same inode more than once) are solved with it.
>>
>> However, snap creation under IO still takes 5-10 minutes for me.
>> Basically, now the situation is similar to kernel 3.6, before your
>> change to push the work to the flush workers. I see that flush worker
>> is stuck in one of two stacks like this:
>>
>> [<ffffffff81301f1d>] get_request+0x14d/0x330
>> [<ffffffff813030a4>] blk_queue_bio+0x84/0x3b0
>> [<ffffffff812ff9a7>] generic_make_request.part.55+0x77/0xb0
>> [<ffffffff812fff48>] generic_make_request+0x68/0x70
>> [<ffffffff812fffcb>] submit_bio+0x7b/0x160
>> [<ffffffffa02fe76e>] submit_stripe_bio+0x5e/0x80 [btrfs]
>> [<ffffffffa03050db>] btrfs_map_bio+0x1cb/0x420 [btrfs]
>> [<ffffffffa02e1b14>] btrfs_submit_bio_hook+0xb4/0x1d0 [btrfs]
>> [<ffffffffa02f5b1a>] submit_one_bio+0x6a/0xa0 [btrfs]
>> [<ffffffffa02f99c4>] submit_extent_page.isra.38+0xe4/0x200 [btrfs]
>> [<ffffffffa02fa83c>] __extent_writepage+0x5dc/0x750 [btrfs]
>> [<ffffffffa02fac6a>]
>> extent_write_cache_pages.isra.29.constprop.42+0x2ba/0x410 [btrfs]
>> [<ffffffffa02fb00e>] extent_writepages+0x4e/0x70 [btrfs]
>> [<ffffffffa02e0898>] btrfs_writepages+0x28/0x30 [btrfs]
>> [<ffffffff81136510>] do_writepages+0x20/0x40
>> [<ffffffff8112c341>] __filemap_fdatawrite_range+0x51/0x60
>> [<ffffffff8112cc3c>] filemap_flush+0x1c/0x20
>> [<ffffffffa02e416d>] btrfs_run_delalloc_work+0xad/0xe0 [btrfs]
>> [<ffffffffa0308c1f>] worker_loop+0x16f/0x5d0 [btrfs]
>> [<ffffffff8107ba50>] kthread+0xc0/0xd0
>> [<ffffffff8169d66c>] ret_from_fork+0x7c/0xb0
>> [<ffffffffffffffff>] 0xffffffffffffffff
>>
>> or
>>
>> [<ffffffff8112a58e>] sleep_on_page+0xe/0x20
>> [<ffffffff8112a577>] __lock_page+0x67/0x70
>> [<ffffffffa02fabe9>]
>> extent_write_cache_pages.isra.29.constprop.42+0x239/0x410 [btrfs]
>> [<ffffffffa02fb00e>] extent_writepages+0x4e/0x70 [btrfs]
>> [<ffffffffa02e0898>] btrfs_writepages+0x28/0x30 [btrfs]
>> [<ffffffff81136510>] do_writepages+0x20/0x40
>> [<ffffffff8112c341>] __filemap_fdatawrite_range+0x51/0x60
>> [<ffffffff8112cc3c>] filemap_flush+0x1c/0x20
>> [<ffffffffa02e416d>] btrfs_run_delalloc_work+0xad/0xe0 [btrfs]
>> [<ffffffffa0308c1f>] worker_loop+0x16f/0x5d0 [btrfs]
>> [<ffffffff8107ba50>] kthread+0xc0/0xd0
>> [<ffffffff8169d66c>] ret_from_fork+0x7c/0xb0
>> [<ffffffffffffffff>] 0xffffffffffffffff
>>
>> while btrfs_start_delalloc_inodes() waits for it to complete in the
>> commit thread. Also for some reason, I have only one "flush_workers"
>> thread, so switching to another thread does not improve for me.
>
> it is because the number of the inodes is very few, so the worker don't
> create new workers to deal with the new works. Maybe we can change
> flush_workers->idle_thresh to improve this problem.
>
>> Another operation that takes time (up to one minute) is
>> btrfs_wait_ordered_extents(), which does similar thing by switching
>> the work to the flush worker. In this case, the
>> btrfs_start_ordered_extent() is stuck in stacks like follows:
>> [<ffffffffa04e6fc5>] btrfs_start_ordered_extent+0x85/0x130 [btrfs]
>> [<ffffffffa04e70e1>] btrfs_run_ordered_extent_work+0x71/0xd0 [btrfs]
>> [<ffffffffa04facef>] worker_loop+0x16f/0x5d0 [btrfs]
>> [<ffffffff8107ba50>] kthread+0xc0/0xd0
>> [<ffffffff8169d66c>] ret_from_fork+0x7c/0xb0
>> [<ffffffffffffffff>] 0xffffffffffffffff
>> where it waits for BTRFS_ORDERED_COMPLETE.
>>
>> I have several questions, on how to possibly address this issue:
>>
>> - I see that btrfs_flush_all_pending_stuffs() is invoked at least
>> twice during each transaction commit. It may be invoked more than
>> twice if the do{ } while loop that waits for writes, performs more
>> than one iteration. For me, each invocation takes a lot of time
>> because of btrfs_start_delalloc_inodes() and
>> btrfs_wait_ordered_extents(). Given the fixes you have made (handling
>> each inode only once), is it still needed to call these functions
>> several times during the same commit?
>
> Calling it in the while loop is because we don't know how much time we need
> to wait for the end of the other transaction handle, so we flush the delalloc
> inodes instead of just waiting.
>
> Maybe we need not call btrfs_wait_ordered_extents() in the while loop, just
> call btrfs_start_delalloc_inodes() to start flush.
>
>> - I see that during a commit without pending snapshots, these two
>> functions are not invoked at all.
>>       if (flush_on_commit || snap_pending) {
>>               ret = btrfs_start_delalloc_inodes(root, 1);
>>               if (ret)
>>                       return ret;
>>               btrfs_wait_ordered_extents(root, 1);
>>       }
>> The FLUSHONCOMMIT mount option is normally *not set*, I see in the
>> wiki that it is "Not needed with recent kernels, as it's the normal
>> mode of operation. "
>> Can you pls explain why the delalloc is needed when there is a pending
>> snap, but not with a non-snap commit? Or pls point me to the code,
>> where I can better understand what delalloc inode is and how it is
>> related to FLUSHONCOMMIT or snapshot? Can you pls explain what the
>> FLUSHONCOMMIT mount option is about?
>>
>> I understand from your explanation that without flushing the delalloc,
>> the data in the snap may be different from the subvolume data. But
>> this may happen anyways, since the IO to the subvolume is not blocked
>> during transaction commit (at least it doesn't look that it's
>> blocked).
>
> There are two cases that the data in the snap may be different from the
> subvolume data without flushing the delalloc.
>
> 1st:
>         User0
>         write(fd, buf, len)
>         create snap
>
> 2nd:
>         User0                           User1
>         create snap
>           commit_transaction
>                                         write(fd, buf, len)
>             create_pending_snapshot
>
> For the 1st case, the user doesn't expect the data is different between source
> subvolume and the snapshot.
>
> For the 2nd case, it doesn't matter.
>
> So in order to avoid the 1st case, we need flush the delalloc before snapshot
> creation.
Thanks for clarifying, Miao.

>
>> - Is there something that user-space can do to avoid flushing the
>> delalloc during snap-commit? For example, if the user-space stops the
>> IO and does a normal commit, this will not call
>> btrfs_start_delalloc_inodes(), but this should ensure that *some* data
>> is safe on disk. Then the user-space triggers snap creation (which is
>> another commit), and then resume the IO? Because without the ongoing
>> IO, snap creation is very fast.
>
> We can sync the fs before creating snapshot, in this way, we needn't flush
> the delalloc while committing the transaction. But I don't think it is good
> way. Because for the users, the page cache is transparent.

I was always under the impression that in Linux you must be aware of the
page cache. For example, if a C program writes to a file, it is also
responsible for fsync()ing the file (and checking the return value), to
make sure that the data has been persisted. However, for snap creation,
it is perhaps better to do this automatically for the user.

>
> I have a idea to improve this problem:
I think the idea below is very good:

> - introduce per-subvolume delalloc inode list
Perfect.

> - flush the delalloc inode list of the source subvolume in create_snapshot(),
>   not flush it in commit_transaction(). In this way, we just flush once.
Perfect.

> - don't do the delalloc flush in commit_transaction() if there are pending snapshots
Perfect.

> - If FLUSHONCOMMIT is set, just call btrfs_start_delalloc_inodes() before while loop,
>   and call btrfs_wait_ordered_extents() after the while loop.
In case FLUSHONCOMMIT is not set but there are pending snapshots, is
it still necessary to call btrfs_wait_ordered_extents(), since we have
already started the delalloc IO in create_snapshot()?


I have run additional tests with your patchset. It definitely fixes
the two bugs. However, I have a question: if the IO to the filesystem
continues during the extent_write_cache_pages() loop, can it add more
dirty pages to the flush this function is doing? Basically, I am
looking for some way to separate "old" delallocs that must be flushed
from "new" delallocs that were added after we started committing.

For example, with your current patches, extent_write_cache_pages()
is called at least twice per inode (I know that your new idea will fix
this). In my case, the first call submitted 22671 pages (the inode is
10GB) and the second call submitted 33705 pages. Between those two
calls there were 6531 pages that were submitted twice. I noticed that
if I stop the IO and then immediately trigger snap creation, far fewer
pages are submitted.

I played with the freeze_super() API, which also syncs the filesystem,
so the delalloc flush is not needed in this case, like you mentioned.
However, the snap creation flow also calls mnt_want_write_file(),
which blocks if the FS is frozen.
Does it make sense to have some kind of light-freeze functionality,
which would only block new writers (and possibly wait for in-flight
writer threads to complete), and would skip the sync_filesystem part,
doing only the SB_FREEZE_WRITE part?

Thanks for your help,
Alex.



* Re: [PATCH 1/5] Btrfs: fix repeated delalloc work allocation
  2013-01-24 16:20                         ` Alex Lyakas
@ 2013-01-25  6:09                           ` Miao Xie
  2013-01-27 10:59                             ` Alex Lyakas
  0 siblings, 1 reply; 17+ messages in thread
From: Miao Xie @ 2013-01-25  6:09 UTC (permalink / raw)
  To: Alex Lyakas; +Cc: bo.li.liu, Linux Btrfs

On Thu, 24 Jan 2013 18:20:06 +0200, Alex Lyakas wrote:
>>> - Is there something that user-space can do to avoid flushing the
>>> delalloc during snap-commit? For example, if the user-space stops the
>>> IO and does a normal commit, this will not call
>>> btrfs_start_delalloc_inodes(), but this should ensure that *some* data
>>> is safe on disk. Then the user-space triggers snap creation (which is
>>> another commit), and then resume the IO? Because without the ongoing
>>> IO, snap creation is very fast.
>>
>> We can sync the fs before creating snapshot, in this way, we needn't flush
>> the delalloc while committing the transaction. But I don't think it is good
>> way. Because for the users, the page cache is transparent.
> 
> I was always under impression that in Linux you must be aware of the
> page cache. For exampe, if a C program writes to a file, it is also
> responsible to fsync() the file (and check return value), to make sure
> that data has been persisted. However, for snap creation, it is
> perhaps better to do this automatically for the user.
> 
>>
>> I have a idea to improve this problem:
> I think the below idea is very good:
> 
>> - introduce per-subvolume delalloc inode list
> Perfect.
> 
>> - flush the delalloc inode list of the source subvolume in create_snapshot(),
>>   not flush it in commit_transaction(). In this way, we just flush once.
> Perfect.
> 
>> - don't do the delalloc flush in commit_transaction() if there are pending snapshots
> Perfect.
> 
>> - If FLUSHONCOMMIT is set, just call btrfs_start_delalloc_inodes() before while loop,
>>   and call btrfs_wait_ordered_extents() after the while loop.
> In case FLUSHONCOMMIT is not set, but there are pending snapshots, is
> it also needed to call btrfs_wait_ordered_extents(), since we have
> started the delalloc IO in create_snapshot()?

Yes, because FLUSHONCOMMIT will flush all delalloc inodes, including those
of the source subvolumes and of the others which are not being snapshotted.

> I have run additional tests with your patchset. It definitely improves
> the two bugs. However, I have a question: if the IO to the filesystem
> continues during extent_write_cache_pages() loop, can it add more
> dirty pages to the flush this function is doing? Basically, I am
> looking for some way to separate "old" delallocs that must be flushed
> vs "new" delallocs that were added after we started committing.

We cannot add dirty pages to the extents which are under disk IO, but
we can add dirty pages to the other extents which belong to the same
file but are not under disk IO.
(Some developers have discussed in the past whether we can dirty an
extent that is under disk IO; this is the "unstable page" problem.
Their approach is to allocate a new page to store the new data. But
other developers disagree with this idea because disks become faster
and faster, so it is unnecessary, and if free memory is not enough, we
still need to wait for the IO.)

I think flushing the delalloc inodes at the beginning of
btrfs_commit_transaction() is a simple way to separate "old" delallocs
from "new" delallocs.
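
In other words, something like this at the top of the commit (a sketch
only, with a hypothetical helper name; the splice inside
btrfs_start_delalloc_inodes() is what draws the old/new line):

static int flush_old_delalloc(struct btrfs_root *root)
{
	int ret;

	/* inodes on the delalloc list *now* are the "old" ones; the
	 * splice empties the list, so pages dirtied later re-add their
	 * inodes and naturally become "new" delalloc */
	ret = btrfs_start_delalloc_inodes(root, 1);
	if (ret)
		return ret;
	btrfs_wait_ordered_extents(root, 1);
	return 0;
}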

> For example, with your current patches, the extent_write_cache_pages()
> is called at least twice per inode (I know that your new idea will fix
> it). In my case, the first call submitted 22671 pages (inode is 10Gb)
> and the second call submitted 33705 pages. Between those two calls
> there were 6531 pages that were submitted twice. I noticed that if I
> stop the IO and then immediately trigger snap creation, much less
> pages are submitted.
> 
> I played with the freeze_super() API, which also syncs the filesystem,
> so delalloc flush is not needed in this case, like you mentioned.
> However, the snap creation flow also calls mnt_want_write_file(),
> which blocks if the FS is freezed.
> Does it make sense to have some kind of a light-freeze functionality,
> which would only block the new writers (and possibly wait for
> in-flight writer threads to complete), but will not do the
> sync_filesystem part, but only the SB_FREEZE_WRITE part?

I have implemented light-freeze functionality for the R/W->R/O change
of a subvolume (the patch will be sent out later). But I don't think it
is necessary to introduce it into snapshot creation.

Thanks
Miao


* Re: [PATCH 1/5] Btrfs: fix repeated delalloc work allocation
  2013-01-25  6:09                           ` Miao Xie
@ 2013-01-27 10:59                             ` Alex Lyakas
  0 siblings, 0 replies; 17+ messages in thread
From: Alex Lyakas @ 2013-01-27 10:59 UTC (permalink / raw)
  To: miaox; +Cc: bo.li.liu, Linux Btrfs

Hi Miao,
thank you for your comments.

After some additional testing, few more observations:

# I see that only the OOM-fix patch was included in the "for-linus"
pull request, but the rest of the patches are in btrfs-next. I hope
they will also make it into "for-linus" soon.

# For now, I have commented out the call to
btrfs_flush_all_pending_stuffs() in the do { } while loop of
btrfs_commit_transaction(). For me it improves things, by taking fewer
iterations of the do { } while loop and flushing the delalloc only once.
Of course, your future per-subvolume delalloc-flush fix is the better
thing to have.
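
Roughly this (against 3.8-ish transaction.c, from memory, so take it as
a sketch; the flush call that runs after the loop is left untouched):

	do {
		joined = cur_trans->num_joined;
		WARN_ON(cur_trans != trans->transaction);

		/* commented out: the repeated flush while waiting
		 *
		 * ret = btrfs_flush_all_pending_stuffs(trans, root);
		 * if (ret)
		 *	goto cleanup_transaction;
		 */

		prepare_to_wait(&cur_trans->writer_wait, &wait,
				TASK_UNINTERRUPTIBLE);

		if (atomic_read(&cur_trans->num_writers) > 1)
			schedule_timeout(MAX_SCHEDULE_TIMEOUT);
		else if (should_grow)
			schedule_timeout(1);

		finish_wait(&cur_trans->writer_wait, &wait);
	} while (atomic_read(&cur_trans->num_writers) > 1 ||
		 (should_grow && cur_trans->num_joined != joined));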

# I have tried to implement a quick-and-dirty light-freeze
functionality by simply setting sb->s_writers.frozen =
SB_FREEZE_WRITE, thus making new callers of btrfs_file_aio_write()
(and others) block. It improves the delalloc time very significantly.
On the other hand, it blocks the host IO (for me up to 20s sometimes),
which may not be acceptable. I am looking forward to seeing your
light-freeze patch.
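
The hack is approximately this (a sketch, not safe as-is: there is no
protection against concurrent freeze/remount, and sb_wait_write() is
static in fs/super.c, so it would need exporting or open-coding):

static void light_freeze(struct super_block *sb)
{
	sb->s_writers.frozen = SB_FREEZE_WRITE;	/* sb_start_write() now blocks */
	smp_wmb();
	/* drain writers already past sb_start_write() */
	sb_wait_write(sb, SB_FREEZE_WRITE);
}

static void light_thaw(struct super_block *sb)
{
	sb->s_writers.frozen = SB_UNFROZEN;
	smp_wmb();
	wake_up(&sb->s_writers.wait_unfrozen);
}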

# I have increased the block device request queue depth
(/sys/block/<name>/queue/nr_requests). It also helps a little:
extent_write_cache_pages() now completes immediately, because it does
not have to wait for free requests. The waiting is then pushed to
btrfs_wait_ordered_extents().
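
(What I did by hand via sysfs, expressed as a tiny helper; the device
name and the value 4096 are just from my test setup:)

#include <stdio.h>

static int set_nr_requests(const char *disk, unsigned int n)
{
	char path[96];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/block/%s/queue/nr_requests", disk);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%u\n", n);	/* default is 128 */
	return fclose(f);
}

/* e.g. set_nr_requests("sdb", 4096); */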

Thanks again for your help,
Alex.




Thread overview: 17+ messages
2013-01-22 10:49 [PATCH 1/5] Btrfs: fix repeated delalloc work allocation Miao Xie
2013-01-22 14:24 ` Liu Bo
2013-01-23  2:54   ` Miao Xie
2013-01-23  3:56     ` Liu Bo
2013-01-23  4:44       ` Miao Xie
2013-01-23  6:06         ` Liu Bo
2013-01-23  6:33           ` Miao Xie
2013-01-23  8:17             ` Liu Bo
2013-01-23  8:58               ` Miao Xie
2013-01-23  9:21                 ` Alex Lyakas
2013-01-23  9:52                 ` Liu Bo
2013-01-23 10:20                   ` Miao Xie
2013-01-23 17:02                     ` Alex Lyakas
2013-01-24  2:14                       ` Miao Xie
2013-01-24 16:20                         ` Alex Lyakas
2013-01-25  6:09                           ` Miao Xie
2013-01-27 10:59                             ` Alex Lyakas
