All of lore.kernel.org
 help / color / mirror / Atom feed
* [Ocfs2-devel] [PATCH v2] ocfs2: try to reuse extent block in dealloc without meta_alloc
@ 2017-12-26  7:55 Changwei Ge
  2017-12-27  8:47 ` piaojun
                   ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Changwei Ge @ 2017-12-26  7:55 UTC (permalink / raw)
  To: ocfs2-devel

A crash issue was reported by John.
The call trace follows:
ocfs2_split_extent+0x1ad3/0x1b40 [ocfs2]
ocfs2_change_extent_flag+0x33a/0x470 [ocfs2]
ocfs2_mark_extent_written+0x172/0x220 [ocfs2]
ocfs2_dio_end_io+0x62d/0x910 [ocfs2]
dio_complete+0x19a/0x1a0
do_blockdev_direct_IO+0x19dd/0x1eb0
__blockdev_direct_IO+0x43/0x50
ocfs2_direct_IO+0x8f/0xa0 [ocfs2]
generic_file_direct_write+0xb2/0x170
__generic_file_write_iter+0xc3/0x1b0
ocfs2_file_write_iter+0x4bb/0xca0 [ocfs2]
__vfs_write+0xae/0xf0
vfs_write+0xb8/0x1b0
SyS_write+0x4f/0xb0
system_call_fastpath+0x16/0x75

The BUG code told that extent tree wants to grow but no metadata
was reserved ahead of time.
 From my investigation into this issue, the root cause it that although
enough metadata is not reserved, there should be enough for following use.
Rightmost extent is merged into its left one due to a certain times of
marking extent written. Because during marking extent written, we got many
physically continuous extents. At last, an empty extent showed up and the
rightmost path is removed from extent tree.

Add a new mechanism to reuse extent block cached in dealloc which were
just unlinked from extent tree to solve this crash issue.

Criteria is that during marking extents *written*, if extent rotation
and merging results in unlinking extent with growing extent tree later
without any metadata reserved ahead of time, try to reuse those extents
in dealloc in which deleted extents are cached.

Also, this patch addresses the issue John reported that ::dw_zero_count is
not calculated properly.

After applying this patch, the issue John reported was gone.
Thanks for the reproducer provided by John.
And this patch has passed ocfs2-test(29 cases) suite running by New H3C Group.

Reported-by: John Lightsey <john@nixnuts.net>
Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
Reviewed-by: Duan Zhang <zhang.duan@h3c.com>
---
  fs/ocfs2/alloc.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
  fs/ocfs2/alloc.h |   1 +
  fs/ocfs2/aops.c  |  14 ++++--
  3 files changed, 145 insertions(+), 10 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index ab5105f..56aba96 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -165,6 +165,13 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
  				     struct ocfs2_extent_rec *rec);
  static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
  static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
+
+static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
+					struct ocfs2_extent_tree *et,
+					struct buffer_head **new_eb_bh,
+					int blk_cnt);
+static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et);
+
  static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
  	.eo_set_last_eb_blk	= ocfs2_dinode_set_last_eb_blk,
  	.eo_get_last_eb_blk	= ocfs2_dinode_get_last_eb_blk,
@@ -448,6 +455,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
  	if (!obj)
  		obj = (void *)bh->b_data;
  	et->et_object = obj;
+	et->et_dealloc = NULL;
  
  	et->et_ops->eo_fill_root_el(et);
  	if (!et->et_ops->eo_fill_max_leaf_clusters)
@@ -1213,8 +1221,15 @@ static int ocfs2_add_branch(handle_t *handle,
  		goto bail;
  	}
  
-	status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
-					   meta_ac, new_eb_bhs);
+	if (meta_ac) {
+		status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
+						   meta_ac, new_eb_bhs);
+	} else if (!ocfs2_is_dealloc_empty(et)) {
+		status = ocfs2_reuse_blk_from_dealloc(handle, et,
+						      new_eb_bhs, new_blocks);
+	} else {
+		BUG();
+	}
  	if (status < 0) {
  		mlog_errno(status);
  		goto bail;
@@ -1347,8 +1362,15 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
  	struct ocfs2_extent_list  *root_el;
  	struct ocfs2_extent_list  *eb_el;
  
-	status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
-					   &new_eb_bh);
+	if (meta_ac) {
+		status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
+						   &new_eb_bh);
+	} else if (!ocfs2_is_dealloc_empty(et)) {
+		status = ocfs2_reuse_blk_from_dealloc(handle, et,
+						      &new_eb_bh, 1);
+	} else {
+		BUG();
+	}
  	if (status < 0) {
  		mlog_errno(status);
  		goto bail;
@@ -1511,7 +1533,7 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
  	int depth = le16_to_cpu(el->l_tree_depth);
  	struct buffer_head *bh = NULL;
  
-	BUG_ON(meta_ac == NULL);
+	BUG_ON(meta_ac == NULL && ocfs2_is_dealloc_empty(et));
  
  	shift = ocfs2_find_branch_target(et, &bh);
  	if (shift < 0) {
@@ -6562,7 +6584,7 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb,
  static struct ocfs2_per_slot_free_list *
  ocfs2_find_per_slot_free_list(int type,
  			      int slot,
-			      struct ocfs2_cached_dealloc_ctxt *ctxt)
+			      struct ocfs2_cached_dealloc_ctxt *ctxt, int new)
  {
  	struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
  
@@ -6573,6 +6595,10 @@ ocfs2_find_per_slot_free_list(int type,
  		fl = fl->f_next_suballocator;
  	}
  
+	/* we didn't find any, just return NULL if _new_is not set. */
+	if (!new)
+		return NULL;
+
  	fl = kmalloc(sizeof(*fl), GFP_NOFS);
  	if (fl) {
  		fl->f_inode_type = type;
@@ -6585,6 +6611,106 @@ ocfs2_find_per_slot_free_list(int type,
  	return fl;
  }
  
+/* Return Value 1 indicates empty */
+static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_per_slot_free_list *fl;
+	struct ocfs2_super *osb =
+		OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
+
+	if (!et->et_dealloc)
+		return 1;
+
+	fl = ocfs2_find_per_slot_free_list(EXTENT_ALLOC_SYSTEM_INODE,
+					   osb->slot_num, et->et_dealloc, 0);
+
+	return fl ? 0 : 1;
+}
+
+/* If extent was deleted from tree due to extent rotation and merging, and
+ * no metadata is reserved ahead of time. Try to reuse some extents
+ * just deleted. This is only used to reuse extent blocks.
+ * It is supposed to find enough extent blocks in dealloc if our estimation
+ * on metadata is accurate.
+ */
+static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
+					struct ocfs2_extent_tree *et,
+					struct buffer_head **new_eb_bh,
+					int blk_cnt)
+{
+	int i, status = -ENOSPC;
+	struct ocfs2_cached_dealloc_ctxt *dealloc;
+	struct ocfs2_per_slot_free_list *fl;
+	struct ocfs2_cached_block_free *bf;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_super *osb =
+		OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
+
+	dealloc = et->et_dealloc;
+	if (!dealloc)
+		goto bail;
+
+	for (i = 0; i < blk_cnt; i++) {
+		fl = ocfs2_find_per_slot_free_list(EXTENT_ALLOC_SYSTEM_INODE,
+						   osb->slot_num, dealloc, 0);
+		/* The caller should guarantee that dealloc has cached some
+		 * extent blocks.
+		 */
+		BUG_ON(!fl);
+
+		bf = fl->f_first;
+		fl->f_first = bf->free_next;
+
+		new_eb_bh[i] = sb_getblk(osb->sb, bf->free_blk);
+		if (new_eb_bh[i] == NULL) {
+			status = -ENOMEM;
+			mlog_errno(status);
+			goto bail;
+		}
+
+		mlog(0, "Reusing block(%llu) from dealloc\n", bf->free_blk);
+
+		ocfs2_set_new_buffer_uptodate(et->et_ci, new_eb_bh[i]);
+
+		status = ocfs2_journal_access_eb(handle, et->et_ci,
+						 new_eb_bh[i],
+						 OCFS2_JOURNAL_ACCESS_CREATE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		memset(new_eb_bh[i]->b_data, 0, osb->sb->s_blocksize);
+		eb = (struct ocfs2_extent_block *) new_eb_bh[i]->b_data;
+
+		/* We can't guarantee that buffer head is still cached, so
+		 * polutlate the extent block again.
+		 */
+		strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
+		eb->h_blkno = cpu_to_le64(bf->free_blk);
+		eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
+		eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
+		eb->h_suballoc_loc = cpu_to_le64(bf->free_bg);
+		eb->h_suballoc_bit = cpu_to_le16(bf->free_bit);
+		eb->h_list.l_count =
+			cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
+
+		/* We'll also be dirtied by the caller, so
+		 * this isn't absolutely necessary.
+		 */
+		ocfs2_journal_dirty(handle, new_eb_bh[i]);
+
+		if (!fl->f_first) {
+			dealloc->c_first_suballocator = fl->f_next_suballocator;
+			kfree(fl);
+		}
+		kfree(bf);
+	}
+
+bail:
+	return status;
+}
+
  int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
  			      int type, int slot, u64 suballoc,
  			      u64 blkno, unsigned int bit)
@@ -6593,7 +6719,7 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
  	struct ocfs2_per_slot_free_list *fl;
  	struct ocfs2_cached_block_free *item;
  
-	fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
+	fl = ocfs2_find_per_slot_free_list(type, slot, ctxt, 1);
  	if (fl == NULL) {
  		ret = -ENOMEM;
  		mlog_errno(ret);
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 27b75cf..250bcac 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -61,6 +61,7 @@ struct ocfs2_extent_tree {
  	ocfs2_journal_access_func		et_root_journal_access;
  	void					*et_object;
  	unsigned int				et_max_leaf_clusters;
+	struct ocfs2_cached_dealloc_ctxt	*et_dealloc;
  };
  
  /*
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index d151632..1bbd5c8 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -797,6 +797,7 @@ struct ocfs2_write_ctxt {
  	struct ocfs2_cached_dealloc_ctxt w_dealloc;
  
  	struct list_head		w_unwritten_list;
+	unsigned int			w_unwritten_count;
  };
  
  void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
@@ -1386,6 +1387,7 @@ static int ocfs2_unwritten_check(struct inode *inode,
  	desc->c_clear_unwritten = 0;
  	list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list);
  	list_add_tail(&new->ue_node, &wc->w_unwritten_list);
+	wc->w_unwritten_count++;
  	new = NULL;
  unlock:
  	spin_unlock(&oi->ip_lock);
@@ -1762,8 +1764,8 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
  		 */
  		ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
  					      wc->w_di_bh);
-		ret = ocfs2_lock_allocators(inode, &et,
-					    clusters_to_alloc, extents_to_split,
+		ret = ocfs2_lock_allocators(inode, &et, clusters_to_alloc,
+					    2*extents_to_split,
  					    &data_ac, &meta_ac);
  		if (ret) {
  			mlog_errno(ret);
@@ -2256,7 +2258,7 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
  		ue->ue_phys = desc->c_phys;
  
  		list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list);
-		dwc->dw_zero_count++;
+		dwc->dw_zero_count += wc->w_unwritten_count;
  	}
  
  	ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, wc);
@@ -2330,6 +2332,12 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
  
  	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
  
+	/* Attach dealloc with extent tree in case that we may reuse extents
+	 * which are already unlinked from current extent tree due to extent
+	 * rotation and merging.
+	 */
+	et.et_dealloc = &dealloc;
+
  	ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
  				    &data_ac, &meta_ac);
  	if (ret) {
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [Ocfs2-devel] [PATCH v2] ocfs2: try to reuse extent block in dealloc without meta_alloc
  2017-12-26  7:55 [Ocfs2-devel] [PATCH v2] ocfs2: try to reuse extent block in dealloc without meta_alloc Changwei Ge
@ 2017-12-27  8:47 ` piaojun
  2017-12-28  6:30   ` Changwei Ge
  2017-12-27 10:01 ` Junxiao Bi
  2017-12-28  6:21 ` alex chen
  2 siblings, 1 reply; 13+ messages in thread
From: piaojun @ 2017-12-27  8:47 UTC (permalink / raw)
  To: ocfs2-devel

Hi Changwei,

On 2017/12/26 15:55, Changwei Ge wrote:
> A crash issue was reported by John.
> The call trace follows:
> ocfs2_split_extent+0x1ad3/0x1b40 [ocfs2]
> ocfs2_change_extent_flag+0x33a/0x470 [ocfs2]
> ocfs2_mark_extent_written+0x172/0x220 [ocfs2]
> ocfs2_dio_end_io+0x62d/0x910 [ocfs2]
> dio_complete+0x19a/0x1a0
> do_blockdev_direct_IO+0x19dd/0x1eb0
> __blockdev_direct_IO+0x43/0x50
> ocfs2_direct_IO+0x8f/0xa0 [ocfs2]
> generic_file_direct_write+0xb2/0x170
> __generic_file_write_iter+0xc3/0x1b0
> ocfs2_file_write_iter+0x4bb/0xca0 [ocfs2]
> __vfs_write+0xae/0xf0
> vfs_write+0xb8/0x1b0
> SyS_write+0x4f/0xb0
> system_call_fastpath+0x16/0x75
> 
> The BUG code told that extent tree wants to grow but no metadata
> was reserved ahead of time.
>  From my investigation into this issue, the root cause it that although
> enough metadata is not reserved, there should be enough for following use.
> Rightmost extent is merged into its left one due to a certain times of
> marking extent written. Because during marking extent written, we got many
> physically continuous extents. At last, an empty extent showed up and the
> rightmost path is removed from extent tree.
> 
> Add a new mechanism to reuse extent block cached in dealloc which were
> just unlinked from extent tree to solve this crash issue.
> 
> Criteria is that during marking extents *written*, if extent rotation
> and merging results in unlinking extent with growing extent tree later
> without any metadata reserved ahead of time, try to reuse those extents
> in dealloc in which deleted extents are cached.
> 
> Also, this patch addresses the issue John reported that ::dw_zero_count is
> not calculated properly.

Does you patch solve two different problems? If so, I suggest split it
in two patch.

> 
> After applying this patch, the issue John reported was gone.
> Thanks for the reproducer provided by John.
> And this patch has passed ocfs2-test(29 cases) suite running by New H3C Group.
> 
> Reported-by: John Lightsey <john@nixnuts.net>
> Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
> Reviewed-by: Duan Zhang <zhang.duan@h3c.com>
> ---
>   fs/ocfs2/alloc.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
>   fs/ocfs2/alloc.h |   1 +
>   fs/ocfs2/aops.c  |  14 ++++--
>   3 files changed, 145 insertions(+), 10 deletions(-)
> 
> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
> index ab5105f..56aba96 100644
> --- a/fs/ocfs2/alloc.c
> +++ b/fs/ocfs2/alloc.c
> @@ -165,6 +165,13 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
>   				     struct ocfs2_extent_rec *rec);
>   static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
>   static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
> +
> +static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
> +					struct ocfs2_extent_tree *et,
> +					struct buffer_head **new_eb_bh,
> +					int blk_cnt);
> +static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et);
> +
>   static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
>   	.eo_set_last_eb_blk	= ocfs2_dinode_set_last_eb_blk,
>   	.eo_get_last_eb_blk	= ocfs2_dinode_get_last_eb_blk,
> @@ -448,6 +455,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
>   	if (!obj)
>   		obj = (void *)bh->b_data;
>   	et->et_object = obj;
> +	et->et_dealloc = NULL;

I wonder if there is necessity setting NULL here, as we will resign it
later.

thanks,
Jun

>   
>   	et->et_ops->eo_fill_root_el(et);
>   	if (!et->et_ops->eo_fill_max_leaf_clusters)
> @@ -1213,8 +1221,15 @@ static int ocfs2_add_branch(handle_t *handle,
>   		goto bail;
>   	}
>   
> -	status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
> -					   meta_ac, new_eb_bhs);
> +	if (meta_ac) {
> +		status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
> +						   meta_ac, new_eb_bhs);
> +	} else if (!ocfs2_is_dealloc_empty(et)) {
> +		status = ocfs2_reuse_blk_from_dealloc(handle, et,
> +						      new_eb_bhs, new_blocks);
> +	} else {
> +		BUG();
> +	}
>   	if (status < 0) {
>   		mlog_errno(status);
>   		goto bail;
> @@ -1347,8 +1362,15 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
>   	struct ocfs2_extent_list  *root_el;
>   	struct ocfs2_extent_list  *eb_el;
>   
> -	status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
> -					   &new_eb_bh);
> +	if (meta_ac) {
> +		status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
> +						   &new_eb_bh);
> +	} else if (!ocfs2_is_dealloc_empty(et)) {
> +		status = ocfs2_reuse_blk_from_dealloc(handle, et,
> +						      &new_eb_bh, 1);
> +	} else {
> +		BUG();
> +	}
>   	if (status < 0) {
>   		mlog_errno(status);
>   		goto bail;
> @@ -1511,7 +1533,7 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
>   	int depth = le16_to_cpu(el->l_tree_depth);
>   	struct buffer_head *bh = NULL;
>   
> -	BUG_ON(meta_ac == NULL);
> +	BUG_ON(meta_ac == NULL && ocfs2_is_dealloc_empty(et));
>   
>   	shift = ocfs2_find_branch_target(et, &bh);
>   	if (shift < 0) {
> @@ -6562,7 +6584,7 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb,
>   static struct ocfs2_per_slot_free_list *
>   ocfs2_find_per_slot_free_list(int type,
>   			      int slot,
> -			      struct ocfs2_cached_dealloc_ctxt *ctxt)
> +			      struct ocfs2_cached_dealloc_ctxt *ctxt, int new)
>   {
>   	struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
>   
> @@ -6573,6 +6595,10 @@ ocfs2_find_per_slot_free_list(int type,
>   		fl = fl->f_next_suballocator;
>   	}
>   
> +	/* we didn't find any, just return NULL if _new_is not set. */
> +	if (!new)
> +		return NULL;
> +
>   	fl = kmalloc(sizeof(*fl), GFP_NOFS);
>   	if (fl) {
>   		fl->f_inode_type = type;
> @@ -6585,6 +6611,106 @@ ocfs2_find_per_slot_free_list(int type,
>   	return fl;
>   }
>   
> +/* Return Value 1 indicates empty */
> +static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et)
> +{
> +	struct ocfs2_per_slot_free_list *fl;
> +	struct ocfs2_super *osb =
> +		OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
> +
> +	if (!et->et_dealloc)
> +		return 1;
> +
> +	fl = ocfs2_find_per_slot_free_list(EXTENT_ALLOC_SYSTEM_INODE,
> +					   osb->slot_num, et->et_dealloc, 0);
> +
> +	return fl ? 0 : 1;
> +}
> +
> +/* If extent was deleted from tree due to extent rotation and merging, and
> + * no metadata is reserved ahead of time. Try to reuse some extents
> + * just deleted. This is only used to reuse extent blocks.
> + * It is supposed to find enough extent blocks in dealloc if our estimation
> + * on metadata is accurate.
> + */
> +static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
> +					struct ocfs2_extent_tree *et,
> +					struct buffer_head **new_eb_bh,
> +					int blk_cnt)
> +{
> +	int i, status = -ENOSPC;
> +	struct ocfs2_cached_dealloc_ctxt *dealloc;
> +	struct ocfs2_per_slot_free_list *fl;
> +	struct ocfs2_cached_block_free *bf;
> +	struct ocfs2_extent_block *eb;
> +	struct ocfs2_super *osb =
> +		OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
> +
> +	dealloc = et->et_dealloc;
> +	if (!dealloc)
> +		goto bail;
> +
> +	for (i = 0; i < blk_cnt; i++) {
> +		fl = ocfs2_find_per_slot_free_list(EXTENT_ALLOC_SYSTEM_INODE,
> +						   osb->slot_num, dealloc, 0);
> +		/* The caller should guarantee that dealloc has cached some
> +		 * extent blocks.
> +		 */
> +		BUG_ON(!fl);
> +
> +		bf = fl->f_first;
> +		fl->f_first = bf->free_next;
> +
> +		new_eb_bh[i] = sb_getblk(osb->sb, bf->free_blk);
> +		if (new_eb_bh[i] == NULL) {
> +			status = -ENOMEM;
> +			mlog_errno(status);
> +			goto bail;
> +		}
> +
> +		mlog(0, "Reusing block(%llu) from dealloc\n", bf->free_blk);
> +
> +		ocfs2_set_new_buffer_uptodate(et->et_ci, new_eb_bh[i]);
> +
> +		status = ocfs2_journal_access_eb(handle, et->et_ci,
> +						 new_eb_bh[i],
> +						 OCFS2_JOURNAL_ACCESS_CREATE);
> +		if (status < 0) {
> +			mlog_errno(status);
> +			goto bail;
> +		}
> +
> +		memset(new_eb_bh[i]->b_data, 0, osb->sb->s_blocksize);
> +		eb = (struct ocfs2_extent_block *) new_eb_bh[i]->b_data;
> +
> +		/* We can't guarantee that buffer head is still cached, so
> +		 * polutlate the extent block again.
> +		 */
> +		strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
> +		eb->h_blkno = cpu_to_le64(bf->free_blk);
> +		eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
> +		eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
> +		eb->h_suballoc_loc = cpu_to_le64(bf->free_bg);
> +		eb->h_suballoc_bit = cpu_to_le16(bf->free_bit);
> +		eb->h_list.l_count =
> +			cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
> +
> +		/* We'll also be dirtied by the caller, so
> +		 * this isn't absolutely necessary.
> +		 */
> +		ocfs2_journal_dirty(handle, new_eb_bh[i]);
> +
> +		if (!fl->f_first) {
> +			dealloc->c_first_suballocator = fl->f_next_suballocator;
> +			kfree(fl);
> +		}
> +		kfree(bf);
> +	}
> +
> +bail:
> +	return status;
> +}
> +
>   int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
>   			      int type, int slot, u64 suballoc,
>   			      u64 blkno, unsigned int bit)
> @@ -6593,7 +6719,7 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
>   	struct ocfs2_per_slot_free_list *fl;
>   	struct ocfs2_cached_block_free *item;
>   
> -	fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
> +	fl = ocfs2_find_per_slot_free_list(type, slot, ctxt, 1);
>   	if (fl == NULL) {
>   		ret = -ENOMEM;
>   		mlog_errno(ret);
> diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
> index 27b75cf..250bcac 100644
> --- a/fs/ocfs2/alloc.h
> +++ b/fs/ocfs2/alloc.h
> @@ -61,6 +61,7 @@ struct ocfs2_extent_tree {
>   	ocfs2_journal_access_func		et_root_journal_access;
>   	void					*et_object;
>   	unsigned int				et_max_leaf_clusters;
> +	struct ocfs2_cached_dealloc_ctxt	*et_dealloc;
>   };
>   
>   /*
> diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
> index d151632..1bbd5c8 100644
> --- a/fs/ocfs2/aops.c
> +++ b/fs/ocfs2/aops.c
> @@ -797,6 +797,7 @@ struct ocfs2_write_ctxt {
>   	struct ocfs2_cached_dealloc_ctxt w_dealloc;
>   
>   	struct list_head		w_unwritten_list;
> +	unsigned int			w_unwritten_count;
>   };
>   
>   void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
> @@ -1386,6 +1387,7 @@ static int ocfs2_unwritten_check(struct inode *inode,
>   	desc->c_clear_unwritten = 0;
>   	list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list);
>   	list_add_tail(&new->ue_node, &wc->w_unwritten_list);
> +	wc->w_unwritten_count++;
>   	new = NULL;
>   unlock:
>   	spin_unlock(&oi->ip_lock);
> @@ -1762,8 +1764,8 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
>   		 */
>   		ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
>   					      wc->w_di_bh);
> -		ret = ocfs2_lock_allocators(inode, &et,
> -					    clusters_to_alloc, extents_to_split,
> +		ret = ocfs2_lock_allocators(inode, &et, clusters_to_alloc,
> +					    2*extents_to_split,
>   					    &data_ac, &meta_ac);
>   		if (ret) {
>   			mlog_errno(ret);
> @@ -2256,7 +2258,7 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
>   		ue->ue_phys = desc->c_phys;
>   
>   		list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list);
> -		dwc->dw_zero_count++;
> +		dwc->dw_zero_count += wc->w_unwritten_count;
>   	}
>   
>   	ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, wc);
> @@ -2330,6 +2332,12 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
>   
>   	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
>   
> +	/* Attach dealloc with extent tree in case that we may reuse extents
> +	 * which are already unlinked from current extent tree due to extent
> +	 * rotation and merging.
> +	 */
> +	et.et_dealloc = &dealloc;
> +
>   	ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
>   				    &data_ac, &meta_ac);
>   	if (ret) {
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Ocfs2-devel] [PATCH v2] ocfs2: try to reuse extent block in dealloc without meta_alloc
  2017-12-26  7:55 [Ocfs2-devel] [PATCH v2] ocfs2: try to reuse extent block in dealloc without meta_alloc Changwei Ge
  2017-12-27  8:47 ` piaojun
@ 2017-12-27 10:01 ` Junxiao Bi
  2017-12-27 12:21   ` Changwei Ge
  2017-12-28  6:21 ` alex chen
  2 siblings, 1 reply; 13+ messages in thread
From: Junxiao Bi @ 2017-12-27 10:01 UTC (permalink / raw)
  To: ocfs2-devel

Hi Changwei,


On 12/26/2017 03:55 PM, Changwei Ge wrote:
> A crash issue was reported by John.
> The call trace follows:
> ocfs2_split_extent+0x1ad3/0x1b40 [ocfs2]
> ocfs2_change_extent_flag+0x33a/0x470 [ocfs2]
> ocfs2_mark_extent_written+0x172/0x220 [ocfs2]
> ocfs2_dio_end_io+0x62d/0x910 [ocfs2]
> dio_complete+0x19a/0x1a0
> do_blockdev_direct_IO+0x19dd/0x1eb0
> __blockdev_direct_IO+0x43/0x50
> ocfs2_direct_IO+0x8f/0xa0 [ocfs2]
> generic_file_direct_write+0xb2/0x170
> __generic_file_write_iter+0xc3/0x1b0
> ocfs2_file_write_iter+0x4bb/0xca0 [ocfs2]
> __vfs_write+0xae/0xf0
> vfs_write+0xb8/0x1b0
> SyS_write+0x4f/0xb0
> system_call_fastpath+0x16/0x75
>
> The BUG code told that extent tree wants to grow but no metadata
> was reserved ahead of time.
>   From my investigation into this issue, the root cause it that although
> enough metadata is not reserved, there should be enough for following use.
> Rightmost extent is merged into its left one due to a certain times of
> marking extent written. Because during marking extent written, we got many
> physically continuous extents. At last, an empty extent showed up and the
> rightmost path is removed from extent tree.
I am trying to understand the issue. Quick questions.
Is this issue caused by BUG_ON(meta_ac == NULL)? Can you explain why it 
is NULL?

Thanks,
Junxiao.
>
> Add a new mechanism to reuse extent block cached in dealloc which were
> just unlinked from extent tree to solve this crash issue.
>
> Criteria is that during marking extents *written*, if extent rotation
> and merging results in unlinking extent with growing extent tree later
> without any metadata reserved ahead of time, try to reuse those extents
> in dealloc in which deleted extents are cached.
>
> Also, this patch addresses the issue John reported that ::dw_zero_count is
> not calculated properly.
>
> After applying this patch, the issue John reported was gone.
> Thanks for the reproducer provided by John.
> And this patch has passed ocfs2-test(29 cases) suite running by New H3C Group.
>
> Reported-by: John Lightsey <john@nixnuts.net>
> Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
> Reviewed-by: Duan Zhang <zhang.duan@h3c.com>
> ---
>    fs/ocfs2/alloc.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
>    fs/ocfs2/alloc.h |   1 +
>    fs/ocfs2/aops.c  |  14 ++++--
>    3 files changed, 145 insertions(+), 10 deletions(-)
>
> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
> index ab5105f..56aba96 100644
> --- a/fs/ocfs2/alloc.c
> +++ b/fs/ocfs2/alloc.c
> @@ -165,6 +165,13 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
>    				     struct ocfs2_extent_rec *rec);
>    static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
>    static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
> +
> +static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
> +					struct ocfs2_extent_tree *et,
> +					struct buffer_head **new_eb_bh,
> +					int blk_cnt);
> +static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et);
> +
>    static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
>    	.eo_set_last_eb_blk	= ocfs2_dinode_set_last_eb_blk,
>    	.eo_get_last_eb_blk	= ocfs2_dinode_get_last_eb_blk,
> @@ -448,6 +455,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
>    	if (!obj)
>    		obj = (void *)bh->b_data;
>    	et->et_object = obj;
> +	et->et_dealloc = NULL;
>    
>    	et->et_ops->eo_fill_root_el(et);
>    	if (!et->et_ops->eo_fill_max_leaf_clusters)
> @@ -1213,8 +1221,15 @@ static int ocfs2_add_branch(handle_t *handle,
>    		goto bail;
>    	}
>    
> -	status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
> -					   meta_ac, new_eb_bhs);
> +	if (meta_ac) {
> +		status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
> +						   meta_ac, new_eb_bhs);
> +	} else if (!ocfs2_is_dealloc_empty(et)) {
> +		status = ocfs2_reuse_blk_from_dealloc(handle, et,
> +						      new_eb_bhs, new_blocks);
> +	} else {
> +		BUG();
> +	}
>    	if (status < 0) {
>    		mlog_errno(status);
>    		goto bail;
> @@ -1347,8 +1362,15 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
>    	struct ocfs2_extent_list  *root_el;
>    	struct ocfs2_extent_list  *eb_el;
>    
> -	status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
> -					   &new_eb_bh);
> +	if (meta_ac) {
> +		status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
> +						   &new_eb_bh);
> +	} else if (!ocfs2_is_dealloc_empty(et)) {
> +		status = ocfs2_reuse_blk_from_dealloc(handle, et,
> +						      &new_eb_bh, 1);
> +	} else {
> +		BUG();
> +	}
>    	if (status < 0) {
>    		mlog_errno(status);
>    		goto bail;
> @@ -1511,7 +1533,7 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
>    	int depth = le16_to_cpu(el->l_tree_depth);
>    	struct buffer_head *bh = NULL;
>    
> -	BUG_ON(meta_ac == NULL);
> +	BUG_ON(meta_ac == NULL && ocfs2_is_dealloc_empty(et));
>    
>    	shift = ocfs2_find_branch_target(et, &bh);
>    	if (shift < 0) {
> @@ -6562,7 +6584,7 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb,
>    static struct ocfs2_per_slot_free_list *
>    ocfs2_find_per_slot_free_list(int type,
>    			      int slot,
> -			      struct ocfs2_cached_dealloc_ctxt *ctxt)
> +			      struct ocfs2_cached_dealloc_ctxt *ctxt, int new)
>    {
>    	struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
>    
> @@ -6573,6 +6595,10 @@ ocfs2_find_per_slot_free_list(int type,
>    		fl = fl->f_next_suballocator;
>    	}
>    
> +	/* we didn't find any, just return NULL if _new_is not set. */
> +	if (!new)
> +		return NULL;
> +
>    	fl = kmalloc(sizeof(*fl), GFP_NOFS);
>    	if (fl) {
>    		fl->f_inode_type = type;
> @@ -6585,6 +6611,106 @@ ocfs2_find_per_slot_free_list(int type,
>    	return fl;
>    }
>    
> +/* Return Value 1 indicates empty */
> +static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et)
> +{
> +	struct ocfs2_per_slot_free_list *fl;
> +	struct ocfs2_super *osb =
> +		OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
> +
> +	if (!et->et_dealloc)
> +		return 1;
> +
> +	fl = ocfs2_find_per_slot_free_list(EXTENT_ALLOC_SYSTEM_INODE,
> +					   osb->slot_num, et->et_dealloc, 0);
> +
> +	return fl ? 0 : 1;
> +}
> +
> +/* If extent was deleted from tree due to extent rotation and merging, and
> + * no metadata is reserved ahead of time. Try to reuse some extents
> + * just deleted. This is only used to reuse extent blocks.
> + * It is supposed to find enough extent blocks in dealloc if our estimation
> + * on metadata is accurate.
> + */
> +static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
> +					struct ocfs2_extent_tree *et,
> +					struct buffer_head **new_eb_bh,
> +					int blk_cnt)
> +{
> +	int i, status = -ENOSPC;
> +	struct ocfs2_cached_dealloc_ctxt *dealloc;
> +	struct ocfs2_per_slot_free_list *fl;
> +	struct ocfs2_cached_block_free *bf;
> +	struct ocfs2_extent_block *eb;
> +	struct ocfs2_super *osb =
> +		OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
> +
> +	dealloc = et->et_dealloc;
> +	if (!dealloc)
> +		goto bail;
> +
> +	for (i = 0; i < blk_cnt; i++) {
> +		fl = ocfs2_find_per_slot_free_list(EXTENT_ALLOC_SYSTEM_INODE,
> +						   osb->slot_num, dealloc, 0);
> +		/* The caller should guarantee that dealloc has cached some
> +		 * extent blocks.
> +		 */
> +		BUG_ON(!fl);
> +
> +		bf = fl->f_first;
> +		fl->f_first = bf->free_next;
> +
> +		new_eb_bh[i] = sb_getblk(osb->sb, bf->free_blk);
> +		if (new_eb_bh[i] == NULL) {
> +			status = -ENOMEM;
> +			mlog_errno(status);
> +			goto bail;
> +		}
> +
> +		mlog(0, "Reusing block(%llu) from dealloc\n", bf->free_blk);
> +
> +		ocfs2_set_new_buffer_uptodate(et->et_ci, new_eb_bh[i]);
> +
> +		status = ocfs2_journal_access_eb(handle, et->et_ci,
> +						 new_eb_bh[i],
> +						 OCFS2_JOURNAL_ACCESS_CREATE);
> +		if (status < 0) {
> +			mlog_errno(status);
> +			goto bail;
> +		}
> +
> +		memset(new_eb_bh[i]->b_data, 0, osb->sb->s_blocksize);
> +		eb = (struct ocfs2_extent_block *) new_eb_bh[i]->b_data;
> +
> +		/* We can't guarantee that buffer head is still cached, so
> +		 * polutlate the extent block again.
> +		 */
> +		strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
> +		eb->h_blkno = cpu_to_le64(bf->free_blk);
> +		eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
> +		eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
> +		eb->h_suballoc_loc = cpu_to_le64(bf->free_bg);
> +		eb->h_suballoc_bit = cpu_to_le16(bf->free_bit);
> +		eb->h_list.l_count =
> +			cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
> +
> +		/* We'll also be dirtied by the caller, so
> +		 * this isn't absolutely necessary.
> +		 */
> +		ocfs2_journal_dirty(handle, new_eb_bh[i]);
> +
> +		if (!fl->f_first) {
> +			dealloc->c_first_suballocator = fl->f_next_suballocator;
> +			kfree(fl);
> +		}
> +		kfree(bf);
> +	}
> +
> +bail:
> +	return status;
> +}
> +
>    int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
>    			      int type, int slot, u64 suballoc,
>    			      u64 blkno, unsigned int bit)
> @@ -6593,7 +6719,7 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
>    	struct ocfs2_per_slot_free_list *fl;
>    	struct ocfs2_cached_block_free *item;
>    
> -	fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
> +	fl = ocfs2_find_per_slot_free_list(type, slot, ctxt, 1);
>    	if (fl == NULL) {
>    		ret = -ENOMEM;
>    		mlog_errno(ret);
> diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
> index 27b75cf..250bcac 100644
> --- a/fs/ocfs2/alloc.h
> +++ b/fs/ocfs2/alloc.h
> @@ -61,6 +61,7 @@ struct ocfs2_extent_tree {
>    	ocfs2_journal_access_func		et_root_journal_access;
>    	void					*et_object;
>    	unsigned int				et_max_leaf_clusters;
> +	struct ocfs2_cached_dealloc_ctxt	*et_dealloc;
>    };
>    
>    /*
> diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
> index d151632..1bbd5c8 100644
> --- a/fs/ocfs2/aops.c
> +++ b/fs/ocfs2/aops.c
> @@ -797,6 +797,7 @@ struct ocfs2_write_ctxt {
>    	struct ocfs2_cached_dealloc_ctxt w_dealloc;
>    
>    	struct list_head		w_unwritten_list;
> +	unsigned int			w_unwritten_count;
>    };
>    
>    void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
> @@ -1386,6 +1387,7 @@ static int ocfs2_unwritten_check(struct inode *inode,
>    	desc->c_clear_unwritten = 0;
>    	list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list);
>    	list_add_tail(&new->ue_node, &wc->w_unwritten_list);
> +	wc->w_unwritten_count++;
>    	new = NULL;
>    unlock:
>    	spin_unlock(&oi->ip_lock);
> @@ -1762,8 +1764,8 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
>    		 */
>    		ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
>    					      wc->w_di_bh);
> -		ret = ocfs2_lock_allocators(inode, &et,
> -					    clusters_to_alloc, extents_to_split,
> +		ret = ocfs2_lock_allocators(inode, &et, clusters_to_alloc,
> +					    2*extents_to_split,
>    					    &data_ac, &meta_ac);
>    		if (ret) {
>    			mlog_errno(ret);
> @@ -2256,7 +2258,7 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
>    		ue->ue_phys = desc->c_phys;
>    
>    		list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list);
> -		dwc->dw_zero_count++;
> +		dwc->dw_zero_count += wc->w_unwritten_count;
>    	}
>    
>    	ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, wc);
> @@ -2330,6 +2332,12 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
>    
>    	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
>    
> +	/* Attach dealloc with extent tree in case that we may reuse extents
> +	 * which are already unlinked from current extent tree due to extent
> +	 * rotation and merging.
> +	 */
> +	et.et_dealloc = &dealloc;
> +
>    	ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
>    				    &data_ac, &meta_ac);
>    	if (ret) {

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Ocfs2-devel] [PATCH v2] ocfs2: try to reuse extent block in dealloc without meta_alloc
  2017-12-27 10:01 ` Junxiao Bi
@ 2017-12-27 12:21   ` Changwei Ge
  2018-01-02  8:17     ` Junxiao Bi
  0 siblings, 1 reply; 13+ messages in thread
From: Changwei Ge @ 2017-12-27 12:21 UTC (permalink / raw)
  To: ocfs2-devel

Hi Junxiao,

On 2017/12/27 18:02, Junxiao Bi wrote:
> Hi Changwei,
> 
> 
> On 12/26/2017 03:55 PM, Changwei Ge wrote:
>> A crash issue was reported by John.
>> The call trace follows:
>> ocfs2_split_extent+0x1ad3/0x1b40 [ocfs2]
>> ocfs2_change_extent_flag+0x33a/0x470 [ocfs2]
>> ocfs2_mark_extent_written+0x172/0x220 [ocfs2]
>> ocfs2_dio_end_io+0x62d/0x910 [ocfs2]
>> dio_complete+0x19a/0x1a0
>> do_blockdev_direct_IO+0x19dd/0x1eb0
>> __blockdev_direct_IO+0x43/0x50
>> ocfs2_direct_IO+0x8f/0xa0 [ocfs2]
>> generic_file_direct_write+0xb2/0x170
>> __generic_file_write_iter+0xc3/0x1b0
>> ocfs2_file_write_iter+0x4bb/0xca0 [ocfs2]
>> __vfs_write+0xae/0xf0
>> vfs_write+0xb8/0x1b0
>> SyS_write+0x4f/0xb0
>> system_call_fastpath+0x16/0x75
>>
>> The BUG code told that extent tree wants to grow but no metadata
>> was reserved ahead of time.
>>    From my investigation into this issue, the root cause it that although
>> enough metadata is not reserved, there should be enough for following use.
>> Rightmost extent is merged into its left one due to a certain times of
>> marking extent written. Because during marking extent written, we got many
>> physically continuous extents. At last, an empty extent showed up and the
>> rightmost path is removed from extent tree.
> I am trying to understand the issue. Quick questions.
> Is this issue caused by BUG_ON(meta_ac == NULL)? Can you explain why it
> is NULL?

My pleasure to.
Before marking extents written, we have to estimate how many metadata will be used.
If there are enough metadata for following operation-marking extent written, no metadata
will be reserved ahead of time.

For this BUG scenario, it happens that extent tree already has enough free metadata.
This can be referred by code path:
ocfs2_dio_end_io_write
   ocfs2_lock_allocators - No need to reserve metadata,since extent tree already has more metadata than needed. So *mata_ac* is NULL.
   ocfs2_mark_extent_written
     ocfs2_change_extent_flag - cluster by cluster
       ocfs2_split_extent
         ocfs2_try_to_merge_extent - During filling file hole, we mark cluster as written one by one. Somehow, we merge those physically continuous cluster(WRITTEN) into a single one.
           ocfs2_rotate_tree_left - rotate extent
             __ocfs2_rotate_tree_left
               ocfs2_rotate_subtree_left - Aha, we find a totally empty extent here, so unlink it from extent tree.
                 ocfs2_unlink_subtree
                 ocfs2_remove_empty_extent
                 

Then, since we delete one extent block, our previously estimated metadata number is
pointless(NOTE, actually the estimation is accurate.). Since it is reduced resulted by extent
rotation and merging.
So for now, we don't have enough metadata.

Then we are still marking extent tree written for left clusters and we need to split extent.
But we don't have enough metadata to accommodate the split extent records.
So extent tree need to grow.
BUG. no metadata is reserved ahead of time.

Thanks,
Changwei
           

> 
> Thanks,
> Junxiao.
>>
>> Add a new mechanism to reuse extent block cached in dealloc which were
>> just unlinked from extent tree to solve this crash issue.
>>
>> Criteria is that during marking extents *written*, if extent rotation
>> and merging results in unlinking extent with growing extent tree later
>> without any metadata reserved ahead of time, try to reuse those extents
>> in dealloc in which deleted extents are cached.
>>
>> Also, this patch addresses the issue John reported that ::dw_zero_count is
>> not calculated properly.
>>
>> After applying this patch, the issue John reported was gone.
>> Thanks for the reproducer provided by John.
>> And this patch has passed ocfs2-test(29 cases) suite running by New H3C Group.
>>
>> Reported-by: John Lightsey <john@nixnuts.net>
>> Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
>> Reviewed-by: Duan Zhang <zhang.duan@h3c.com>
>> ---
>>     fs/ocfs2/alloc.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
>>     fs/ocfs2/alloc.h |   1 +
>>     fs/ocfs2/aops.c  |  14 ++++--
>>     3 files changed, 145 insertions(+), 10 deletions(-)
>>
>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>> index ab5105f..56aba96 100644
>> --- a/fs/ocfs2/alloc.c
>> +++ b/fs/ocfs2/alloc.c
>> @@ -165,6 +165,13 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
>>     				     struct ocfs2_extent_rec *rec);
>>     static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
>>     static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
>> +
>> +static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
>> +					struct ocfs2_extent_tree *et,
>> +					struct buffer_head **new_eb_bh,
>> +					int blk_cnt);
>> +static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et);
>> +
>>     static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
>>     	.eo_set_last_eb_blk	= ocfs2_dinode_set_last_eb_blk,
>>     	.eo_get_last_eb_blk	= ocfs2_dinode_get_last_eb_blk,
>> @@ -448,6 +455,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
>>     	if (!obj)
>>     		obj = (void *)bh->b_data;
>>     	et->et_object = obj;
>> +	et->et_dealloc = NULL;
>>     
>>     	et->et_ops->eo_fill_root_el(et);
>>     	if (!et->et_ops->eo_fill_max_leaf_clusters)
>> @@ -1213,8 +1221,15 @@ static int ocfs2_add_branch(handle_t *handle,
>>     		goto bail;
>>     	}
>>     
>> -	status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
>> -					   meta_ac, new_eb_bhs);
>> +	if (meta_ac) {
>> +		status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
>> +						   meta_ac, new_eb_bhs);
>> +	} else if (!ocfs2_is_dealloc_empty(et)) {
>> +		status = ocfs2_reuse_blk_from_dealloc(handle, et,
>> +						      new_eb_bhs, new_blocks);
>> +	} else {
>> +		BUG();
>> +	}
>>     	if (status < 0) {
>>     		mlog_errno(status);
>>     		goto bail;
>> @@ -1347,8 +1362,15 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
>>     	struct ocfs2_extent_list  *root_el;
>>     	struct ocfs2_extent_list  *eb_el;
>>     
>> -	status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
>> -					   &new_eb_bh);
>> +	if (meta_ac) {
>> +		status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
>> +						   &new_eb_bh);
>> +	} else if (!ocfs2_is_dealloc_empty(et)) {
>> +		status = ocfs2_reuse_blk_from_dealloc(handle, et,
>> +						      &new_eb_bh, 1);
>> +	} else {
>> +		BUG();
>> +	}
>>     	if (status < 0) {
>>     		mlog_errno(status);
>>     		goto bail;
>> @@ -1511,7 +1533,7 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
>>     	int depth = le16_to_cpu(el->l_tree_depth);
>>     	struct buffer_head *bh = NULL;
>>     
>> -	BUG_ON(meta_ac == NULL);
>> +	BUG_ON(meta_ac == NULL && ocfs2_is_dealloc_empty(et));
>>     
>>     	shift = ocfs2_find_branch_target(et, &bh);
>>     	if (shift < 0) {
>> @@ -6562,7 +6584,7 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb,
>>     static struct ocfs2_per_slot_free_list *
>>     ocfs2_find_per_slot_free_list(int type,
>>     			      int slot,
>> -			      struct ocfs2_cached_dealloc_ctxt *ctxt)
>> +			      struct ocfs2_cached_dealloc_ctxt *ctxt, int new)
>>     {
>>     	struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
>>     
>> @@ -6573,6 +6595,10 @@ ocfs2_find_per_slot_free_list(int type,
>>     		fl = fl->f_next_suballocator;
>>     	}
>>     
>> +	/* we didn't find any, just return NULL if _new_is not set. */
>> +	if (!new)
>> +		return NULL;
>> +
>>     	fl = kmalloc(sizeof(*fl), GFP_NOFS);
>>     	if (fl) {
>>     		fl->f_inode_type = type;
>> @@ -6585,6 +6611,106 @@ ocfs2_find_per_slot_free_list(int type,
>>     	return fl;
>>     }
>>     
>> +/* Return Value 1 indicates empty */
>> +static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et)
>> +{
>> +	struct ocfs2_per_slot_free_list *fl;
>> +	struct ocfs2_super *osb =
>> +		OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
>> +
>> +	if (!et->et_dealloc)
>> +		return 1;
>> +
>> +	fl = ocfs2_find_per_slot_free_list(EXTENT_ALLOC_SYSTEM_INODE,
>> +					   osb->slot_num, et->et_dealloc, 0);
>> +
>> +	return fl ? 0 : 1;
>> +}
>> +
>> +/* If extent was deleted from tree due to extent rotation and merging, and
>> + * no metadata is reserved ahead of time. Try to reuse some extents
>> + * just deleted. This is only used to reuse extent blocks.
>> + * It is supposed to find enough extent blocks in dealloc if our estimation
>> + * on metadata is accurate.
>> + */
>> +static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
>> +					struct ocfs2_extent_tree *et,
>> +					struct buffer_head **new_eb_bh,
>> +					int blk_cnt)
>> +{
>> +	int i, status = -ENOSPC;
>> +	struct ocfs2_cached_dealloc_ctxt *dealloc;
>> +	struct ocfs2_per_slot_free_list *fl;
>> +	struct ocfs2_cached_block_free *bf;
>> +	struct ocfs2_extent_block *eb;
>> +	struct ocfs2_super *osb =
>> +		OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
>> +
>> +	dealloc = et->et_dealloc;
>> +	if (!dealloc)
>> +		goto bail;
>> +
>> +	for (i = 0; i < blk_cnt; i++) {
>> +		fl = ocfs2_find_per_slot_free_list(EXTENT_ALLOC_SYSTEM_INODE,
>> +						   osb->slot_num, dealloc, 0);
>> +		/* The caller should guarantee that dealloc has cached some
>> +		 * extent blocks.
>> +		 */
>> +		BUG_ON(!fl);
>> +
>> +		bf = fl->f_first;
>> +		fl->f_first = bf->free_next;
>> +
>> +		new_eb_bh[i] = sb_getblk(osb->sb, bf->free_blk);
>> +		if (new_eb_bh[i] == NULL) {
>> +			status = -ENOMEM;
>> +			mlog_errno(status);
>> +			goto bail;
>> +		}
>> +
>> +		mlog(0, "Reusing block(%llu) from dealloc\n", bf->free_blk);
>> +
>> +		ocfs2_set_new_buffer_uptodate(et->et_ci, new_eb_bh[i]);
>> +
>> +		status = ocfs2_journal_access_eb(handle, et->et_ci,
>> +						 new_eb_bh[i],
>> +						 OCFS2_JOURNAL_ACCESS_CREATE);
>> +		if (status < 0) {
>> +			mlog_errno(status);
>> +			goto bail;
>> +		}
>> +
>> +		memset(new_eb_bh[i]->b_data, 0, osb->sb->s_blocksize);
>> +		eb = (struct ocfs2_extent_block *) new_eb_bh[i]->b_data;
>> +
>> +		/* We can't guarantee that buffer head is still cached, so
>> +		 * polutlate the extent block again.
>> +		 */
>> +		strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
>> +		eb->h_blkno = cpu_to_le64(bf->free_blk);
>> +		eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
>> +		eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
>> +		eb->h_suballoc_loc = cpu_to_le64(bf->free_bg);
>> +		eb->h_suballoc_bit = cpu_to_le16(bf->free_bit);
>> +		eb->h_list.l_count =
>> +			cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
>> +
>> +		/* We'll also be dirtied by the caller, so
>> +		 * this isn't absolutely necessary.
>> +		 */
>> +		ocfs2_journal_dirty(handle, new_eb_bh[i]);
>> +
>> +		if (!fl->f_first) {
>> +			dealloc->c_first_suballocator = fl->f_next_suballocator;
>> +			kfree(fl);
>> +		}
>> +		kfree(bf);
>> +	}
>> +
>> +bail:
>> +	return status;
>> +}
>> +
>>     int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
>>     			      int type, int slot, u64 suballoc,
>>     			      u64 blkno, unsigned int bit)
>> @@ -6593,7 +6719,7 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
>>     	struct ocfs2_per_slot_free_list *fl;
>>     	struct ocfs2_cached_block_free *item;
>>     
>> -	fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
>> +	fl = ocfs2_find_per_slot_free_list(type, slot, ctxt, 1);
>>     	if (fl == NULL) {
>>     		ret = -ENOMEM;
>>     		mlog_errno(ret);
>> diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
>> index 27b75cf..250bcac 100644
>> --- a/fs/ocfs2/alloc.h
>> +++ b/fs/ocfs2/alloc.h
>> @@ -61,6 +61,7 @@ struct ocfs2_extent_tree {
>>     	ocfs2_journal_access_func		et_root_journal_access;
>>     	void					*et_object;
>>     	unsigned int				et_max_leaf_clusters;
>> +	struct ocfs2_cached_dealloc_ctxt	*et_dealloc;
>>     };
>>     
>>     /*
>> diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
>> index d151632..1bbd5c8 100644
>> --- a/fs/ocfs2/aops.c
>> +++ b/fs/ocfs2/aops.c
>> @@ -797,6 +797,7 @@ struct ocfs2_write_ctxt {
>>     	struct ocfs2_cached_dealloc_ctxt w_dealloc;
>>     
>>     	struct list_head		w_unwritten_list;
>> +	unsigned int			w_unwritten_count;
>>     };
>>     
>>     void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
>> @@ -1386,6 +1387,7 @@ static int ocfs2_unwritten_check(struct inode *inode,
>>     	desc->c_clear_unwritten = 0;
>>     	list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list);
>>     	list_add_tail(&new->ue_node, &wc->w_unwritten_list);
>> +	wc->w_unwritten_count++;
>>     	new = NULL;
>>     unlock:
>>     	spin_unlock(&oi->ip_lock);
>> @@ -1762,8 +1764,8 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
>>     		 */
>>     		ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
>>     					      wc->w_di_bh);
>> -		ret = ocfs2_lock_allocators(inode, &et,
>> -					    clusters_to_alloc, extents_to_split,
>> +		ret = ocfs2_lock_allocators(inode, &et, clusters_to_alloc,
>> +					    2*extents_to_split,
>>     					    &data_ac, &meta_ac);
>>     		if (ret) {
>>     			mlog_errno(ret);
>> @@ -2256,7 +2258,7 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
>>     		ue->ue_phys = desc->c_phys;
>>     
>>     		list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list);
>> -		dwc->dw_zero_count++;
>> +		dwc->dw_zero_count += wc->w_unwritten_count;
>>     	}
>>     
>>     	ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, wc);
>> @@ -2330,6 +2332,12 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
>>     
>>     	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
>>     
>> +	/* Attach dealloc with extent tree in case that we may reuse extents
>> +	 * which are already unlinked from current extent tree due to extent
>> +	 * rotation and merging.
>> +	 */
>> +	et.et_dealloc = &dealloc;
>> +
>>     	ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
>>     				    &data_ac, &meta_ac);
>>     	if (ret) {
> 
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Ocfs2-devel] [PATCH v2] ocfs2: try to reuse extent block in dealloc without meta_alloc
  2017-12-26  7:55 [Ocfs2-devel] [PATCH v2] ocfs2: try to reuse extent block in dealloc without meta_alloc Changwei Ge
  2017-12-27  8:47 ` piaojun
  2017-12-27 10:01 ` Junxiao Bi
@ 2017-12-28  6:21 ` alex chen
  2017-12-28  6:38   ` Changwei Ge
  2 siblings, 1 reply; 13+ messages in thread
From: alex chen @ 2017-12-28  6:21 UTC (permalink / raw)
  To: ocfs2-devel

Hi Changwei,

On 2017/12/26 15:55, Changwei Ge wrote:
> A crash issue was reported by John.
> The call trace follows:
> ocfs2_split_extent+0x1ad3/0x1b40 [ocfs2]
> ocfs2_change_extent_flag+0x33a/0x470 [ocfs2]
> ocfs2_mark_extent_written+0x172/0x220 [ocfs2]
> ocfs2_dio_end_io+0x62d/0x910 [ocfs2]
> dio_complete+0x19a/0x1a0
> do_blockdev_direct_IO+0x19dd/0x1eb0
> __blockdev_direct_IO+0x43/0x50
> ocfs2_direct_IO+0x8f/0xa0 [ocfs2]
> generic_file_direct_write+0xb2/0x170
> __generic_file_write_iter+0xc3/0x1b0
> ocfs2_file_write_iter+0x4bb/0xca0 [ocfs2]
> __vfs_write+0xae/0xf0
> vfs_write+0xb8/0x1b0
> SyS_write+0x4f/0xb0
> system_call_fastpath+0x16/0x75
> 
> The BUG code told that extent tree wants to grow but no metadata
> was reserved ahead of time.
>  From my investigation into this issue, the root cause it that although
> enough metadata is not reserved, there should be enough for following use.
> Rightmost extent is merged into its left one due to a certain times of
> marking extent written. Because during marking extent written, we got many
> physically continuous extents. At last, an empty extent showed up and the
> rightmost path is removed from extent tree.
> 
> Add a new mechanism to reuse extent block cached in dealloc which were
> just unlinked from extent tree to solve this crash issue.
> 
> Criteria is that during marking extents *written*, if extent rotation
> and merging results in unlinking extent with growing extent tree later
> without any metadata reserved ahead of time, try to reuse those extents
> in dealloc in which deleted extents are cached.
> 
> Also, this patch addresses the issue John reported that ::dw_zero_count is
> not calculated properly.
> 
> After applying this patch, the issue John reported was gone.
> Thanks for the reproducer provided by John.
> And this patch has passed ocfs2-test(29 cases) suite running by New H3C Group.
> 
> Reported-by: John Lightsey <john@nixnuts.net>
> Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
> Reviewed-by: Duan Zhang <zhang.duan@h3c.com>
> ---
>   fs/ocfs2/alloc.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
>   fs/ocfs2/alloc.h |   1 +
>   fs/ocfs2/aops.c  |  14 ++++--
>   3 files changed, 145 insertions(+), 10 deletions(-)
> 
> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
> index ab5105f..56aba96 100644
> --- a/fs/ocfs2/alloc.c
> +++ b/fs/ocfs2/alloc.c
> @@ -165,6 +165,13 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
>   				     struct ocfs2_extent_rec *rec);
>   static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
>   static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
> +
> +static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
> +					struct ocfs2_extent_tree *et,
> +					struct buffer_head **new_eb_bh,
> +					int blk_cnt);
> +static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et);
> +
>   static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
>   	.eo_set_last_eb_blk	= ocfs2_dinode_set_last_eb_blk,
>   	.eo_get_last_eb_blk	= ocfs2_dinode_get_last_eb_blk,
> @@ -448,6 +455,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
>   	if (!obj)
>   		obj = (void *)bh->b_data;
>   	et->et_object = obj;
> +	et->et_dealloc = NULL;
>   
>   	et->et_ops->eo_fill_root_el(et);
>   	if (!et->et_ops->eo_fill_max_leaf_clusters)
> @@ -1213,8 +1221,15 @@ static int ocfs2_add_branch(handle_t *handle,
>   		goto bail;
>   	}
>   
> -	status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
> -					   meta_ac, new_eb_bhs);
> +	if (meta_ac) {
> +		status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
> +						   meta_ac, new_eb_bhs);
> +	} else if (!ocfs2_is_dealloc_empty(et)) {
> +		status = ocfs2_reuse_blk_from_dealloc(handle, et,
> +						      new_eb_bhs, new_blocks);
> +	} else {
> +		BUG();
> +	}
>   	if (status < 0) {
>   		mlog_errno(status);
>   		goto bail;
> @@ -1347,8 +1362,15 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
>   	struct ocfs2_extent_list  *root_el;
>   	struct ocfs2_extent_list  *eb_el;
>   
> -	status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
> -					   &new_eb_bh);
> +	if (meta_ac) {
> +		status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
> +						   &new_eb_bh);
> +	} else if (!ocfs2_is_dealloc_empty(et)) {
> +		status = ocfs2_reuse_blk_from_dealloc(handle, et,
> +						      &new_eb_bh, 1);
> +	} else {
> +		BUG();
> +	}
>   	if (status < 0) {
>   		mlog_errno(status);
>   		goto bail;
> @@ -1511,7 +1533,7 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
>   	int depth = le16_to_cpu(el->l_tree_depth);
>   	struct buffer_head *bh = NULL;
>   
> -	BUG_ON(meta_ac == NULL);
> +	BUG_ON(meta_ac == NULL && ocfs2_is_dealloc_empty(et));
>   
>   	shift = ocfs2_find_branch_target(et, &bh);
>   	if (shift < 0) {
> @@ -6562,7 +6584,7 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb,
>   static struct ocfs2_per_slot_free_list *
>   ocfs2_find_per_slot_free_list(int type,
>   			      int slot,
> -			      struct ocfs2_cached_dealloc_ctxt *ctxt)
> +			      struct ocfs2_cached_dealloc_ctxt *ctxt, int new)
>   {
>   	struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
>   
> @@ -6573,6 +6595,10 @@ ocfs2_find_per_slot_free_list(int type,
>   		fl = fl->f_next_suballocator;
>   	}
>   
> +	/* we didn't find any, just return NULL if _new_is not set. */
> +	if (!new)
> +		return NULL;
> +
>   	fl = kmalloc(sizeof(*fl), GFP_NOFS);
>   	if (fl) {
>   		fl->f_inode_type = type;
> @@ -6585,6 +6611,106 @@ ocfs2_find_per_slot_free_list(int type,
>   	return fl;
>   }
>   
> +/* Return Value 1 indicates empty */
> +static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et)
> +{
> +	struct ocfs2_per_slot_free_list *fl;
> +	struct ocfs2_super *osb =
> +		OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
> +
> +	if (!et->et_dealloc)
> +		return 1;
> +
> +	fl = ocfs2_find_per_slot_free_list(EXTENT_ALLOC_SYSTEM_INODE,
> +					   osb->slot_num, et->et_dealloc, 0);
> +
> +	return fl ? 0 : 1;
> +}
> +
> +/* If extent was deleted from tree due to extent rotation and merging, and
> + * no metadata is reserved ahead of time. Try to reuse some extents
> + * just deleted. This is only used to reuse extent blocks.
> + * It is supposed to find enough extent blocks in dealloc if our estimation
> + * on metadata is accurate.
> + */
> +static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
> +					struct ocfs2_extent_tree *et,
> +					struct buffer_head **new_eb_bh,
> +					int blk_cnt)
> +{
> +	int i, status = -ENOSPC;
> +	struct ocfs2_cached_dealloc_ctxt *dealloc;
> +	struct ocfs2_per_slot_free_list *fl;
> +	struct ocfs2_cached_block_free *bf;
> +	struct ocfs2_extent_block *eb;
> +	struct ocfs2_super *osb =
> +		OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
> +
> +	dealloc = et->et_dealloc;
> +	if (!dealloc)
> +		goto bail;
> +
> +	for (i = 0; i < blk_cnt; i++) {
> +		fl = ocfs2_find_per_slot_free_list(EXTENT_ALLOC_SYSTEM_INODE,
> +						   osb->slot_num, dealloc, 0);
> +		/* The caller should guarantee that dealloc has cached some
> +		 * extent blocks.
> +		 */
> +		BUG_ON(!fl);
> +
> +		bf = fl->f_first;
> +		fl->f_first = bf->free_next;
> +
> +		new_eb_bh[i] = sb_getblk(osb->sb, bf->free_blk);
> +		if (new_eb_bh[i] == NULL) {
> +			status = -ENOMEM;
> +			mlog_errno(status);
> +			goto bail;
> +		}
> +
> +		mlog(0, "Reusing block(%llu) from dealloc\n", bf->free_blk);
> +
> +		ocfs2_set_new_buffer_uptodate(et->et_ci, new_eb_bh[i]);
> +
> +		status = ocfs2_journal_access_eb(handle, et->et_ci,
> +						 new_eb_bh[i],
> +						 OCFS2_JOURNAL_ACCESS_CREATE);
> +		if (status < 0) {
> +			mlog_errno(status);
> +			goto bail;
> +		}
> +
> +		memset(new_eb_bh[i]->b_data, 0, osb->sb->s_blocksize);
> +		eb = (struct ocfs2_extent_block *) new_eb_bh[i]->b_data;
> +
> +		/* We can't guarantee that buffer head is still cached, so
> +		 * polutlate the extent block again.
> +		 */
> +		strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
> +		eb->h_blkno = cpu_to_le64(bf->free_blk);
> +		eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
> +		eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
> +		eb->h_suballoc_loc = cpu_to_le64(bf->free_bg);
> +		eb->h_suballoc_bit = cpu_to_le16(bf->free_bit);
> +		eb->h_list.l_count =
> +			cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
> +
> +		/* We'll also be dirtied by the caller, so
> +		 * this isn't absolutely necessary.
> +		 */
> +		ocfs2_journal_dirty(handle, new_eb_bh[i]);
> +
> +		if (!fl->f_first) {
> +			dealloc->c_first_suballocator = fl->f_next_suballocator;
> +			kfree(fl);
> +		}
> +		kfree(bf);
> +	}
> +
> +bail:
> +	return status;
> +}
> +
>   int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
>   			      int type, int slot, u64 suballoc,
>   			      u64 blkno, unsigned int bit)
> @@ -6593,7 +6719,7 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
>   	struct ocfs2_per_slot_free_list *fl;
>   	struct ocfs2_cached_block_free *item;
>   
> -	fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
> +	fl = ocfs2_find_per_slot_free_list(type, slot, ctxt, 1);
>   	if (fl == NULL) {
>   		ret = -ENOMEM;
>   		mlog_errno(ret);
> diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
> index 27b75cf..250bcac 100644
> --- a/fs/ocfs2/alloc.h
> +++ b/fs/ocfs2/alloc.h
> @@ -61,6 +61,7 @@ struct ocfs2_extent_tree {
>   	ocfs2_journal_access_func		et_root_journal_access;
>   	void					*et_object;
>   	unsigned int				et_max_leaf_clusters;
> +	struct ocfs2_cached_dealloc_ctxt	*et_dealloc;
>   };
>   
>   /*
> diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
> index d151632..1bbd5c8 100644
> --- a/fs/ocfs2/aops.c
> +++ b/fs/ocfs2/aops.c
> @@ -797,6 +797,7 @@ struct ocfs2_write_ctxt {
>   	struct ocfs2_cached_dealloc_ctxt w_dealloc;
>   
>   	struct list_head		w_unwritten_list;
> +	unsigned int			w_unwritten_count;
>   };
>   
>   void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
> @@ -1386,6 +1387,7 @@ static int ocfs2_unwritten_check(struct inode *inode,
>   	desc->c_clear_unwritten = 0;
>   	list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list);
>   	list_add_tail(&new->ue_node, &wc->w_unwritten_list);
> +	wc->w_unwritten_count++;
>   	new = NULL;
>   unlock:
>   	spin_unlock(&oi->ip_lock);
> @@ -1762,8 +1764,8 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
>   		 */
>   		ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
>   					      wc->w_di_bh);
> -		ret = ocfs2_lock_allocators(inode, &et,
> -					    clusters_to_alloc, extents_to_split,
> +		ret = ocfs2_lock_allocators(inode, &et, clusters_to_alloc,
> +					    2*extents_to_split,
>   					    &data_ac, &meta_ac);
We has doubled extent number in ocfs2_populate_write_desc(), so here we don't need to double again.

Thanks,
Alex
>   		if (ret) {
>   			mlog_errno(ret);
> @@ -2256,7 +2258,7 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
>   		ue->ue_phys = desc->c_phys;
>   
>   		list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list);
> -		dwc->dw_zero_count++;
> +		dwc->dw_zero_count += wc->w_unwritten_count;
>   	}
>   
>   	ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, wc);
> @@ -2330,6 +2332,12 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
>   
>   	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
>   
> +	/* Attach dealloc with extent tree in case that we may reuse extents
> +	 * which are already unlinked from current extent tree due to extent
> +	 * rotation and merging.
> +	 */
> +	et.et_dealloc = &dealloc;
> +
>   	ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
>   				    &data_ac, &meta_ac);
>   	if (ret) {
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Ocfs2-devel] [PATCH v2] ocfs2: try to reuse extent block in dealloc without meta_alloc
  2017-12-27  8:47 ` piaojun
@ 2017-12-28  6:30   ` Changwei Ge
  0 siblings, 0 replies; 13+ messages in thread
From: Changwei Ge @ 2017-12-28  6:30 UTC (permalink / raw)
  To: ocfs2-devel

Hi Jun,

On 2017/12/27 16:48, piaojun wrote:
> Hi Changwei,
> 
> On 2017/12/26 15:55, Changwei Ge wrote:
>> A crash issue was reported by John.
>> The call trace follows:

>> without any metadata reserved ahead of time, try to reuse those extents
>> in dealloc in which deleted extents are cached.
>>
>> Also, this patch addresses the issue John reported that ::dw_zero_count is
>> not calculated properly.
> 
> Does you patch solve two different problems? If so, I suggest split it
> in two patch.

It might be considerable.

> 
>>
>> After applying this patch, the issue John reported was gone.
>> Thanks for the reproducer provided by John.
>> And this patch has passed ocfs2-test(29 cases) suite running by New H3C Group.
>>
ta;
>>    	et->et_object = obj;
>> +	et->et_dealloc = NULL;
> 
> I wonder if there is necessity setting NULL here, as we will resign it
> later.

I think it is necessary to evaluate ::et_dealloc to NULL here.
Since there are some other code patch also using this __ocfs2_init_extent_tree.
Besides, we need to use ::et_dealloc to judge if we can find some extent blocks
to reuse or if the dealloc is empty.

Thanks,
Changwei

> 
> thanks,
> Jun
> 
>>    

> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Ocfs2-devel] [PATCH v2] ocfs2: try to reuse extent block in dealloc without meta_alloc
  2017-12-28  6:21 ` alex chen
@ 2017-12-28  6:38   ` Changwei Ge
  2017-12-29 21:15     ` John Lightsey
  0 siblings, 1 reply; 13+ messages in thread
From: Changwei Ge @ 2017-12-28  6:38 UTC (permalink / raw)
  To: ocfs2-devel

Hi Alex,

On 2017/12/28 14:22, alex chen wrote:
> Hi Changwei,
> 
>> @@ -1347,8 +1362,15 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
>>    	struct ocfs2_extent_list  *root_el;

>> @@ -1762,8 +1764,8 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
>>    		 */
>>    		ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
>>    					      wc->w_di_bh);
>> -		ret = ocfs2_lock_allocators(inode, &et,
>> -					    clusters_to_alloc, extents_to_split,
>> +		ret = ocfs2_lock_allocators(inode, &et, clusters_to_alloc,
>> +					    2*extents_to_split,
>>    					    &data_ac, &meta_ac);
> We has doubled extent number in ocfs2_populate_write_desc(), so here we don't need to double again.

I think your comment here makes sense.
I will remove the multiplier 2 in my next version of this patch.

Thanks,
Changwei


> 
> Thanks,
> Alex

> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Ocfs2-devel] [PATCH v2] ocfs2: try to reuse extent block in dealloc without meta_alloc
  2017-12-28  6:38   ` Changwei Ge
@ 2017-12-29 21:15     ` John Lightsey
  2017-12-29 22:36       ` Ge Changwei
  0 siblings, 1 reply; 13+ messages in thread
From: John Lightsey @ 2017-12-29 21:15 UTC (permalink / raw)
  To: ocfs2-devel

On Thu, 2017-12-28 at 06:38 +0000, Changwei Ge wrote:
> Hi Alex,
> 
> On 2017/12/28 14:22, alex chen wrote:
> > We has doubled extent number in ocfs2_populate_write_desc(), so
> > here we don't need to double again.
> 
> I think your comment here makes sense.
> I will remove the multiplier 2 in my next version of this patch.
> 

This may have made it function in testing where it should have failed.

I backported this patch to Debian's 4.9.65 kernel today, removed the *2
multiplier, and hit a related assertion failure in this codepath:

ocfs2_mark_extent_written
 ocfs2_change_extent_flag
   ocfs2_split_extent
    ocfs2_split_and_insert
     ocfs2_grow_tree
      ocfs2_add_branch
       ocfs2_create_new_meta_bhs
        ocfs2_claim_metadata

The assertion failure in ocfs2_claim_metadata() was

BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));

My guess is that meta_ac was allocated at the beginning of the write,
then the tree was truncated during the write, so the actual allocation
is requesting more than was originally reserved.

It seems like the codepaths where meta_ac != NULL also need to know
about the list of extents available in et_dealloc.

The Perl code I used to hit this bug is attached.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: directio.pl
Type: application/x-perl
Size: 2145 bytes
Desc: not available
Url : http://oss.oracle.com/pipermail/ocfs2-devel/attachments/20171229/396d8318/attachment.bin 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Ocfs2-devel] [PATCH v2] ocfs2: try to reuse extent block in dealloc without meta_alloc
  2017-12-29 21:15     ` John Lightsey
@ 2017-12-29 22:36       ` Ge Changwei
  2018-01-07 21:07         ` John Lightsey
  0 siblings, 1 reply; 13+ messages in thread
From: Ge Changwei @ 2017-12-29 22:36 UTC (permalink / raw)
  To: ocfs2-devel

Hi john,

Thanks for your test.
I am on a vacation until 3, January.
I will investigate the issue when I am back to work.

Thanks,
Changwei

Sent from my iPhone

> On 30 Dec 2017, at 5:17 AM, John Lightsey <john@nixnuts.net> wrote:
> 
>> On Thu, 2017-12-28 at 06:38 +0000, Changwei Ge wrote:
>> Hi Alex,
>> 
>>> On 2017/12/28 14:22, alex chen wrote:
>>> We has doubled extent number in ocfs2_populate_write_desc(), so
>>> here we don't need to double again.
>> 
>> I think your comment here makes sense.
>> I will remove the multiplier 2 in my next version of this patch.
>> 
> 
> This may have made it function in testing where it should have failed.
> 
> I backported this patch to Debian's 4.9.65 kernel today, removed the *2
> multiplier, and hit a related assertion failure in this codepath:
> 
> ocfs2_mark_extent_written
> ocfs2_change_extent_flag
>   ocfs2_split_extent
>    ocfs2_split_and_insert
>     ocfs2_grow_tree
>      ocfs2_add_branch
>       ocfs2_create_new_meta_bhs
>        ocfs2_claim_metadata
> 
> The assertion failure in ocfs2_claim_metadata() was
> 
> BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
> 
> My guess is that meta_ac was allocated at the beginning of the write,
> then the tree was truncated during the write, so the actual allocation
> is requesting more than was originally reserved.
> 
> It seems like the codepaths where meta_ac != NULL also need to know
> about the list of extents available in et_dealloc.
> 
> The Perl code I used to hit this bug is attached.
> <directio.pl>
> _______________________________________________
> Ocfs2-devel mailing list
> Ocfs2-devel at oss.oracle.com
> https://oss.oracle.com/mailman/listinfo/ocfs2-devel

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Ocfs2-devel] [PATCH v2] ocfs2: try to reuse extent block in dealloc without meta_alloc
  2017-12-27 12:21   ` Changwei Ge
@ 2018-01-02  8:17     ` Junxiao Bi
  2018-01-03  2:27       ` Changwei Ge
  0 siblings, 1 reply; 13+ messages in thread
From: Junxiao Bi @ 2018-01-02  8:17 UTC (permalink / raw)
  To: ocfs2-devel

On 12/27/2017 08:21 PM, Changwei Ge wrote:
> Hi Junxiao,
> 
> On 2017/12/27 18:02, Junxiao Bi wrote:
>> Hi Changwei,
>>
>>
>> On 12/26/2017 03:55 PM, Changwei Ge wrote:
>>> A crash issue was reported by John.
>>> The call trace follows:
>>> ocfs2_split_extent+0x1ad3/0x1b40 [ocfs2]
>>> ocfs2_change_extent_flag+0x33a/0x470 [ocfs2]
>>> ocfs2_mark_extent_written+0x172/0x220 [ocfs2]
>>> ocfs2_dio_end_io+0x62d/0x910 [ocfs2]
>>> dio_complete+0x19a/0x1a0
>>> do_blockdev_direct_IO+0x19dd/0x1eb0
>>> __blockdev_direct_IO+0x43/0x50
>>> ocfs2_direct_IO+0x8f/0xa0 [ocfs2]
>>> generic_file_direct_write+0xb2/0x170
>>> __generic_file_write_iter+0xc3/0x1b0
>>> ocfs2_file_write_iter+0x4bb/0xca0 [ocfs2]
>>> __vfs_write+0xae/0xf0
>>> vfs_write+0xb8/0x1b0
>>> SyS_write+0x4f/0xb0
>>> system_call_fastpath+0x16/0x75
>>>
>>> The BUG code told that extent tree wants to grow but no metadata
>>> was reserved ahead of time.
>>>    From my investigation into this issue, the root cause it that although
>>> enough metadata is not reserved, there should be enough for following use.
>>> Rightmost extent is merged into its left one due to a certain times of
>>> marking extent written. Because during marking extent written, we got many
>>> physically continuous extents. At last, an empty extent showed up and the
>>> rightmost path is removed from extent tree.
>> I am trying to understand the issue. Quick questions.
>> Is this issue caused by BUG_ON(meta_ac == NULL)? Can you explain why it
>> is NULL?
> My pleasure to.
> Before marking extents written, we have to estimate how many metadata will be used.
> If there are enough metadata for following operation-marking extent written, no metadata
> will be reserved ahead of time.
> 
> For this BUG scenario, it happens that extent tree already has enough free metadata.
> This can be referred by code path:
> ocfs2_dio_end_io_write
>    ocfs2_lock_allocators - No need to reserve metadata,since extent tree already has more metadata than needed. So *mata_ac* is NULL.
>    ocfs2_mark_extent_written
>      ocfs2_change_extent_flag - cluster by cluster
>        ocfs2_split_extent
>          ocfs2_try_to_merge_extent - During filling file hole, we mark cluster as written one by one. Somehow, we merge those physically continuous cluster(WRITTEN) into a single one.
>            ocfs2_rotate_tree_left - rotate extent
>              __ocfs2_rotate_tree_left
>                ocfs2_rotate_subtree_left - Aha, we find a totally empty extent here, so unlink it from extent tree.
>                  ocfs2_unlink_subtree
>                  ocfs2_remove_empty_extent
>                  
> 
> Then, since we delete one extent block, our previously estimated metadata number is
> pointless(NOTE, actually the estimation is accurate.). Since it is reduced resulted by extent
> rotation and merging.
> So for now, we don't have enough metadata.
See, thanks for your detailed explanation.

> 
> Then we are still marking extent tree written for left clusters and we need to split extent.
Once one ocfs2_mark_extent_written() caused an extent block removed,
then next invoke to it may not have enought metadata, right?

> But we don't have enough metadata to accommodate the split extent records.
If above is right, can we invoke ocfs2_lock_allocators() to calculate
enough metadata before every invoke to ocfs2_mark_extent_written()?
That will be more simple than fixing by reuse dealloc.

Thanks,
Junxiao.

> So extent tree need to grow.
> BUG. no metadata is reserved ahead of time.
> 
> Thanks,
> Changwei
>            
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Ocfs2-devel] [PATCH v2] ocfs2: try to reuse extent block in dealloc without meta_alloc
  2018-01-02  8:17     ` Junxiao Bi
@ 2018-01-03  2:27       ` Changwei Ge
  0 siblings, 0 replies; 13+ messages in thread
From: Changwei Ge @ 2018-01-03  2:27 UTC (permalink / raw)
  To: ocfs2-devel

Hi Junxiao,

On 2018/1/2 16:18, Junxiao Bi wrote:
> On 12/27/2017 08:21 PM, Changwei Ge wrote:
>> Hi Junxiao,
>>
>> On 2017/12/27 18:02, Junxiao Bi wrote:
>>> Hi Changwei,
>>>
>>>
>>> On 12/26/2017 03:55 PM, Changwei Ge wrote:
>>>> A crash issue was reported by John.
>>>> The call trace follows:
>>>> ocfs2_split_extent+0x1ad3/0x1b40 [ocfs2]
>>>> ocfs2_change_extent_flag+0x33a/0x470 [ocfs2]
>>>> ocfs2_mark_extent_written+0x172/0x220 [ocfs2]
>>>> ocfs2_dio_end_io+0x62d/0x910 [ocfs2]
>>>> dio_complete+0x19a/0x1a0
>>>> do_blockdev_direct_IO+0x19dd/0x1eb0
>>>> __blockdev_direct_IO+0x43/0x50
>>>> ocfs2_direct_IO+0x8f/0xa0 [ocfs2]
>>>> generic_file_direct_write+0xb2/0x170
>>>> __generic_file_write_iter+0xc3/0x1b0
>>>> ocfs2_file_write_iter+0x4bb/0xca0 [ocfs2]
>>>> __vfs_write+0xae/0xf0
>>>> vfs_write+0xb8/0x1b0
>>>> SyS_write+0x4f/0xb0
>>>> system_call_fastpath+0x16/0x75
>>>>
>>>> The BUG code told that extent tree wants to grow but no metadata
>>>> was reserved ahead of time.
>>>>     From my investigation into this issue, the root cause it that although
>>>> enough metadata is not reserved, there should be enough for following use.
>>>> Rightmost extent is merged into its left one due to a certain times of
>>>> marking extent written. Because during marking extent written, we got many
>>>> physically continuous extents. At last, an empty extent showed up and the
>>>> rightmost path is removed from extent tree.
>>> I am trying to understand the issue. Quick questions.
>>> Is this issue caused by BUG_ON(meta_ac == NULL)? Can you explain why it
>>> is NULL?
>> My pleasure to.
>> Before marking extents written, we have to estimate how many metadata will be used.
>> If there are enough metadata for following operation-marking extent written, no metadata
>> will be reserved ahead of time.
>>
>> For this BUG scenario, it happens that extent tree already has enough free metadata.
>> This can be referred by code path:
>> ocfs2_dio_end_io_write
>>     ocfs2_lock_allocators - No need to reserve metadata,since extent tree already has more metadata than needed. So *mata_ac* is NULL.
>>     ocfs2_mark_extent_written
>>       ocfs2_change_extent_flag - cluster by cluster
>>         ocfs2_split_extent
>>           ocfs2_try_to_merge_extent - During filling file hole, we mark cluster as written one by one. Somehow, we merge those physically continuous cluster(WRITTEN) into a single one.
>>             ocfs2_rotate_tree_left - rotate extent
>>               __ocfs2_rotate_tree_left
>>                 ocfs2_rotate_subtree_left - Aha, we find a totally empty extent here, so unlink it from extent tree.
>>                   ocfs2_unlink_subtree
>>                   ocfs2_remove_empty_extent
>>                   
>>
>> Then, since we delete one extent block, our previously estimated metadata number is
>> pointless(NOTE, actually the estimation is accurate.). Since it is reduced resulted by extent
>> rotation and merging.
>> So for now, we don't have enough metadata.
> See, thanks for your detailed explanation.
> 
>>
>> Then we are still marking extent tree written for left clusters and we need to split extent.
> Once one ocfs2_mark_extent_written() caused an extent block removed,
> then next invoke to it may not have enought metadata, right?

Very true.

> 
>> But we don't have enough metadata to accommodate the split extent records.
> If above is right, can we invoke ocfs2_lock_allocators() to calculate
> enough metadata before every invoke to ocfs2_mark_extent_written()?
> That will be more simple than fixing by reuse dealloc.

That might be a way to fix this dio crash issue, too.
And Alex ever posted a patch to fix it like that way.

But I still prefer to fix it by reusing some extent blocks in dealloc.
I have a couple reasons:
1) Re-checking remaining metadata before invoking  ocfs2_mark_extent_written() will introduce
    overhead like wasting CPU cycles and more times accessing disk which will hurt ocfs2 performance very much.
    Specifically speaking, checking remaining metadata needs reload extent tree from disk. Allocating
    more extent block still need more time to modify _alloc_ on disk. But, actually, the space has already
    be allocated at the stage of _write_begin_.
2) That way makes estimating metadata pointless, which will make maintainer puzzle why we still need to estimate.
    IMO, we have to make the estimation robust and trusty.
3) It is hard to handle jbd2 part.

Moreover, I want to add a new common mechanism which can be used around similar scenario.

And I believe although my way is a little complicated but it is elegant.

That kind of problems can be solved once and for all.

Thanks,
Changwei

> 
> Thanks,
> Junxiao.
> 
>> So extent tree need to grow.
>> BUG. no metadata is reserved ahead of time.
>>
>> Thanks,
>> Changwei
>>             
>>
> 
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Ocfs2-devel] [PATCH v2] ocfs2: try to reuse extent block in dealloc without meta_alloc
  2017-12-29 22:36       ` Ge Changwei
@ 2018-01-07 21:07         ` John Lightsey
  2018-01-08  0:29           ` Changwei Ge
  0 siblings, 1 reply; 13+ messages in thread
From: John Lightsey @ 2018-01-07 21:07 UTC (permalink / raw)
  To: ocfs2-devel

On Fri, 2017-12-29 at 22:36 +0000, Ge Changwei wrote:
> Hi john,
> 
> Thanks for your test.
> I am on a vacation until 3, January.
> I will investigate the issue when I am back to work.
> 

From what I can tell, the order of reclaiming blocs vs allocating new
ones just needs to be reversed for everything to work correctly.

This patch on top of yours works in my testing.

I'm not certain my test scenario is hitting ocfs2_add_branch() with
new_blocks > 1 and meta_ac != NULL though.


From 317cbd5a7c8ee5f8f6dff3844d23d3169db990a4 Mon Sep 17 00:00:00 2001
From: John Lightsey <john@nixnuts.net>
Date: Sun, 7 Jan 2018 14:43:21 -0600
Subject: [PATCH] Reuse deallocated extents before claiming new extents.

By reusing deallocated extents before attempting to claim new
extents, this avoids a condition where the extent tree is truncated,
and then an allocation requests are made for more extents than were
originally reserved.
---
 fs/ocfs2/alloc.c | 26 +++++++++++++++++++-------
 fs/ocfs2/aops.c  |  3 +--
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 89807ea7..dce2eaa7 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1166,7 +1166,7 @@ static int ocfs2_add_branch(handle_t *handle,
 			    struct buffer_head **last_eb_bh,
 			    struct ocfs2_alloc_context *meta_ac)
 {
-	int status, new_blocks, i;
+	int status, new_blocks, reclaimed_blocks, i;
 	u64 next_blkno, new_last_eb_blk;
 	struct buffer_head *bh;
 	struct buffer_head **new_eb_bhs = NULL;
@@ -1222,8 +1222,20 @@ static int ocfs2_add_branch(handle_t *handle,
 	}
 
 	if (meta_ac) {
-		status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
-						   meta_ac, new_eb_bhs);
+		reclaimed_blocks = 0;
+		while ( reclaimed_blocks < new_blocks && !ocfs2_is_dealloc_empty(et) ) {
+			status = ocfs2_reuse_blk_from_dealloc(handle, et,
+					new_eb_bhs + reclaimed_blocks, 1);
+			if (status < 0) {
+				mlog_errno(status);
+				goto bail;
+			}
+			reclaimed_blocks++;
+		}
+		if (reclaimed_blocks < new_blocks) {
+			status = ocfs2_create_new_meta_bhs(handle, et, new_blocks - reclaimed_blocks,
+				meta_ac, new_eb_bhs + reclaimed_blocks);
+		}
 	} else if (!ocfs2_is_dealloc_empty(et)) {
 		status = ocfs2_reuse_blk_from_dealloc(handle, et,
 						      new_eb_bhs, new_blocks);
@@ -1362,12 +1374,12 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
 	struct ocfs2_extent_list  *root_el;
 	struct ocfs2_extent_list  *eb_el;
 
-	if (meta_ac) {
-		status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
-						   &new_eb_bh);
-	} else if (!ocfs2_is_dealloc_empty(et)) {
+	if (!ocfs2_is_dealloc_empty(et)) {
 		status = ocfs2_reuse_blk_from_dealloc(handle, et,
 						      &new_eb_bh, 1);
+	} else if (meta_ac) {
+		status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
+						   &new_eb_bh);
 	} else {
 		BUG();
 	}
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index dcea6d5c..8ad37965 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1743,8 +1743,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 		ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
 					      wc->w_di_bh);
 		ret = ocfs2_lock_allocators(inode, &et,
-					    clusters_to_alloc,
-					    2*extents_to_split,
+					    clusters_to_alloc, extents_to_split,
 					    &data_ac, &meta_ac);
 		if (ret) {
 			mlog_errno(ret);
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [Ocfs2-devel] [PATCH v2] ocfs2: try to reuse extent block in dealloc without meta_alloc
  2018-01-07 21:07         ` John Lightsey
@ 2018-01-08  0:29           ` Changwei Ge
  0 siblings, 0 replies; 13+ messages in thread
From: Changwei Ge @ 2018-01-08  0:29 UTC (permalink / raw)
  To: ocfs2-devel

Hi John,

Sorry for reply you so late since too busy these days.
Thanks for your contribution for this issue.

Thanks to the reproducer you provided, I have reproduced the crash issue you
reported.
The back trace was found.
ocfs2_mark_extent_written
  ocfs2_change_extent_flag
    ocfs2_split_extent
     ocfs2_split_and_insert
      ocfs2_grow_tree
       ocfs2_add_branch
        ocfs2_create_new_meta_bhs
         ocfs2_claim_metadata

I will post my v3 patch today.
My direction to fix above crash issue has something similar to yours.
What the difference is I revised ocfs2_reuse_blk_from_dealloc() adding a
new parameter to it.

Thanks,
Changwei

On 2018/1/8 5:08, John Lightsey wrote:
> On Fri, 2017-12-29 at 22:36 +0000, Ge Changwei wrote:
>> Hi john,
>>
>> Thanks for your test.
>> I am on a vacation until 3, January.
>> I will investigate the issue when I am back to work.
>>
> 
>  From what I can tell, the order of reclaiming blocs vs allocating new
> ones just needs to be reversed for everything to work correctly.
> 
> This patch on top of yours works in my testing.
> 
> I'm not certain my test scenario is hitting ocfs2_add_branch() with
> new_blocks > 1 and meta_ac != NULL though.
> 
> 
>  From 317cbd5a7c8ee5f8f6dff3844d23d3169db990a4 Mon Sep 17 00:00:00 2001
> From: John Lightsey <john@nixnuts.net>
> Date: Sun, 7 Jan 2018 14:43:21 -0600
> Subject: [PATCH] Reuse deallocated extents before claiming new extents.
> 
> By reusing deallocated extents before attempting to claim new
> extents, this avoids a condition where the extent tree is truncated,
> and then an allocation requests are made for more extents than were
> originally reserved.
> ---
>   fs/ocfs2/alloc.c | 26 +++++++++++++++++++-------
>   fs/ocfs2/aops.c  |  3 +--
>   2 files changed, 20 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
> index 89807ea7..dce2eaa7 100644
> --- a/fs/ocfs2/alloc.c
> +++ b/fs/ocfs2/alloc.c
> @@ -1166,7 +1166,7 @@ static int ocfs2_add_branch(handle_t *handle,
>   			    struct buffer_head **last_eb_bh,
>   			    struct ocfs2_alloc_context *meta_ac)
>   {
> -	int status, new_blocks, i;
> +	int status, new_blocks, reclaimed_blocks, i;
>   	u64 next_blkno, new_last_eb_blk;
>   	struct buffer_head *bh;
>   	struct buffer_head **new_eb_bhs = NULL;
> @@ -1222,8 +1222,20 @@ static int ocfs2_add_branch(handle_t *handle,
>   	}
>   
>   	if (meta_ac) {
> -		status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
> -						   meta_ac, new_eb_bhs);
> +		reclaimed_blocks = 0;
> +		while ( reclaimed_blocks < new_blocks && !ocfs2_is_dealloc_empty(et) ) {
> +			status = ocfs2_reuse_blk_from_dealloc(handle, et,
> +					new_eb_bhs + reclaimed_blocks, 1);
> +			if (status < 0) {
> +				mlog_errno(status);
> +				goto bail;
> +			}
> +			reclaimed_blocks++;
> +		}
> +		if (reclaimed_blocks < new_blocks) {
> +			status = ocfs2_create_new_meta_bhs(handle, et, new_blocks - reclaimed_blocks,
> +				meta_ac, new_eb_bhs + reclaimed_blocks);
> +		}
>   	} else if (!ocfs2_is_dealloc_empty(et)) {
>   		status = ocfs2_reuse_blk_from_dealloc(handle, et,
>   						      new_eb_bhs, new_blocks);
> @@ -1362,12 +1374,12 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
>   	struct ocfs2_extent_list  *root_el;
>   	struct ocfs2_extent_list  *eb_el;
>   
> -	if (meta_ac) {
> -		status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
> -						   &new_eb_bh);
> -	} else if (!ocfs2_is_dealloc_empty(et)) {
> +	if (!ocfs2_is_dealloc_empty(et)) {
>   		status = ocfs2_reuse_blk_from_dealloc(handle, et,
>   						      &new_eb_bh, 1);
> +	} else if (meta_ac) {
> +		status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
> +						   &new_eb_bh);
>   	} else {
>   		BUG();
>   	}
> diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
> index dcea6d5c..8ad37965 100644
> --- a/fs/ocfs2/aops.c
> +++ b/fs/ocfs2/aops.c
> @@ -1743,8 +1743,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
>   		ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
>   					      wc->w_di_bh);
>   		ret = ocfs2_lock_allocators(inode, &et,
> -					    clusters_to_alloc,
> -					    2*extents_to_split,
> +					    clusters_to_alloc, extents_to_split,
>   					    &data_ac, &meta_ac);
>   		if (ret) {
>   			mlog_errno(ret);
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2018-01-08  0:29 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-12-26  7:55 [Ocfs2-devel] [PATCH v2] ocfs2: try to reuse extent block in dealloc without meta_alloc Changwei Ge
2017-12-27  8:47 ` piaojun
2017-12-28  6:30   ` Changwei Ge
2017-12-27 10:01 ` Junxiao Bi
2017-12-27 12:21   ` Changwei Ge
2018-01-02  8:17     ` Junxiao Bi
2018-01-03  2:27       ` Changwei Ge
2017-12-28  6:21 ` alex chen
2017-12-28  6:38   ` Changwei Ge
2017-12-29 21:15     ` John Lightsey
2017-12-29 22:36       ` Ge Changwei
2018-01-07 21:07         ` John Lightsey
2018-01-08  0:29           ` Changwei Ge

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.