All of lore.kernel.org
 help / color / mirror / Atom feed
* [Ocfs2-devel] [PATCH 01/41] ocfs2: Define refcount tree structure.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 02/41] ocfs2: Add metaecc for ocfs2_refcount_block Tao Ma
                   ` (40 subsequent siblings)
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/ocfs2_fs.h |   88 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 85 insertions(+), 3 deletions(-)

diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 7ab6e9e..e4288b4 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -68,6 +68,7 @@
 #define OCFS2_DIR_TRAILER_SIGNATURE	"DIRTRL1"
 #define OCFS2_DX_ROOT_SIGNATURE		"DXDIR01"
 #define OCFS2_DX_LEAF_SIGNATURE		"DXLEAF1"
+#define OCFS2_REFCOUNT_BLOCK_SIGNATURE	"REFCNT1"
 
 /* Compatibility flags */
 #define OCFS2_HAS_COMPAT_FEATURE(sb,mask)			\
@@ -160,6 +161,9 @@
 /* Metadata checksum and error correction */
 #define OCFS2_FEATURE_INCOMPAT_META_ECC		0x0800
 
+/* Refcount tree support */
+#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE	0x1000
+
 /*
  * backup superblock flag is used to indicate that this volume
  * has backup superblocks.
@@ -223,6 +227,7 @@
 #define OCFS2_HAS_XATTR_FL	(0x0002)
 #define OCFS2_INLINE_XATTR_FL	(0x0004)
 #define OCFS2_INDEXED_DIR_FL	(0x0008)
+#define OCFS2_HAS_REFCOUNT_FL   (0x0010)
 
 /* Inode attributes, keep in sync with EXT2 */
 #define OCFS2_SECRM_FL		(0x00000001)	/* Secure deletion */
@@ -241,8 +246,11 @@
 /*
  * Extent record flags (e_node.leaf.flags)
  */
-#define OCFS2_EXT_UNWRITTEN	(0x01)	/* Extent is allocated but
-					 * unwritten */
+#define OCFS2_EXT_UNWRITTEN		(0x01)	/* Extent is allocated but
+						 * unwritten */
+#define OCFS2_EXT_REFCOUNTED		(0x02)  /* Extent is reference
+						 * counted in an associated
+						 * refcount tree */
 
 /*
  * ioctl commands
@@ -717,7 +725,8 @@ struct ocfs2_dinode {
 	__le64 i_xattr_loc;
 /*80*/	struct ocfs2_block_check i_check;	/* Error checking */
 /*88*/	__le64 i_dx_root;		/* Pointer to dir index root block */
-	__le64 i_reserved2[5];
+/*90*/	__le64 i_refcount_loc;
+	__le64 i_reserved2[4];
 /*B8*/	union {
 		__le64 i_pad1;		/* Generic way to refer to this
 					   64bit union */
@@ -901,6 +910,59 @@ struct ocfs2_group_desc
 /*40*/	__u8    bg_bitmap[0];
 };
 
+struct ocfs2_refcount_rec {
+/*00*/	__le64 r_cpos;		/* Physical offset, in clusters */
+	__le32 r_clusters;	/* Clusters covered by this extent */
+	__le32 r_refcount;	/* Reference count of this extent */
+/*10*/
+};
+
+#define OCFS2_REFCOUNT_LEAF_FL          (0x00000001)
+#define OCFS2_REFCOUNT_TREE_FL          (0x00000002)
+
+struct ocfs2_refcount_list {
+/*00*/	__le16 rl_count;	/* Maximum number of entries possible
+				   in rl_records */
+	__le16 rl_used;		/* Current number of used records */
+	__le32 rl_reserved2;
+	__le64 rl_reserved1;	/* Pad to sizeof(ocfs2_refcount_record) */
+/*10*/	struct ocfs2_refcount_rec rl_recs[0];	/* Refcount records */
+};
+
+
+struct ocfs2_refcount_block {
+/*00*/	__u8 rf_signature[8];		/* Signature for verification */
+	__le16 rf_suballoc_slot;	/* Slot suballocator this block
+					   belongs to */
+	__le16 rf_suballoc_bit;		/* Bit offset in suballocator
+					   block group */
+	__le32 rf_fs_generation;	/* Must match superblock */
+/*10*/	__le64 rf_blkno;		/* Offset on disk, in blocks */
+	__le64 rf_parent;		/* Parent block, only valid if
+					   OCFS2_REFCOUNT_LEAF_FL is set in
+					   rf_flags */
+/*20*/	struct ocfs2_block_check rf_check;	/* Error checking */
+	__le64 rf_last_eb_blk;		/* Pointer to last extent block */
+/*30*/	__le32 rf_count;		/* Number of inodes sharing this
+					   refcount tree */
+	__le32 rf_flags;		/* See the flags above */
+	__le32 rf_clusters;		/* clusters covered by refcount tree. */
+	__le32 rf_cpos;			/* cluster offset in refcount tree.*/
+/*40*/	__le32 rf_generation;		/* generation number. all be the same
+					 * for the same refcount tree. */
+	__le32 rf_reserved0;
+	__le64 rf_reserved1[7];
+/*80*/	union {
+		struct ocfs2_refcount_list rf_records;  /* List of refcount
+							  records */
+		struct ocfs2_extent_list rf_list;	/* Extent record list,
+							only valid if
+							OCFS2_REFCOUNT_TREE_FL
+							is set in rf_flags */
+	};
+/* Actual on-disk size is one block */
+};
+
 /*
  * On disk extended attribute structure for OCFS2.
  */
@@ -1312,6 +1374,26 @@ static inline u16 ocfs2_xattr_recs_per_xb(struct super_block *sb)
 
 	return size / sizeof(struct ocfs2_extent_rec);
 }
+
+static inline u16 ocfs2_extent_recs_per_rb(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_refcount_block, rf_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
+
+static inline u16 ocfs2_refcount_recs_per_rb(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_refcount_block, rf_records.rl_recs);
+
+	return size / sizeof(struct ocfs2_refcount_rec);
+}
 #else
 static inline int ocfs2_fast_symlink_chars(int blocksize)
 {
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 02/41] ocfs2: Add metaecc for ocfs2_refcount_block.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 01/41] ocfs2: Define refcount tree structure Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 03/41] ocfs2: Add ocfs2_read_refcount_block Tao Ma
                   ` (39 subsequent siblings)
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

Add metaecc and journal trigger for ocfs2_refcount_block.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/journal.c |   15 +++++++++++++++
 fs/ocfs2/journal.h |    3 +++
 2 files changed, 18 insertions(+), 0 deletions(-)

diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 5b6c0e4..54c16b6 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -555,6 +555,14 @@ static struct ocfs2_triggers eb_triggers = {
 	.ot_offset	= offsetof(struct ocfs2_extent_block, h_check),
 };
 
+static struct ocfs2_triggers rb_triggers = {
+	.ot_triggers = {
+		.t_commit = ocfs2_commit_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+	.ot_offset	= offsetof(struct ocfs2_refcount_block, rf_check),
+};
+
 static struct ocfs2_triggers gd_triggers = {
 	.ot_triggers = {
 		.t_commit = ocfs2_commit_trigger,
@@ -677,6 +685,13 @@ int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci,
 	return __ocfs2_journal_access(handle, ci, bh, &eb_triggers, type);
 }
 
+int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci,
+			    struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, ci, bh, &rb_triggers,
+				      type);
+}
+
 int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci,
 			    struct buffer_head *bh, int type)
 {
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 6163f28..b2dc125 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -281,6 +281,9 @@ int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci,
 /* ocfs2_extent_block */
 int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci,
 			    struct buffer_head *bh, int type);
+/* ocfs2_refcount_block */
+int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci,
+			    struct buffer_head *bh, int type);
 /* ocfs2_group_desc */
 int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci,
 			    struct buffer_head *bh, int type);
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 03/41] ocfs2: Add ocfs2_read_refcount_block.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 01/41] ocfs2: Define refcount tree structure Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 02/41] ocfs2: Add metaecc for ocfs2_refcount_block Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 04/41] ocfs2: Abstract caching info checkpoint Tao Ma
                   ` (38 subsequent siblings)
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/Makefile          |    1 +
 fs/ocfs2/cluster/masklog.c |    1 +
 fs/ocfs2/cluster/masklog.h |    1 +
 fs/ocfs2/ocfs2.h           |    3 +
 fs/ocfs2/refcounttree.c    |   99 ++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 105 insertions(+), 0 deletions(-)
 create mode 100644 fs/ocfs2/refcounttree.c

diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 0159607..31f25ce 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -28,6 +28,7 @@ ocfs2-objs := \
 	locks.o			\
 	mmap.o 			\
 	namei.o 		\
+	refcounttree.o		\
 	resize.o		\
 	slot_map.o 		\
 	suballoc.o 		\
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 96df541..1cd2934 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -111,6 +111,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
 	define_mask(EXPORT),
 	define_mask(XATTR),
 	define_mask(QUOTA),
+	define_mask(REFCOUNT),
 	define_mask(ERROR),
 	define_mask(NOTICE),
 	define_mask(KTHREAD),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 696c32e..9b4d117 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -113,6 +113,7 @@
 #define ML_EXPORT	0x0000000010000000ULL /* ocfs2 export operations */
 #define ML_XATTR	0x0000000020000000ULL /* ocfs2 extended attributes */
 #define ML_QUOTA	0x0000000040000000ULL /* ocfs2 quota operations */
+#define ML_REFCOUNT	0x0000000080000000ULL /* refcount tree operations */
 /* bits that are infrequently given and frequently matched in the high word */
 #define ML_ERROR	0x0000000100000000ULL /* sent to KERN_ERR */
 #define ML_NOTICE	0x0000000200000000ULL /* setn to KERN_NOTICE */
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index d370262..6688d19 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -610,6 +610,9 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
 #define OCFS2_IS_VALID_DX_LEAF(ptr)					\
 	(!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE))
 
+#define OCFS2_IS_VALID_REFCOUNT_BLOCK(ptr)				\
+	(!strcmp((ptr)->rf_signature, OCFS2_REFCOUNT_BLOCK_SIGNATURE))
+
 static inline unsigned long ino_from_blkno(struct super_block *sb,
 					   u64 blkno)
 {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
new file mode 100644
index 0000000..a923535
--- /dev/null
+++ b/fs/ocfs2/refcounttree.c
@@ -0,0 +1,99 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * refcounttree.c
+ *
+ * Copyright (C) 2009 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#define MLOG_MASK_PREFIX ML_REFCOUNT
+#include <cluster/masklog.h>
+#include "ocfs2.h"
+#include "inode.h"
+#include "alloc.h"
+#include "suballoc.h"
+#include "journal.h"
+#include "uptodate.h"
+#include "super.h"
+#include "buffer_head_io.h"
+#include "blockcheck.h"
+
+static int ocfs2_validate_refcount_block(struct super_block *sb,
+					 struct buffer_head *bh)
+{
+	int rc;
+	struct ocfs2_refcount_block *rb =
+		(struct ocfs2_refcount_block *)bh->b_data;
+
+	mlog(0, "Validating refcount block %llu\n",
+	     (unsigned long long)bh->b_blocknr);
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 */
+	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &rb->rf_check);
+	if (rc) {
+		mlog(ML_ERROR, "Checksum failed for refcount block %llu\n",
+		     (unsigned long long)bh->b_blocknr);
+		return rc;
+	}
+
+
+	if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) {
+		ocfs2_error(sb,
+			    "Refcount block #%llu has bad signature %.*s",
+			    (unsigned long long)bh->b_blocknr, 7,
+			    rb->rf_signature);
+		return -EINVAL;
+	}
+
+	if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) {
+		ocfs2_error(sb,
+			    "Refcount block #%llu has an invalid rf_blkno "
+			    "of %llu",
+			    (unsigned long long)bh->b_blocknr,
+			    (unsigned long long)le64_to_cpu(rb->rf_blkno));
+		return -EINVAL;
+	}
+
+	if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) {
+		ocfs2_error(sb,
+			    "Refcount block #%llu has an invalid "
+			    "rf_fs_generation of #%u",
+			    (unsigned long long)bh->b_blocknr,
+			    le32_to_cpu(rb->rf_fs_generation));
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci,
+				     u64 rb_blkno,
+				     struct buffer_head **bh)
+{
+	int rc;
+	struct buffer_head *tmp = *bh;
+
+	rc = ocfs2_read_block(ci, rb_blkno, &tmp,
+			      ocfs2_validate_refcount_block);
+
+	/* If ocfs2_read_block() got us a new bh, pass it up. */
+	if (!rc && !*bh)
+		*bh = tmp;
+
+	return rc;
+}
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 04/41] ocfs2: Abstract caching info checkpoint.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (2 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 03/41] ocfs2: Add ocfs2_read_refcount_block Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 05/41] ocfs2: Add new refcount tree lock resource in dlmglue Tao Ma
                   ` (37 subsequent siblings)
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

In meta downconvert, we need to checkpoint the metadata in an inode.
For refcount tree, we also need it. So abstract the process out.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/dlmglue.c |   18 +++++++++++++-----
 1 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index f518d1b..79db055 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3495,11 +3495,11 @@ out:
 	return UNBLOCK_CONTINUE;
 }
 
-static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
-					int new_level)
+static int ocfs2_ci_checkpointed(struct ocfs2_caching_info *ci,
+				 struct ocfs2_lock_res *lockres,
+				 int new_level)
 {
-	struct inode *inode = ocfs2_lock_res_inode(lockres);
-	int checkpointed = ocfs2_ci_fully_checkpointed(INODE_CACHE(inode));
+	int checkpointed = ocfs2_ci_fully_checkpointed(ci);
 
 	BUG_ON(new_level != DLM_LOCK_NL && new_level != DLM_LOCK_PR);
 	BUG_ON(lockres->l_level != DLM_LOCK_EX && !checkpointed);
@@ -3507,10 +3507,18 @@ static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
 	if (checkpointed)
 		return 1;
 
-	ocfs2_start_checkpoint(OCFS2_SB(inode->i_sb));
+	ocfs2_start_checkpoint(OCFS2_SB(ocfs2_metadata_cache_get_super(ci)));
 	return 0;
 }
 
+static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
+					int new_level)
+{
+	struct inode *inode = ocfs2_lock_res_inode(lockres);
+
+	return ocfs2_ci_checkpointed(INODE_CACHE(inode), lockres, new_level);
+}
+
 static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres)
 {
 	struct inode *inode = ocfs2_lock_res_inode(lockres);
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 05/41] ocfs2: Add new refcount tree lock resource in dlmglue.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (3 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 04/41] ocfs2: Abstract caching info checkpoint Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 06/41] ocfs2: Add caching info for refcount tree Tao Ma
                   ` (36 subsequent siblings)
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

refcount tree lock resource is used to protect refcount
tree read/write among multiple nodes.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/dlmglue.c      |   80 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/dlmglue.h      |    6 +++
 fs/ocfs2/ocfs2_lockid.h |    5 +++
 fs/ocfs2/refcounttree.h |   36 +++++++++++++++++++++
 4 files changed, 127 insertions(+), 0 deletions(-)
 create mode 100644 fs/ocfs2/refcounttree.h

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 79db055..bb2fc69 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -53,6 +53,7 @@
 #include "super.h"
 #include "uptodate.h"
 #include "quota.h"
+#include "refcounttree.h"
 
 #include "buffer_head_io.h"
 
@@ -110,6 +111,11 @@ static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
 
 static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres);
 
+static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres,
+					    int new_level);
+static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres,
+					 int blocking);
+
 #define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
 
 /* This aids in debugging situations where a bad LVB might be involved. */
@@ -278,6 +284,12 @@ static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = {
 	.flags		= LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB,
 };
 
+static struct ocfs2_lock_res_ops ocfs2_refcount_block_lops = {
+	.check_downconvert = ocfs2_check_refcount_downconvert,
+	.downconvert_worker = ocfs2_refcount_convert_worker,
+	.flags		= 0,
+};
+
 static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
 {
 	return lockres->l_type == OCFS2_LOCK_TYPE_META ||
@@ -306,6 +318,12 @@ static inline struct ocfs2_mem_dqinfo *ocfs2_lock_res_qinfo(struct ocfs2_lock_re
 	return (struct ocfs2_mem_dqinfo *)lockres->l_priv;
 }
 
+static inline struct ocfs2_refcount_tree *
+ocfs2_lock_res_refcount_tree(struct ocfs2_lock_res *res)
+{
+	return container_of(res, struct ocfs2_refcount_tree, rf_lockres);
+}
+
 static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres)
 {
 	if (lockres->l_ops->get_osb)
@@ -693,6 +711,17 @@ void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
 				   info);
 }
 
+void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres,
+				  struct ocfs2_super *osb, u64 ref_blkno,
+				  unsigned int generation)
+{
+	ocfs2_lock_res_init_once(lockres);
+	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_REFCOUNT, ref_blkno,
+			      generation, lockres->l_name);
+	ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_REFCOUNT,
+				   &ocfs2_refcount_block_lops, osb);
+}
+
 void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
 {
 	mlog_entry_void();
@@ -3648,6 +3677,26 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
 	return UNBLOCK_CONTINUE_POST;
 }
 
+static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres,
+					    int new_level)
+{
+	struct ocfs2_refcount_tree *tree =
+				ocfs2_lock_res_refcount_tree(lockres);
+
+	return ocfs2_ci_checkpointed(&tree->rf_ci, lockres, new_level);
+}
+
+static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres,
+					 int blocking)
+{
+	struct ocfs2_refcount_tree *tree =
+				ocfs2_lock_res_refcount_tree(lockres);
+
+	ocfs2_metadata_cache_purge(&tree->rf_ci);
+
+	return UNBLOCK_CONTINUE;
+}
+
 static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres)
 {
 	struct ocfs2_qinfo_lvb *lvb;
@@ -3760,6 +3809,37 @@ bail:
 	return status;
 }
 
+int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex)
+{
+	int status;
+	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+	struct ocfs2_lock_res *lockres = &ref_tree->rf_lockres;
+	struct ocfs2_super *osb = lockres->l_priv;
+
+
+	if (ocfs2_is_hard_readonly(osb))
+		return -EROFS;
+
+	if (ocfs2_mount_local(osb))
+		return 0;
+
+	status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
+	if (status < 0)
+		mlog_errno(status);
+
+	return status;
+}
+
+void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex)
+{
+	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+	struct ocfs2_lock_res *lockres = &ref_tree->rf_lockres;
+	struct ocfs2_super *osb = lockres->l_priv;
+
+	if (!ocfs2_mount_local(osb))
+		ocfs2_cluster_unlock(osb, lockres, level);
+}
+
 /*
  * This is the filesystem locking protocol.  It provides the lock handling
  * hooks for the underlying DLM.  It has a maximum version number.
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 7553836..d1ce48e 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -101,6 +101,9 @@ void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
 struct ocfs2_mem_dqinfo;
 void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
                                struct ocfs2_mem_dqinfo *info);
+void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres,
+				  struct ocfs2_super *osb, u64 ref_blkno,
+				  unsigned int generation);
 void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
 int ocfs2_create_new_inode_locks(struct inode *inode);
 int ocfs2_drop_inode_locks(struct inode *inode);
@@ -148,6 +151,9 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock);
 void ocfs2_file_unlock(struct file *file);
 int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex);
 void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex);
+struct ocfs2_refcount_tree;
+int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex);
+void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex);
 
 
 void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index fcdba09..c746087 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -49,6 +49,7 @@ enum ocfs2_lock_type {
 	OCFS2_LOCK_TYPE_QINFO,
 	OCFS2_LOCK_TYPE_NFS_SYNC,
 	OCFS2_LOCK_TYPE_ORPHAN_SCAN,
+	OCFS2_LOCK_TYPE_REFCOUNT,
 	OCFS2_NUM_LOCK_TYPES
 };
 
@@ -89,6 +90,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
 		case OCFS2_LOCK_TYPE_ORPHAN_SCAN:
 			c = 'P';
 			break;
+		case OCFS2_LOCK_TYPE_REFCOUNT:
+			c = 'T';
+			break;
 		default:
 			c = '\0';
 	}
@@ -109,6 +113,7 @@ static char *ocfs2_lock_type_strings[] = {
 	[OCFS2_LOCK_TYPE_FLOCK] = "Flock",
 	[OCFS2_LOCK_TYPE_QINFO] = "Quota",
 	[OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan",
+	[OCFS2_LOCK_TYPE_REFCOUNT] = "Refcount",
 };
 
 static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
new file mode 100644
index 0000000..9a3695c
--- /dev/null
+++ b/fs/ocfs2/refcounttree.h
@@ -0,0 +1,36 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * refcounttree.h
+ *
+ * Copyright (C) 2009 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#ifndef OCFS2_REFCOUNTTREE_H
+#define OCFS2_REFCOUNTTREE_H
+
+struct ocfs2_refcount_tree {
+	struct rb_node rf_node;
+	u64 rf_blkno;
+	u32 rf_generation;
+	struct rw_semaphore rf_sem;
+	struct ocfs2_lock_res rf_lockres;
+	struct kref rf_getcnt;
+	int rf_removed;
+
+	/* the following 4 fields are used by caching_info. */
+	struct ocfs2_caching_info rf_ci;
+	spinlock_t rf_lock;
+	struct mutex rf_io_mutex;
+	struct super_block *rf_sb;
+};
+
+#endif /* OCFS2_REFCOUNTTREE_H */
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 06/41] ocfs2: Add caching info for refcount tree.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (4 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 05/41] ocfs2: Add new refcount tree lock resource in dlmglue Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 07/41] ocfs2: Add refcount tree lock mechanism Tao Ma
                   ` (35 subsequent siblings)
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

refcount tree should use its own caching info so that when
we downconvert the refcount tree lock, we can drop all the
cached buffer head.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/refcounttree.c |   59 +++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 59 insertions(+), 0 deletions(-)

diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index a923535..eb0f4a0 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -26,6 +26,13 @@
 #include "super.h"
 #include "buffer_head_io.h"
 #include "blockcheck.h"
+#include "refcounttree.h"
+
+static inline struct ocfs2_refcount_tree *
+cache_info_to_refcount(struct ocfs2_caching_info *ci)
+{
+	return container_of(ci, struct ocfs2_refcount_tree, rf_ci);
+}
 
 static int ocfs2_validate_refcount_block(struct super_block *sb,
 					 struct buffer_head *bh)
@@ -97,3 +104,55 @@ static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci,
 
 	return rc;
 }
+
+static u64 ocfs2_refcount_cache_owner(struct ocfs2_caching_info *ci)
+{
+	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+	return rf->rf_blkno;
+}
+
+static struct super_block *
+ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci)
+{
+	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+	return rf->rf_sb;
+}
+
+static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci)
+{
+	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+	spin_lock(&rf->rf_lock);
+}
+
+static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci)
+{
+	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+	spin_unlock(&rf->rf_lock);
+}
+
+static void ocfs2_refcount_cache_io_lock(struct ocfs2_caching_info *ci)
+{
+	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+	mutex_lock(&rf->rf_io_mutex);
+}
+
+static void ocfs2_refcount_cache_io_unlock(struct ocfs2_caching_info *ci)
+{
+	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+	mutex_unlock(&rf->rf_io_mutex);
+}
+
+static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops = {
+	.co_owner		= ocfs2_refcount_cache_owner,
+	.co_get_super		= ocfs2_refcount_cache_get_super,
+	.co_cache_lock		= ocfs2_refcount_cache_lock,
+	.co_cache_unlock	= ocfs2_refcount_cache_unlock,
+	.co_io_lock		= ocfs2_refcount_cache_io_lock,
+	.co_io_unlock		= ocfs2_refcount_cache_io_unlock,
+};
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 07/41] ocfs2: Add refcount tree lock mechanism.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (5 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 06/41] ocfs2: Add caching info for refcount tree Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-19 23:25   ` Joel Becker
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 08/41] ocfs2: Basic tree root operation Tao Ma
                   ` (34 subsequent siblings)
  41 siblings, 1 reply; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

Create a tree lock named ocfs2_refcount_tree. It protects
all the read/write operation to the refcount tree. Also
it has its only caching_info so that we can protect our
own buffer_head among multiple nodes.

User must call ocfs2_lock_refcount_tree before his operation
on the tree and unlock it after that.

This tree is only indicated by the root refcount block number.
So create a rb-tree for it and store the root in ocfs2_super.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/ocfs2.h        |    4 +
 fs/ocfs2/refcounttree.c |  359 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/refcounttree.h |    7 +
 fs/ocfs2/super.c        |    5 +
 4 files changed, 375 insertions(+), 0 deletions(-)

diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 6688d19..bb53573 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -408,6 +408,10 @@ struct ocfs2_super
 
 	/* the group we used to allocate inodes. */
 	u64				osb_inode_alloc_group;
+
+	/* rb tree root for refcount lock. */
+	struct rb_root	osb_rf_lock_tree;
+	struct ocfs2_refcount_tree *osb_ref_tree_lru;
 };
 
 #define OCFS2_SB(sb)	    ((struct ocfs2_super *)(sb)->s_fs_info)
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index eb0f4a0..b6d356b 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -27,6 +27,7 @@
 #include "buffer_head_io.h"
 #include "blockcheck.h"
 #include "refcounttree.h"
+#include "dlmglue.h"
 
 static inline struct ocfs2_refcount_tree *
 cache_info_to_refcount(struct ocfs2_caching_info *ci)
@@ -156,3 +157,361 @@ static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops = {
 	.co_io_lock		= ocfs2_refcount_cache_io_lock,
 	.co_io_unlock		= ocfs2_refcount_cache_io_unlock,
 };
+
+static struct ocfs2_refcount_tree *
+ocfs2_find_refcount_tree(struct ocfs2_super *osb, u64 blkno)
+{
+	struct rb_node *n = osb->osb_rf_lock_tree.rb_node;
+	struct ocfs2_refcount_tree *tree = NULL;
+
+	while (n) {
+		tree = rb_entry(n, struct ocfs2_refcount_tree, rf_node);
+
+		if (blkno < tree->rf_blkno)
+			n = n->rb_left;
+		else if (blkno > tree->rf_blkno)
+			n = n->rb_right;
+		else
+			return tree;
+	}
+
+	return NULL;
+}
+
+/* osb_lock is already locked. */
+static void ocfs2_insert_refcount_tree(struct ocfs2_super *osb,
+				       struct ocfs2_refcount_tree *new)
+{
+	u64 rf_blkno = new->rf_blkno;
+	struct rb_node *parent = NULL;
+	struct rb_node **p = &osb->osb_rf_lock_tree.rb_node;
+	struct ocfs2_refcount_tree *tmp;
+
+	while (*p) {
+		parent = *p;
+
+		tmp = rb_entry(parent, struct ocfs2_refcount_tree,
+			       rf_node);
+
+		if (rf_blkno < tmp->rf_blkno)
+			p = &(*p)->rb_left;
+		else if (rf_blkno > tmp->rf_blkno)
+			p = &(*p)->rb_right;
+		else {
+			/* This should never happen! */
+			mlog(ML_ERROR, "Duplicate refcount block %llu found!\n",
+			     (unsigned long long)rf_blkno);
+			BUG();
+		}
+	}
+
+	rb_link_node(&new->rf_node, parent, p);
+	rb_insert_color(&new->rf_node, &osb->osb_rf_lock_tree);
+}
+
+static void ocfs2_free_refcount_tree(struct ocfs2_refcount_tree *tree)
+{
+	ocfs2_metadata_cache_exit(&tree->rf_ci);
+	ocfs2_simple_drop_lockres(OCFS2_SB(tree->rf_sb), &tree->rf_lockres);
+	ocfs2_lock_res_free(&tree->rf_lockres);
+	kfree(tree);
+}
+
+static inline void
+ocfs2_erase_refcount_tree_from_list_no_lock(struct ocfs2_super *osb,
+					struct ocfs2_refcount_tree *tree)
+{
+	rb_erase(&tree->rf_node, &osb->osb_rf_lock_tree);
+	if (osb->osb_ref_tree_lru && osb->osb_ref_tree_lru == tree)
+		osb->osb_ref_tree_lru = NULL;
+}
+
+static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb,
+					struct ocfs2_refcount_tree *tree)
+{
+	spin_lock(&osb->osb_lock);
+	ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
+	spin_unlock(&osb->osb_lock);
+}
+
+void ocfs2_kref_remove_refcount_tree(struct kref *kref)
+{
+	struct ocfs2_refcount_tree *tree =
+		container_of(kref, struct ocfs2_refcount_tree, rf_getcnt);
+
+	ocfs2_free_refcount_tree(tree);
+}
+
+static inline void
+ocfs2_refcount_tree_get(struct ocfs2_refcount_tree *tree)
+{
+	kref_get(&tree->rf_getcnt);
+}
+
+static inline void
+ocfs2_refcount_tree_put(struct ocfs2_refcount_tree *tree)
+{
+	kref_put(&tree->rf_getcnt, ocfs2_kref_remove_refcount_tree);
+}
+
+static inline void ocfs2_init_refcount_tree_ci(struct ocfs2_refcount_tree *new,
+					       struct super_block *sb)
+{
+	ocfs2_metadata_cache_init(&new->rf_ci, &ocfs2_refcount_caching_ops);
+	mutex_init(&new->rf_io_mutex);
+	new->rf_sb = sb;
+	spin_lock_init(&new->rf_lock);
+}
+
+static inline void ocfs2_init_refcount_tree_lock(struct ocfs2_super *osb,
+					struct ocfs2_refcount_tree *new,
+					u64 rf_blkno, u32 generation)
+{
+	init_rwsem(&new->rf_sem);
+	ocfs2_refcount_lock_res_init(&new->rf_lockres, osb,
+				     rf_blkno, generation);
+}
+
+static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno,
+				   struct ocfs2_refcount_tree **ret_tree)
+{
+	int ret = 0;
+	struct ocfs2_refcount_tree *tree, *new = NULL;
+	struct buffer_head *ref_root_bh = NULL;
+	struct ocfs2_refcount_block *ref_rb;
+
+	spin_lock(&osb->osb_lock);
+	if (osb->osb_ref_tree_lru &&
+	    osb->osb_ref_tree_lru->rf_blkno == rf_blkno)
+		tree = osb->osb_ref_tree_lru;
+	else
+		tree = ocfs2_find_refcount_tree(osb, rf_blkno);
+	if (tree)
+		goto out;
+
+	spin_unlock(&osb->osb_lock);
+
+	new = kzalloc(sizeof(struct ocfs2_refcount_tree), GFP_NOFS);
+	if (!new) {
+		ret = -ENOMEM;
+		return ret;
+	}
+
+	new->rf_blkno = rf_blkno;
+	kref_init(&new->rf_getcnt);
+	ocfs2_init_refcount_tree_ci(new, osb->sb);
+
+	/*
+	 * We need the generation to create the refcount tree lock and since
+	 * it isn't changed during the tree modification, we are safe here to
+	 * read without protection.
+	 * We also have to purge the cache after we create the lock since the
+	 * refcount block may have the stale data. It can only be trusted when
+	 * we hold the refcount lock.
+	 */
+	ret = ocfs2_read_refcount_block(&new->rf_ci, rf_blkno, &ref_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		ocfs2_metadata_cache_exit(&new->rf_ci);
+		kfree(new);
+		return ret;
+	}
+
+	ref_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+	new->rf_generation = le32_to_cpu(ref_rb->rf_generation);
+	ocfs2_init_refcount_tree_lock(osb, new, rf_blkno,
+				      new->rf_generation);
+	ocfs2_metadata_cache_purge(&new->rf_ci);
+
+	spin_lock(&osb->osb_lock);
+	tree = ocfs2_find_refcount_tree(osb, rf_blkno);
+	if (tree)
+		goto out;
+
+	ocfs2_insert_refcount_tree(osb, new);
+
+	tree = new;
+	new = NULL;
+
+out:
+	*ret_tree = tree;
+
+	osb->osb_ref_tree_lru = tree;
+
+	spin_unlock(&osb->osb_lock);
+
+	if (new)
+		ocfs2_free_refcount_tree(new);
+
+	brelse(ref_root_bh);
+	return ret;
+}
+
+static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno)
+{
+	int ret;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dinode *di;
+
+	ret = ocfs2_read_inode_block(inode, &di_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+	*ref_blkno = le64_to_cpu(di->i_refcount_loc);
+	brelse(di_bh);
+out:
+	return ret;
+}
+
+static int __ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
+				      struct ocfs2_refcount_tree *tree, int rw)
+{
+	int ret;
+
+	ret = ocfs2_refcount_lock(tree, rw);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (rw)
+		down_write(&tree->rf_sem);
+	else
+		down_read(&tree->rf_sem);
+
+out:
+	return ret;
+}
+
+/*
+ * Lock the refcount tree pointed by ref_blkno and return the tree.
+ * In most case, we lock the tree and read the refcount block.
+ * So read it here if the caller really need it.
+ *
+ * If the tree has been re-created by other node, it will free the
+ * old one and re-create it.
+ */
+int ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
+			     u64 ref_blkno, int rw,
+			     struct ocfs2_refcount_tree **ret_tree,
+			     struct buffer_head **ref_bh)
+{
+	int ret, delete_tree = 0;
+	struct ocfs2_refcount_tree *tree = NULL;
+	struct buffer_head *ref_root_bh = NULL;
+	struct ocfs2_refcount_block *rb;
+
+again:
+	ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	ocfs2_refcount_tree_get(tree);
+
+	ret = __ocfs2_lock_refcount_tree(osb, tree, rw);
+	if (ret) {
+		mlog_errno(ret);
+		ocfs2_refcount_tree_put(tree);
+		goto out;
+	}
+
+	ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
+					&ref_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		ocfs2_unlock_refcount_tree(osb, tree, rw);
+		ocfs2_refcount_tree_put(tree);
+		goto out;
+	}
+
+	rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+	/*
+	 * If the refcount block has been freed and re-created, we may need
+	 * to recreate the refcount tree also.
+	 *
+	 * Here we just remove the tree from the rb-tree, and the last
+	 * kref holder will unlock and delete this refcount_tree.
+	 * Then we goto "again" and ocfs2_get_refcount_tree will create
+	 * the new refcount tree for us.
+	 */
+	if (tree->rf_generation != le32_to_cpu(rb->rf_generation)) {
+		if (!tree->rf_removed) {
+			ocfs2_erase_refcount_tree_from_list(osb, tree);
+			tree->rf_removed = 1;
+			delete_tree = 1;
+		}
+
+		ocfs2_unlock_refcount_tree(osb, tree, rw);
+		/*
+		 * We get an extra reference when we create the refcount
+		 * tree, so another put will destroy it.
+		 */
+		if (delete_tree)
+			ocfs2_refcount_tree_put(tree);
+		brelse(ref_root_bh);
+		ref_root_bh = NULL;
+		goto again;
+	}
+
+	*ret_tree = tree;
+	if (ref_bh) {
+		*ref_bh = ref_root_bh;
+		ref_root_bh = NULL;
+	}
+out:
+	brelse(ref_root_bh);
+	return ret;
+}
+
+int ocfs2_lock_refcount_tree_by_inode(struct inode *inode, int rw,
+				      struct ocfs2_refcount_tree **ret_tree,
+				      struct buffer_head **ref_bh)
+{
+	int ret;
+	u64 ref_blkno;
+
+	ret = ocfs2_get_refcount_block(inode, &ref_blkno);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	return ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno,
+					rw, ret_tree, ref_bh);
+}
+
+void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
+				struct ocfs2_refcount_tree *tree, int rw)
+{
+	if (rw)
+		up_write(&tree->rf_sem);
+	else
+		up_read(&tree->rf_sem);
+
+	ocfs2_refcount_unlock(tree, rw);
+	ocfs2_refcount_tree_put(tree);
+}
+
+void ocfs2_purge_refcount_tree(struct ocfs2_super *osb)
+{
+	struct rb_node *node;
+	struct ocfs2_refcount_tree *tree;
+	struct rb_root *root = &osb->osb_rf_lock_tree;
+
+	while ((node = rb_last(root)) != NULL) {
+		tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node);
+
+		mlog(0, "Purge tree %llu\n",
+		     (unsigned long long) tree->rf_blkno);
+
+		rb_erase(&tree->rf_node, root);
+		ocfs2_free_refcount_tree(tree);
+	}
+}
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 9a3695c..3c38bbd 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -33,4 +33,11 @@ struct ocfs2_refcount_tree {
 	struct super_block *rf_sb;
 };
 
+void ocfs2_purge_refcount_tree(struct ocfs2_super *osb);
+int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, u64 ref_blkno, int rw,
+			     struct ocfs2_refcount_tree **tree,
+			     struct buffer_head **ref_bh);
+void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
+				struct ocfs2_refcount_tree *tree,
+				int rw);
 #endif /* OCFS2_REFCOUNTTREE_H */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index e35a505..4edc298 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -69,6 +69,7 @@
 #include "ver.h"
 #include "xattr.h"
 #include "quota.h"
+#include "refcounttree.h"
 
 #include "buffer_head_io.h"
 
@@ -1858,6 +1859,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 
 	ocfs2_sync_blockdev(sb);
 
+	ocfs2_purge_refcount_tree(osb);
+
 	/* No cluster connection means we've failed during mount, so skip
 	 * all the steps which depended on that to complete. */
 	if (osb->cconn) {
@@ -2064,6 +2067,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
 		goto bail;
 	}
 
+	osb->osb_rf_lock_tree = RB_ROOT;
+
 	osb->s_feature_compat =
 		le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat);
 	osb->s_feature_ro_compat =
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 08/41] ocfs2: Basic tree root operation.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (6 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 07/41] ocfs2: Add refcount tree lock mechanism Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-19 23:30   ` Joel Becker
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 09/41] ocfs2: Wrap ocfs2_extent_contig in ocfs2_extent_tree Tao Ma
                   ` (33 subsequent siblings)
  41 siblings, 1 reply; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

Add basic refcount tree root operation.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/journal.h      |   14 ++
 fs/ocfs2/refcounttree.c |  343 ++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 351 insertions(+), 6 deletions(-)

diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index b2dc125..bd88c8b 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -490,6 +490,20 @@ static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb)
 	return credits;
 }
 
+/* inode update, new refcount block and its allocation credits. */
+#define OCFS2_REFCOUNT_TREE_CREATE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1 \
+					    + OCFS2_SUBALLOC_ALLOC)
+
+/* inode and the refcount block update. */
+#define OCFS2_REFCOUNT_TREE_SET_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
+
+/*
+ * inode and the refcount block update.
+ * It doesn't include the credits for sub alloc change.
+ * So if we need to free the bit, OCFS2_SUBALLOC_FREE needs to be added.
+ */
+#define OCFS2_REFCOUNT_TREE_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
+
 /*
  * Please note that the caller must make sure that root_el is the root
  * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index b6d356b..650347b 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -27,6 +27,7 @@
 #include "buffer_head_io.h"
 #include "blockcheck.h"
 #include "refcounttree.h"
+#include "sysfile.h"
 #include "dlmglue.h"
 
 static inline struct ocfs2_refcount_tree *
@@ -272,6 +273,22 @@ static inline void ocfs2_init_refcount_tree_lock(struct ocfs2_super *osb,
 				     rf_blkno, generation);
 }
 
+static struct ocfs2_refcount_tree*
+ocfs2_allocate_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno)
+{
+	struct ocfs2_refcount_tree *new;
+
+	new = kzalloc(sizeof(struct ocfs2_refcount_tree), GFP_NOFS);
+	if (!new)
+		return NULL;
+
+	new->rf_blkno = rf_blkno;
+	kref_init(&new->rf_getcnt);
+	ocfs2_init_refcount_tree_ci(new, osb->sb);
+
+	return new;
+}
+
 static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno,
 				   struct ocfs2_refcount_tree **ret_tree)
 {
@@ -291,16 +308,12 @@ static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno,
 
 	spin_unlock(&osb->osb_lock);
 
-	new = kzalloc(sizeof(struct ocfs2_refcount_tree), GFP_NOFS);
+	new = ocfs2_allocate_refcount_tree(osb, rf_blkno);
 	if (!new) {
 		ret = -ENOMEM;
+		mlog_errno(ret);
 		return ret;
 	}
-
-	new->rf_blkno = rf_blkno;
-	kref_init(&new->rf_getcnt);
-	ocfs2_init_refcount_tree_ci(new, osb->sb);
-
 	/*
 	 * We need the generation to create the refcount tree lock and since
 	 * it isn't changed during the tree modification, we are safe here to
@@ -515,3 +528,321 @@ void ocfs2_purge_refcount_tree(struct ocfs2_super *osb)
 		ocfs2_free_refcount_tree(tree);
 	}
 }
+
+/*
+ * Create a refcount tree for an inode.
+ * We take for granted that the inode is already locked.
+ */
+static int ocfs2_create_refcount_tree(struct inode *inode,
+				      struct buffer_head *di_bh)
+{
+	int ret;
+	handle_t *handle = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *new_bh = NULL;
+	struct ocfs2_refcount_block *rb;
+	struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL;
+	u16 suballoc_bit_start;
+	u32 num_got;
+	u64 first_blkno;
+
+	BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
+
+	mlog(0, "create tree for inode %lu\n", inode->i_ino);
+
+	ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_CREATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
+				   &suballoc_bit_start, &num_got,
+				   &first_blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	new_tree = ocfs2_allocate_refcount_tree(osb, first_blkno);
+	if (!new_tree) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	new_bh = sb_getblk(inode->i_sb, first_blkno);
+	ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh);
+
+	ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	/* Initialize ocfs2_refcount_block. */
+	rb = (struct ocfs2_refcount_block *)new_bh->b_data;
+	memset(rb, 0, inode->i_sb->s_blocksize);
+	strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
+	rb->rf_suballoc_slot = cpu_to_le16(osb->slot_num);
+	rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+	rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
+	rb->rf_blkno = cpu_to_le64(first_blkno);
+	rb->rf_count = cpu_to_le32(1);
+	rb->rf_records.rl_count =
+			cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb));
+	spin_lock(&osb->osb_lock);
+	rb->rf_generation = osb->s_next_generation++;
+	spin_unlock(&osb->osb_lock);
+
+	ocfs2_journal_dirty(handle, new_bh);
+
+	spin_lock(&oi->ip_lock);
+	oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
+	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+	di->i_refcount_loc = cpu_to_le64(first_blkno);
+	spin_unlock(&oi->ip_lock);
+
+	mlog(0, "created tree for inode %lu, refblock %llu\n",
+	     inode->i_ino, (unsigned long long)first_blkno);
+
+	ocfs2_journal_dirty(handle, di_bh);
+
+	/*
+	 * We have to init the tree lock here since it will use
+	 * the generation number to create it.
+	 */
+	new_tree->rf_generation = le32_to_cpu(rb->rf_generation);
+	ocfs2_init_refcount_tree_lock(osb, new_tree, first_blkno,
+				      new_tree->rf_generation);
+
+	spin_lock(&osb->osb_lock);
+	tree = ocfs2_find_refcount_tree(osb, first_blkno);
+
+	/*
+	 * The tree has been deleted and recreated. So remove the old
+	 * tree in this node.
+	 */
+	BUG_ON(tree && tree->rf_generation == new_tree->rf_generation);
+	if (tree)
+		ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
+	ocfs2_insert_refcount_tree(osb, new_tree);
+	spin_unlock(&osb->osb_lock);
+	new_tree = NULL;
+	if (tree)
+		ocfs2_refcount_tree_put(tree);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out:
+	if (new_tree) {
+		ocfs2_metadata_cache_exit(&new_tree->rf_ci);
+		kfree(new_tree);
+	}
+
+	brelse(new_bh);
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	return ret;
+}
+
+static int ocfs2_set_refcount_tree(struct inode *inode,
+				   struct buffer_head *di_bh,
+				   u64 refcount_loc)
+{
+	int ret;
+	handle_t *handle = NULL;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *ref_root_bh = NULL;
+	struct ocfs2_refcount_block *rb;
+	struct ocfs2_refcount_tree *ref_tree;
+
+	BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
+
+	ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
+				       &ref_tree, &ref_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_SET_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, ref_root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+	le32_add_cpu(&rb->rf_count, 1);
+
+	ocfs2_journal_dirty(handle, ref_root_bh);
+
+	spin_lock(&oi->ip_lock);
+	oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
+	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+	di->i_refcount_loc = cpu_to_le64(refcount_loc);
+	spin_unlock(&oi->ip_lock);
+	ocfs2_journal_dirty(handle, di_bh);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+	brelse(ref_root_bh);
+
+	return ret;
+}
+
+int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
+{
+	int ret, delete_tree = 0;
+	handle_t *handle = NULL;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_refcount_block *rb;
+	struct inode *alloc_inode = NULL;
+	struct buffer_head *alloc_bh = NULL;
+	struct buffer_head *blk_bh = NULL;
+	struct ocfs2_refcount_tree *ref_tree;
+	int credits = OCFS2_REFCOUNT_TREE_REMOVE_CREDITS;
+	u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc);
+	u16 bit = 0;
+
+	if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL))
+		return 0;
+
+	BUG_ON(!ref_blkno);
+	ret = ocfs2_lock_refcount_tree(osb, ref_blkno, 1, &ref_tree, &blk_bh);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	rb = (struct ocfs2_refcount_block *)blk_bh->b_data;
+
+	/*
+	 * If we are the last user, we need to free the block.
+	 * So lock the allocator ahead.
+	 */
+	if (le32_to_cpu(rb->rf_count) == 1) {
+		blk = le64_to_cpu(rb->rf_blkno);
+		bit = le16_to_cpu(rb->rf_suballoc_bit);
+		bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+
+		alloc_inode = ocfs2_get_system_file_inode(osb,
+					EXTENT_ALLOC_SYSTEM_INODE,
+					le16_to_cpu(rb->rf_suballoc_slot));
+		if (!alloc_inode) {
+			ret = -ENOMEM;
+			mlog_errno(ret);
+			goto out;
+		}
+		mutex_lock(&alloc_inode->i_mutex);
+
+		ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_mutex;
+		}
+
+		credits += OCFS2_SUBALLOC_FREE;
+	}
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, blk_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	spin_lock(&oi->ip_lock);
+	oi->ip_dyn_features &= ~OCFS2_HAS_REFCOUNT_FL;
+	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+	di->i_refcount_loc = 0;
+	spin_unlock(&oi->ip_lock);
+	ocfs2_journal_dirty(handle, di_bh);
+
+	le32_add_cpu(&rb->rf_count , -1);
+	ocfs2_journal_dirty(handle, blk_bh);
+
+	if (!rb->rf_count) {
+		delete_tree = 1;
+		ocfs2_erase_refcount_tree_from_list(osb, ref_tree);
+		ret = ocfs2_free_suballoc_bits(handle, alloc_inode,
+					       alloc_bh, bit, bg_blkno, 1);
+		if (ret)
+			mlog_errno(ret);
+	}
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out_unlock:
+	if (alloc_inode) {
+		ocfs2_inode_unlock(alloc_inode, 1);
+		brelse(alloc_bh);
+	}
+out_mutex:
+	if (alloc_inode) {
+		mutex_unlock(&alloc_inode->i_mutex);
+		iput(alloc_inode);
+	}
+out:
+	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+	if (delete_tree)
+		ocfs2_refcount_tree_put(ref_tree);
+	brelse(blk_bh);
+
+	return ret;
+}
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 09/41] ocfs2: Wrap ocfs2_extent_contig in ocfs2_extent_tree.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (7 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 08/41] ocfs2: Basic tree root operation Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 10/41] ocfs2: Abstract extent split process Tao Ma
                   ` (32 subsequent siblings)
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

Add a new operation eo_ocfs2_extent_contig int the extent tree's
operations vector. So that with the new refcount tree, We want
this so that refcount trees can always return CONTIG_NONE and
prevent extent merging.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/alloc.c |   57 ++++++++++++++++++++++++++++++++++++++---------------
 1 files changed, 41 insertions(+), 16 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index ab4d2b5..75e65df 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -52,7 +52,17 @@
 
 #include "buffer_head_io.h"
 
+enum ocfs2_contig_type {
+	CONTIG_NONE = 0,
+	CONTIG_LEFT,
+	CONTIG_RIGHT,
+	CONTIG_LEFTRIGHT,
+};
 
+static enum ocfs2_contig_type
+	ocfs2_extent_rec_contig(struct super_block *sb,
+				struct ocfs2_extent_rec *ext,
+				struct ocfs2_extent_rec *insert_rec);
 /*
  * Operations for a specific extent tree type.
  *
@@ -122,6 +132,16 @@ struct ocfs2_extent_tree_operations {
 	 * to 0 (unlimited).  Optional.
 	 */
 	void (*eo_fill_max_leaf_clusters)(struct ocfs2_extent_tree *et);
+
+	/*
+	 * ->eo_extent_contig test whether the 2 ocfs2_extent_rec
+	 * are contiguous or not. Optional. Don't need to set it if use
+	 * ocfs2_extent_rec as the tree leaf.
+	 */
+	enum ocfs2_contig_type
+		(*eo_extent_contig)(struct ocfs2_extent_tree *et,
+				    struct ocfs2_extent_rec *ext,
+				    struct ocfs2_extent_rec *insert_rec);
 };
 
 
@@ -458,6 +478,19 @@ static inline int ocfs2_et_root_journal_access(handle_t *handle,
 					  type);
 }
 
+static inline enum ocfs2_contig_type
+	ocfs2_et_extent_contig(struct ocfs2_extent_tree *et,
+			       struct ocfs2_extent_rec *rec,
+			       struct ocfs2_extent_rec *insert_rec)
+{
+	if (et->et_ops->eo_extent_contig)
+		return et->et_ops->eo_extent_contig(et, rec, insert_rec);
+
+	return ocfs2_extent_rec_contig(
+				ocfs2_metadata_cache_get_super(et->et_ci),
+				rec, insert_rec);
+}
+
 static inline int ocfs2_et_insert_check(struct ocfs2_extent_tree *et,
 					struct ocfs2_extent_rec *rec)
 {
@@ -736,17 +769,9 @@ int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
 	return ret;
 }
 
-enum ocfs2_contig_type {
-	CONTIG_NONE = 0,
-	CONTIG_LEFT,
-	CONTIG_RIGHT,
-	CONTIG_LEFTRIGHT,
-};
-
-
 /*
  * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
- * ocfs2_extent_contig only work properly against leaf nodes!
+ * ocfs2_extent_rec_contig only work properly against leaf nodes!
  */
 static int ocfs2_block_extent_contig(struct super_block *sb,
 				     struct ocfs2_extent_rec *ext,
@@ -772,9 +797,9 @@ static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
 }
 
 static enum ocfs2_contig_type
-	ocfs2_extent_contig(struct super_block *sb,
-			    struct ocfs2_extent_rec *ext,
-			    struct ocfs2_extent_rec *insert_rec)
+	ocfs2_extent_rec_contig(struct super_block *sb,
+				struct ocfs2_extent_rec *ext,
+				struct ocfs2_extent_rec *insert_rec)
 {
 	u64 blkno = le64_to_cpu(insert_rec->e_blkno);
 
@@ -4400,7 +4425,7 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
 			if (split_rec->e_cpos == el->l_recs[index].e_cpos)
 				ret = CONTIG_RIGHT;
 		} else {
-			ret = ocfs2_extent_contig(sb, rec, split_rec);
+			ret = ocfs2_et_extent_contig(et, rec, split_rec);
 		}
 	}
 
@@ -4445,7 +4470,7 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
 	if (rec) {
 		enum ocfs2_contig_type contig_type;
 
-		contig_type = ocfs2_extent_contig(sb, rec, split_rec);
+		contig_type = ocfs2_et_extent_contig(et, rec, split_rec);
 
 		if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
 			ret = CONTIG_LEFTRIGHT;
@@ -4473,8 +4498,8 @@ static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et,
 	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
 
 	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
-		contig_type = ocfs2_extent_contig(ocfs2_metadata_cache_get_super(et->et_ci),
-						  &el->l_recs[i], insert_rec);
+		contig_type = ocfs2_et_extent_contig(et, &el->l_recs[i],
+						     insert_rec);
 		if (contig_type != CONTIG_NONE) {
 			insert->ins_contig_index = i;
 			break;
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4
@ 2009-08-18  6:19 Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 01/41] ocfs2: Define refcount tree structure Tao Ma
                   ` (41 more replies)
  0 siblings, 42 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

Hi all,
     So I have finally finished the v4 of reflink for ocfs2. Great 
thanks for Joel's review.

[View]
http://oss.oracle.com/git/?p=tma/linux-2.6.git;a=shortlog;h=refcount
[Pull]
git://oss.oracle.com/git/tma/linux-2.6.git refcount

The general information for reflink, please see
http://oss.oracle.com/osswiki/OCFS2/DesignDocs/Reflink.

For the design doc, please see
http://oss.oracle.com/osswiki/OCFS2/DesignDocs/RefcountTrees
http://oss.oracle.com/osswiki/OCFS2/DesignDocs/ReflinkOperation
http://oss.oracle.com/osswiki/OCFS2/DesignDocs/ReflinkUses

The patch set is based on Joel's merge-window.

Enjoy it.

Regards,
Tao

^ permalink raw reply	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 10/41] ocfs2: Abstract extent split process.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (8 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 09/41] ocfs2: Wrap ocfs2_extent_contig in ocfs2_extent_tree Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 11/41] ocfs2: Add refcount b-tree as a new extent tree Tao Ma
                   ` (31 subsequent siblings)
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

ocfs2_mark_extent_written actually does the following things:
1. check the parameters.
2. initialize the left_path and split_rec.
3. call __ocfs2_mark_extent_written. it will do:
   1) check the flags of unwritten
   2) do the real split work.
The whole process is packed tightly somehow. So this patch
will abstract 2 different functions so that future b-tree
operation can work with it.

1. __ocfs2_split_extent will accept path and split_rec and do
  the real split work.
2. ocfs2_change_extent_flag will accept a new flag and initialize
   path and split_rec.

So now ocfs2_mark_extent_written will do:
1. check the parameters.
2. call ocfs2_change_extent_flag.
   1) initalize the left_path and split_rec.
   2) check whether the new flags conflict with the old one.
   3) call __ocfs2_split_extent to do the split.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/alloc.c |  150 ++++++++++++++++++++++++++++++++++++------------------
 1 files changed, 100 insertions(+), 50 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 75e65df..14b9106 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5032,9 +5032,8 @@ out:
 }
 
 /*
- * Mark part or all of the extent record at split_index in the leaf
- * pointed to by path as written. This removes the unwritten
- * extent flag.
+ * Split part or all of the extent record at split_index in the leaf
+ * pointed to by path. Merge with the contiguous extent record if needed.
  *
  * Care is taken to handle contiguousness so as to not grow the tree.
  *
@@ -5051,13 +5050,13 @@ out:
  * have been brought into cache (and pinned via the journal), so the
  * extra overhead is not expressed in terms of disk reads.
  */
-static int __ocfs2_mark_extent_written(handle_t *handle,
-				       struct ocfs2_extent_tree *et,
-				       struct ocfs2_path *path,
-				       int split_index,
-				       struct ocfs2_extent_rec *split_rec,
-				       struct ocfs2_alloc_context *meta_ac,
-				       struct ocfs2_cached_dealloc_ctxt *dealloc)
+static int __ocfs2_split_extent(handle_t *handle,
+				struct ocfs2_extent_tree *et,
+				struct ocfs2_path *path,
+				int split_index,
+				struct ocfs2_extent_rec *split_rec,
+				struct ocfs2_alloc_context *meta_ac,
+				struct ocfs2_cached_dealloc_ctxt *dealloc)
 {
 	int ret = 0;
 	struct ocfs2_extent_list *el = path_leaf_el(path);
@@ -5066,12 +5065,6 @@ static int __ocfs2_mark_extent_written(handle_t *handle,
 	struct ocfs2_merge_ctxt ctxt;
 	struct ocfs2_extent_list *rightmost_el;
 
-	if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
-		ret = -EIO;
-		mlog_errno(ret);
-		goto out;
-	}
-
 	if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
 	    ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
 	     (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
@@ -5141,42 +5134,31 @@ out:
 }
 
 /*
- * Mark the already-existing extent at cpos as written for len clusters.
+ * Change the flags of the already-existing extent at cpos for len clusters.
+ *
+ * new_flags: the flags we want to set.
+ * clear_flags: the flags we want to clear.
+ * phys: the new physical offset we want this new extent starts from.
  *
  * If the existing extent is larger than the request, initiate a
  * split. An attempt will be made at merging with adjacent extents.
  *
  * The caller is responsible for passing down meta_ac if we'll need it.
  */
-int ocfs2_mark_extent_written(struct inode *inode,
-			      struct ocfs2_extent_tree *et,
-			      handle_t *handle, u32 cpos, u32 len, u32 phys,
-			      struct ocfs2_alloc_context *meta_ac,
-			      struct ocfs2_cached_dealloc_ctxt *dealloc)
+static int ocfs2_change_extent_flag(handle_t *handle,
+				    struct ocfs2_extent_tree *et,
+				    u32 cpos, u32 len, u32 phys,
+				    struct ocfs2_alloc_context *meta_ac,
+				    struct ocfs2_cached_dealloc_ctxt *dealloc,
+				    int new_flags, int clear_flags)
 {
 	int ret, index;
-	u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
+	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
+	u64 start_blkno = ocfs2_clusters_to_blocks(sb, phys);
 	struct ocfs2_extent_rec split_rec;
 	struct ocfs2_path *left_path = NULL;
 	struct ocfs2_extent_list *el;
-
-	mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
-	     inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
-
-	if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
-		ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
-			    "that are being written to, but the feature bit "
-			    "is not set in the super block.",
-			    (unsigned long long)OCFS2_I(inode)->ip_blkno);
-		ret = -EROFS;
-		goto out;
-	}
-
-	/*
-	 * XXX: This should be fixed up so that we just re-insert the
-	 * next extent records.
-	 */
-	ocfs2_et_extent_map_truncate(et, 0);
+	struct ocfs2_extent_rec *rec;
 
 	left_path = ocfs2_new_path_from_et(et);
 	if (!left_path) {
@@ -5194,30 +5176,98 @@ int ocfs2_mark_extent_written(struct inode *inode,
 
 	index = ocfs2_search_extent_list(el, cpos);
 	if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
-		ocfs2_error(inode->i_sb,
-			    "Inode %llu has an extent at cpos %u which can no "
+		ocfs2_error(sb,
+			    "Owner %llu has an extent at cpos %u which can no "
 			    "longer be found.\n",
-			    (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
+			     (unsigned long long)
+			     ocfs2_metadata_cache_owner(et->et_ci), cpos);
 		ret = -EROFS;
 		goto out;
 	}
 
+	ret = -EIO;
+	rec = &el->l_recs[index];
+	if (new_flags && (rec->e_flags & new_flags)) {
+		mlog(ML_ERROR, "Owner %llu tried to set %d flags on an "
+		     "extent that already had them",
+		     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+		     new_flags);
+		goto out;
+	}
+
+	if (clear_flags && !(rec->e_flags & clear_flags)) {
+		mlog(ML_ERROR, "Owner %llu tried to clear %d flags on an "
+		     "extent that didn't have them",
+		     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+		     clear_flags);
+		goto out;
+	}
+
 	memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
 	split_rec.e_cpos = cpu_to_le32(cpos);
 	split_rec.e_leaf_clusters = cpu_to_le16(len);
 	split_rec.e_blkno = cpu_to_le64(start_blkno);
-	split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
-	split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
-
-	ret = __ocfs2_mark_extent_written(handle, et, left_path,
-					  index, &split_rec, meta_ac,
-					  dealloc);
+	split_rec.e_flags = rec->e_flags;
+	if (new_flags)
+		split_rec.e_flags |= new_flags;
+	if (clear_flags)
+		split_rec.e_flags &= ~clear_flags;
+
+	ret = __ocfs2_split_extent(handle, et, left_path,
+				  index, &split_rec, meta_ac,
+				  dealloc);
 	if (ret)
 		mlog_errno(ret);
 
 out:
 	ocfs2_free_path(left_path);
 	return ret;
+
+}
+
+/*
+ * Mark the already-existing extent at cpos as written for len clusters.
+ * This removes the unwritten extent flag.
+ *
+ * If the existing extent is larger than the request, initiate a
+ * split. An attempt will be made at merging with adjacent extents.
+ *
+ * The caller is responsible for passing down meta_ac if we'll need it.
+ */
+int ocfs2_mark_extent_written(struct inode *inode,
+			      struct ocfs2_extent_tree *et,
+			      handle_t *handle, u32 cpos, u32 len, u32 phys,
+			      struct ocfs2_alloc_context *meta_ac,
+			      struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret;
+
+	mlog(0, "Inode %lu cpos %u, len %u, phys clusters %u\n",
+	     inode->i_ino, cpos, len, phys);
+
+	if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
+		ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
+			    "that are being written to, but the feature bit "
+			    "is not set in the super block.",
+			    (unsigned long long)OCFS2_I(inode)->ip_blkno);
+		ret = -EROFS;
+		goto out;
+	}
+
+	/*
+	 * XXX: This should be fixed up so that we just re-insert the
+	 * next extent records.
+	 */
+	ocfs2_et_extent_map_truncate(et, 0);
+
+	ret = ocfs2_change_extent_flag(handle, et, cpos,
+				       len, phys, meta_ac, dealloc,
+				       0, OCFS2_EXT_UNWRITTEN);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	return ret;
 }
 
 static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 11/41] ocfs2: Add refcount b-tree as a new extent tree.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (9 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 10/41] ocfs2: Abstract extent split process Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 12/41] ocfs2: move tree path functions to alloc.h Tao Ma
                   ` (30 subsequent siblings)
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

Add refcount b-tree as a new extent tree so that it can
use the b-tree to store and maniuplate ocfs2_refcount_rec.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/alloc.c |   54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/alloc.h |    3 +++
 2 files changed, 57 insertions(+), 0 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 14b9106..a629656 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -385,6 +385,52 @@ static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
 	.eo_fill_root_el	= ocfs2_dx_root_fill_root_el,
 };
 
+static void ocfs2_refcount_tree_fill_root_el(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_refcount_block *rb = et->et_object;
+
+	et->et_root_el = &rb->rf_list;
+}
+
+static void ocfs2_refcount_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
+						u64 blkno)
+{
+	struct ocfs2_refcount_block *rb = et->et_object;
+
+	rb->rf_last_eb_blk = cpu_to_le64(blkno);
+}
+
+static u64 ocfs2_refcount_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_refcount_block *rb = et->et_object;
+
+	return le64_to_cpu(rb->rf_last_eb_blk);
+}
+
+static void ocfs2_refcount_tree_update_clusters(struct ocfs2_extent_tree *et,
+						u32 clusters)
+{
+	struct ocfs2_refcount_block *rb = et->et_object;
+
+	le32_add_cpu(&rb->rf_clusters, clusters);
+}
+
+static enum ocfs2_contig_type
+ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et,
+				  struct ocfs2_extent_rec *ext,
+				  struct ocfs2_extent_rec *insert_rec)
+{
+	return CONTIG_NONE;
+}
+
+static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = {
+	.eo_set_last_eb_blk	= ocfs2_refcount_tree_set_last_eb_blk,
+	.eo_get_last_eb_blk	= ocfs2_refcount_tree_get_last_eb_blk,
+	.eo_update_clusters	= ocfs2_refcount_tree_update_clusters,
+	.eo_fill_root_el	= ocfs2_refcount_tree_fill_root_el,
+	.eo_extent_contig	= ocfs2_refcount_tree_extent_contig,
+};
+
 static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
 				     struct ocfs2_caching_info *ci,
 				     struct buffer_head *bh,
@@ -439,6 +485,14 @@ void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
 				 NULL, &ocfs2_dx_root_et_ops);
 }
 
+void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et,
+				     struct ocfs2_caching_info *ci,
+				     struct buffer_head *bh)
+{
+	__ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_rb,
+				 NULL, &ocfs2_refcount_tree_et_ops);
+}
+
 static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
 					    u64 new_last_eb_blk)
 {
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index bcf6aa4..df0e778 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -80,6 +80,9 @@ void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
 void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
 				    struct ocfs2_caching_info *ci,
 				    struct buffer_head *bh);
+void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et,
+				     struct ocfs2_caching_info *ci,
+				     struct buffer_head *bh);
 
 /*
  * Read an extent block into *bh.  If *bh is NULL, a bh will be
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 12/41] ocfs2: move tree path functions to alloc.h.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (10 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 11/41] ocfs2: Add refcount b-tree as a new extent tree Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 13/41] ocfs2: Add support for incrementing refcount in the tree Tao Ma
                   ` (29 subsequent siblings)
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

Now fs/ocfs2/alloc.c has more than 7000 lines. It contains our
basic b-tree operation. Although we have already make our b-tree
operation generic, the basic structrue ocfs2_path which is used
to iterate one b-tree branch is still static and limited to only
used in alloc.c. As refcount tree need them and I don't want to
add any more b-tree unrelated code to alloc.c, export them out.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/alloc.c |   76 ++++++++++++++++-------------------------------------
 fs/ocfs2/alloc.h |   49 ++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+), 53 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index a629656..2c8ce32 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -567,36 +567,6 @@ static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et)
 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
 					 struct ocfs2_extent_block *eb);
-
-/*
- * Structures which describe a path through a btree, and functions to
- * manipulate them.
- *
- * The idea here is to be as generic as possible with the tree
- * manipulation code.
- */
-struct ocfs2_path_item {
-	struct buffer_head		*bh;
-	struct ocfs2_extent_list	*el;
-};
-
-#define OCFS2_MAX_PATH_DEPTH	5
-
-struct ocfs2_path {
-	int				p_tree_depth;
-	ocfs2_journal_access_func	p_root_access;
-	struct ocfs2_path_item		p_node[OCFS2_MAX_PATH_DEPTH];
-};
-
-#define path_root_bh(_path) ((_path)->p_node[0].bh)
-#define path_root_el(_path) ((_path)->p_node[0].el)
-#define path_root_access(_path)((_path)->p_root_access)
-#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
-#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
-#define path_num_items(_path) ((_path)->p_tree_depth + 1)
-
-static int ocfs2_find_path(struct ocfs2_caching_info *ci,
-			   struct ocfs2_path *path, u32 cpos);
 static void ocfs2_adjust_rightmost_records(handle_t *handle,
 					   struct ocfs2_extent_tree *et,
 					   struct ocfs2_path *path,
@@ -606,7 +576,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle,
  * to build another path. Generally, this involves freeing the buffer
  * heads.
  */
-static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
+void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
 {
 	int i, start = 0, depth = 0;
 	struct ocfs2_path_item *node;
@@ -635,7 +605,7 @@ static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
 	path->p_tree_depth = depth;
 }
 
-static void ocfs2_free_path(struct ocfs2_path *path)
+void ocfs2_free_path(struct ocfs2_path *path)
 {
 	if (path) {
 		ocfs2_reinit_path(path, 0);
@@ -733,13 +703,13 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
 	return path;
 }
 
-static struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
+struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
 {
 	return ocfs2_new_path(path_root_bh(path), path_root_el(path),
 			      path_root_access(path));
 }
 
-static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
+struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
 {
 	return ocfs2_new_path(et->et_root_bh, et->et_root_el,
 			      et->et_root_journal_access);
@@ -752,10 +722,10 @@ static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
  * I don't like the way this function's name looks next to
  * ocfs2_journal_access_path(), but I don't have a better one.
  */
-static int ocfs2_path_bh_journal_access(handle_t *handle,
-					struct ocfs2_caching_info *ci,
-					struct ocfs2_path *path,
-					int idx)
+int ocfs2_path_bh_journal_access(handle_t *handle,
+				 struct ocfs2_caching_info *ci,
+				 struct ocfs2_path *path,
+				 int idx)
 {
 	ocfs2_journal_access_func access = path_root_access(path);
 
@@ -772,9 +742,9 @@ static int ocfs2_path_bh_journal_access(handle_t *handle,
 /*
  * Convenience function to journal all components in a path.
  */
-static int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
-				     handle_t *handle,
-				     struct ocfs2_path *path)
+int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
+			      handle_t *handle,
+			      struct ocfs2_path *path)
 {
 	int i, ret = 0;
 
@@ -1942,8 +1912,8 @@ static void find_path_ins(void *data, struct buffer_head *bh)
 	ocfs2_path_insert_eb(fp->path, fp->index, bh);
 	fp->index++;
 }
-static int ocfs2_find_path(struct ocfs2_caching_info *ci,
-			   struct ocfs2_path *path, u32 cpos)
+int ocfs2_find_path(struct ocfs2_caching_info *ci,
+		    struct ocfs2_path *path, u32 cpos)
 {
 	struct find_path_data data;
 
@@ -5104,13 +5074,13 @@ out:
  * have been brought into cache (and pinned via the journal), so the
  * extra overhead is not expressed in terms of disk reads.
  */
-static int __ocfs2_split_extent(handle_t *handle,
-				struct ocfs2_extent_tree *et,
-				struct ocfs2_path *path,
-				int split_index,
-				struct ocfs2_extent_rec *split_rec,
-				struct ocfs2_alloc_context *meta_ac,
-				struct ocfs2_cached_dealloc_ctxt *dealloc)
+int ocfs2_split_extent(handle_t *handle,
+		       struct ocfs2_extent_tree *et,
+		       struct ocfs2_path *path,
+		       int split_index,
+		       struct ocfs2_extent_rec *split_rec,
+		       struct ocfs2_alloc_context *meta_ac,
+		       struct ocfs2_cached_dealloc_ctxt *dealloc)
 {
 	int ret = 0;
 	struct ocfs2_extent_list *el = path_leaf_el(path);
@@ -5267,9 +5237,9 @@ static int ocfs2_change_extent_flag(handle_t *handle,
 	if (clear_flags)
 		split_rec.e_flags &= ~clear_flags;
 
-	ret = __ocfs2_split_extent(handle, et, left_path,
-				  index, &split_rec, meta_ac,
-				  dealloc);
+	ret = ocfs2_split_extent(handle, et, left_path,
+				 index, &split_rec, meta_ac,
+				 dealloc);
 	if (ret)
 		mlog_errno(ret);
 
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index df0e778..3f43489 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -115,6 +115,14 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
 				struct ocfs2_alloc_context *meta_ac,
 				enum ocfs2_alloc_restarted *reason_ret);
 struct ocfs2_cached_dealloc_ctxt;
+struct ocfs2_path;
+int ocfs2_split_extent(handle_t *handle,
+		       struct ocfs2_extent_tree *et,
+		       struct ocfs2_path *path,
+		       int split_index,
+		       struct ocfs2_extent_rec *split_rec,
+		       struct ocfs2_alloc_context *meta_ac,
+		       struct ocfs2_cached_dealloc_ctxt *dealloc);
 int ocfs2_mark_extent_written(struct inode *inode,
 			      struct ocfs2_extent_tree *et,
 			      handle_t *handle, u32 cpos, u32 len, u32 phys,
@@ -254,4 +262,45 @@ static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
 	return !rec->e_leaf_clusters;
 }
 
+/*
+ * Structures which describe a path through a btree, and functions to
+ * manipulate them.
+ *
+ * The idea here is to be as generic as possible with the tree
+ * manipulation code.
+ */
+struct ocfs2_path_item {
+	struct buffer_head		*bh;
+	struct ocfs2_extent_list	*el;
+};
+
+#define OCFS2_MAX_PATH_DEPTH	5
+
+struct ocfs2_path {
+	int				p_tree_depth;
+	ocfs2_journal_access_func	p_root_access;
+	struct ocfs2_path_item		p_node[OCFS2_MAX_PATH_DEPTH];
+};
+
+#define path_root_bh(_path) ((_path)->p_node[0].bh)
+#define path_root_el(_path) ((_path)->p_node[0].el)
+#define path_root_access(_path)((_path)->p_root_access)
+#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
+#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
+#define path_num_items(_path) ((_path)->p_tree_depth + 1)
+
+void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root);
+void ocfs2_free_path(struct ocfs2_path *path);
+int ocfs2_find_path(struct ocfs2_caching_info *ci,
+		    struct ocfs2_path *path,
+		    u32 cpos);
+struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path);
+struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et);
+int ocfs2_path_bh_journal_access(handle_t *handle,
+				 struct ocfs2_caching_info *ci,
+				 struct ocfs2_path *path,
+				 int idx);
+int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
+			      handle_t *handle,
+			      struct ocfs2_path *path);
 #endif /* OCFS2_ALLOC_H */
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 13/41] ocfs2: Add support for incrementing refcount in the tree.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (11 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 12/41] ocfs2: move tree path functions to alloc.h Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 14/41] ocfs2: Add support of decrementing refcount for delete Tao Ma
                   ` (28 subsequent siblings)
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

    Given a physical cpos and length, increment the refcount
in the tree. If the extent has not been seen before, a refcount
record is created for it. Refcount records may be merged or
split by this operation.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/extent_map.c   |   15 +-
 fs/ocfs2/extent_map.h   |    5 +
 fs/ocfs2/ocfs2_fs.h     |    7 +
 fs/ocfs2/refcounttree.c | 1053 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 1073 insertions(+), 7 deletions(-)

diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index dc9482c..40b5105 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -353,11 +353,11 @@ static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el,
  * eb_bh is NULL. Otherwise, eb_bh should point to the extent block
  * containing el.
  */
-static int ocfs2_figure_hole_clusters(struct inode *inode,
-				      struct ocfs2_extent_list *el,
-				      struct buffer_head *eb_bh,
-				      u32 v_cluster,
-				      u32 *num_clusters)
+int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci,
+			       struct ocfs2_extent_list *el,
+			       struct buffer_head *eb_bh,
+			       u32 v_cluster,
+			       u32 *num_clusters)
 {
 	int ret, i;
 	struct buffer_head *next_eb_bh = NULL;
@@ -375,7 +375,7 @@ static int ocfs2_figure_hole_clusters(struct inode *inode,
 		if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
 			goto no_more_extents;
 
-		ret = ocfs2_read_extent_block(INODE_CACHE(inode),
+		ret = ocfs2_read_extent_block(ci,
 					      le64_to_cpu(eb->h_next_leaf_blk),
 					      &next_eb_bh);
 		if (ret) {
@@ -456,7 +456,8 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
 		 * field.
 		 */
 		if (hole_len) {
-			ret = ocfs2_figure_hole_clusters(inode, el, eb_bh,
+			ret = ocfs2_figure_hole_clusters(INODE_CACHE(inode),
+							 el, eb_bh,
 							 v_cluster, &len);
 			if (ret) {
 				mlog_errno(ret);
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index b7dd973..9942f47 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -61,6 +61,11 @@ int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
 			   struct buffer_head *bhs[], int flags,
 			   int (*validate)(struct super_block *sb,
 					   struct buffer_head *bh));
+int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci,
+			       struct ocfs2_extent_list *el,
+			       struct buffer_head *eb_bh,
+			       u32 v_cluster,
+			       u32 *num_clusters);
 static inline int ocfs2_read_virt_block(struct inode *inode, u64 v_block,
 					struct buffer_head **bh,
 					int (*validate)(struct super_block *sb,
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index e4288b4..40072cd 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -916,6 +916,7 @@ struct ocfs2_refcount_rec {
 	__le32 r_refcount;	/* Reference count of this extent */
 /*10*/
 };
+#define OCFS2_32BIT_POS_MASK		(0xffffffffULL)
 
 #define OCFS2_REFCOUNT_LEAF_FL          (0x00000001)
 #define OCFS2_REFCOUNT_TREE_FL          (0x00000002)
@@ -1394,6 +1395,12 @@ static inline u16 ocfs2_refcount_recs_per_rb(struct super_block *sb)
 
 	return size / sizeof(struct ocfs2_refcount_rec);
 }
+
+static inline u32
+ocfs2_get_ref_rec_low_cpos(const struct ocfs2_refcount_rec *rec)
+{
+	return le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK;
+}
 #else
 static inline int ocfs2_fast_symlink_chars(int blocksize)
 {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 650347b..4c3f5e9 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -15,6 +15,7 @@
  * General Public License for more details.
  */
 
+#include <linux/sort.h>
 #define MLOG_MASK_PREFIX ML_REFCOUNT
 #include <cluster/masklog.h>
 #include "ocfs2.h"
@@ -29,6 +30,7 @@
 #include "refcounttree.h"
 #include "sysfile.h"
 #include "dlmglue.h"
+#include "extent_map.h"
 
 static inline struct ocfs2_refcount_tree *
 cache_info_to_refcount(struct ocfs2_caching_info *ci)
@@ -846,3 +848,1054 @@ out:
 
 	return ret;
 }
+
+static void ocfs2_find_refcount_rec_in_rl(struct ocfs2_caching_info *ci,
+					  struct buffer_head *ref_leaf_bh,
+					  u64 cpos, unsigned int len,
+					  struct ocfs2_refcount_rec *ret_rec,
+					  int *index)
+{
+	int i = 0;
+	struct ocfs2_refcount_block *rb =
+		(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+	struct ocfs2_refcount_rec *rec = NULL;
+
+	for (; i < le16_to_cpu(rb->rf_records.rl_used); i++) {
+		rec = &rb->rf_records.rl_recs[i];
+
+		if (le64_to_cpu(rec->r_cpos) +
+		    le32_to_cpu(rec->r_clusters) <= cpos)
+			continue;
+		else if (le64_to_cpu(rec->r_cpos) > cpos)
+			break;
+
+		/* ok, cpos fail in this rec. Just return. */
+		if (ret_rec)
+			*ret_rec = *rec;
+		goto out;
+	}
+
+	if (ret_rec) {
+		/* We meet with a hole here, so fake the rec. */
+		ret_rec->r_cpos = cpu_to_le64(cpos);
+		ret_rec->r_refcount = 0;
+		if (i < le16_to_cpu(rb->rf_records.rl_used) &&
+		    le64_to_cpu(rec->r_cpos) < cpos + len)
+			ret_rec->r_clusters =
+				cpu_to_le32(le64_to_cpu(rec->r_cpos) - cpos);
+		else
+			ret_rec->r_clusters = cpu_to_le32(len);
+	}
+
+out:
+	*index = i;
+}
+
+/*
+ * Given a cpos and len, try to find the refcount record which contains cpos.
+ * 1. If cpos can be found in one refcount record, return the record.
+ * 2. If cpos can't be found, return a fake record which start from cpos
+ *    and end at a small value between cpos+len and start of the next record.
+ *    This fake record has r_refcount = 0.
+ */
+static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
+				  struct buffer_head *ref_root_bh,
+				  u64 cpos, unsigned int len,
+				  struct ocfs2_refcount_rec *ret_rec,
+				  int *index,
+				  struct buffer_head **ret_bh)
+{
+	int ret = 0, i, found;
+	u32 low_cpos;
+	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_rec *tmp, *rec = NULL;
+	struct ocfs2_extent_block *eb;
+	struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+	struct ocfs2_refcount_block *rb =
+			(struct ocfs2_refcount_block *)ref_root_bh->b_data;
+
+	if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) {
+		ocfs2_find_refcount_rec_in_rl(ci, ref_root_bh, cpos, len,
+					      ret_rec, index);
+		*ret_bh = ref_root_bh;
+		get_bh(ref_root_bh);
+		return 0;
+	}
+
+	el = &rb->rf_list;
+	low_cpos = cpos & OCFS2_32BIT_POS_MASK;
+
+	if (el->l_tree_depth) {
+		ret = ocfs2_find_leaf(ci, el, low_cpos, &eb_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+
+		if (el->l_tree_depth) {
+			ocfs2_error(sb,
+			"refcount tree %llu has non zero tree "
+			"depth in leaf btree tree block %llu\n",
+			(unsigned long long)ocfs2_metadata_cache_owner(ci),
+			(unsigned long long)eb_bh->b_blocknr);
+			ret = -EROFS;
+			goto out;
+		}
+	}
+
+	found = 0;
+	for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
+		rec = &el->l_recs[i];
+
+		if (le32_to_cpu(rec->e_cpos) <= low_cpos) {
+			found = 1;
+			break;
+		}
+	}
+
+	/* adjust len when we have ocfs2_extent_rec after it. */
+	if (found && i < le16_to_cpu(el->l_next_free_rec) - 1) {
+		tmp = &el->l_recs[i+1];
+
+		if (le32_to_cpu(tmp->e_cpos) < cpos + len)
+			len = le32_to_cpu(tmp->e_cpos) - cpos;
+	}
+
+	ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno),
+					&ref_leaf_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_find_refcount_rec_in_rl(ci, ref_leaf_bh, cpos, len,
+				      ret_rec, index);
+	*ret_bh = ref_leaf_bh;
+out:
+	brelse(eb_bh);
+	return ret;
+}
+
+enum ocfs2_ref_rec_contig {
+	REF_CONTIG_NONE = 0,
+	REF_CONTIG_LEFT,
+	REF_CONTIG_RIGHT,
+	REF_CONTIG_LEFTRIGHT,
+};
+
+static enum ocfs2_ref_rec_contig
+	ocfs2_refcount_rec_adjacent(struct ocfs2_refcount_block *rb,
+				    int index)
+{
+	if ((rb->rf_records.rl_recs[index].r_refcount ==
+	    rb->rf_records.rl_recs[index + 1].r_refcount) &&
+	    (le64_to_cpu(rb->rf_records.rl_recs[index].r_cpos) +
+	    le32_to_cpu(rb->rf_records.rl_recs[index].r_clusters) ==
+	    le64_to_cpu(rb->rf_records.rl_recs[index + 1].r_cpos)))
+		return REF_CONTIG_RIGHT;
+
+	return REF_CONTIG_NONE;
+}
+
+static enum ocfs2_ref_rec_contig
+	ocfs2_refcount_rec_contig(struct ocfs2_refcount_block *rb,
+				  int index)
+{
+	enum ocfs2_ref_rec_contig ret = REF_CONTIG_NONE;
+
+	if (index < le16_to_cpu(rb->rf_records.rl_used) - 1)
+		ret = ocfs2_refcount_rec_adjacent(rb, index);
+
+	if (index > 0) {
+		enum ocfs2_ref_rec_contig tmp;
+
+		tmp = ocfs2_refcount_rec_adjacent(rb, index - 1);
+
+		if (tmp == REF_CONTIG_RIGHT) {
+			if (ret == REF_CONTIG_RIGHT)
+				ret = REF_CONTIG_LEFTRIGHT;
+			else
+				ret = REF_CONTIG_LEFT;
+		}
+	}
+
+	return ret;
+}
+
+static void ocfs2_rotate_refcount_rec_left(struct ocfs2_refcount_block *rb,
+					   int index)
+{
+	BUG_ON(rb->rf_records.rl_recs[index].r_refcount !=
+	       rb->rf_records.rl_recs[index+1].r_refcount);
+
+	le32_add_cpu(&rb->rf_records.rl_recs[index].r_clusters,
+		     le32_to_cpu(rb->rf_records.rl_recs[index+1].r_clusters));
+
+	if (index < le16_to_cpu(rb->rf_records.rl_used) - 2)
+		memmove(&rb->rf_records.rl_recs[index + 1],
+			&rb->rf_records.rl_recs[index + 2],
+			sizeof(struct ocfs2_refcount_rec) *
+			(le16_to_cpu(rb->rf_records.rl_used) - index - 2));
+
+	memset(&rb->rf_records.rl_recs[le16_to_cpu(rb->rf_records.rl_used) - 1],
+	       0, sizeof(struct ocfs2_refcount_rec));
+	le16_add_cpu(&rb->rf_records.rl_used, -1);
+}
+
+/*
+ * Merge the refcount rec if we are contiguous with the adjacent recs.
+ */
+static void ocfs2_refcount_rec_merge(struct ocfs2_refcount_block *rb,
+				     int index)
+{
+	enum ocfs2_ref_rec_contig contig =
+				ocfs2_refcount_rec_contig(rb, index);
+
+	if (contig == REF_CONTIG_NONE)
+		return;
+
+	if (contig == REF_CONTIG_LEFT || contig == REF_CONTIG_LEFTRIGHT) {
+		BUG_ON(index == 0);
+		index--;
+	}
+
+	ocfs2_rotate_refcount_rec_left(rb, index);
+
+	if (contig == REF_CONTIG_LEFTRIGHT)
+		ocfs2_rotate_refcount_rec_left(rb, index);
+}
+
+static int ocfs2_change_refcount_rec(handle_t *handle,
+				     struct ocfs2_caching_info *ci,
+				     struct buffer_head *ref_leaf_bh,
+				     int index, int change)
+{
+	int ret;
+	struct ocfs2_refcount_block *rb =
+			(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+	struct ocfs2_refcount_rec *rec = &rb->rf_records.rl_recs[index];
+
+	ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	mlog(0, "change index %d, old count %u, change %d\n", index,
+	     le32_to_cpu(rec->r_refcount), change);
+	le32_add_cpu(&rec->r_refcount, change);
+
+	ocfs2_refcount_rec_merge(rb, index);
+
+	ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
+	if (ret)
+		mlog_errno(ret);
+out:
+	return ret;
+}
+
+static int ocfs2_expand_inline_ref_root(handle_t *handle,
+					struct ocfs2_caching_info *ci,
+					struct buffer_head *ref_root_bh,
+					struct buffer_head **ref_leaf_bh,
+					struct ocfs2_alloc_context *meta_ac)
+{
+	int ret;
+	u16 suballoc_bit_start;
+	u32 num_got;
+	u64 blkno;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+	struct buffer_head *new_bh = NULL;
+	struct ocfs2_refcount_block *new_rb;
+	struct ocfs2_refcount_block *root_rb =
+			(struct ocfs2_refcount_block *)ref_root_bh->b_data;
+
+	ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1,
+				   &suballoc_bit_start, &num_got,
+				   &blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	new_bh = sb_getblk(sb, blkno);
+	if (new_bh == NULL) {
+		ret = -EIO;
+		mlog_errno(ret);
+		goto out;
+	}
+	ocfs2_set_new_buffer_uptodate(ci, new_bh);
+
+	ret = ocfs2_journal_access_rb(handle, ci, new_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Initialize ocfs2_refcount_block.
+	 * It should contain the same information as the old root.
+	 * so just memcpy it and change the corresponding field.
+	 */
+	memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize);
+
+	new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
+	new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num);
+	new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+	new_rb->rf_blkno = cpu_to_le64(blkno);
+	new_rb->rf_cpos = cpu_to_le32(0);
+	new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
+	new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
+	ocfs2_journal_dirty(handle, new_bh);
+
+	/* Now change the root. */
+	memset(&root_rb->rf_list, 0, sb->s_blocksize -
+	       offsetof(struct ocfs2_refcount_block, rf_list));
+	root_rb->rf_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_rb(sb));
+	root_rb->rf_clusters = cpu_to_le32(1);
+	root_rb->rf_list.l_next_free_rec = cpu_to_le16(1);
+	root_rb->rf_list.l_recs[0].e_blkno = cpu_to_le64(blkno);
+	root_rb->rf_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1);
+	root_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_TREE_FL);
+
+	ocfs2_journal_dirty(handle, ref_root_bh);
+
+	mlog(0, "new leaf block %llu, used %u\n", (unsigned long long)blkno,
+	     le16_to_cpu(new_rb->rf_records.rl_used));
+
+	*ref_leaf_bh = new_bh;
+	new_bh = NULL;
+out:
+	brelse(new_bh);
+	return ret;
+}
+
+static int ocfs2_refcount_rec_no_intersect(struct ocfs2_refcount_rec *prev,
+					   struct ocfs2_refcount_rec *next)
+{
+	if (ocfs2_get_ref_rec_low_cpos(prev) + le32_to_cpu(prev->r_clusters) <=
+		ocfs2_get_ref_rec_low_cpos(next))
+		return 1;
+
+	return 0;
+}
+
+static int cmp_refcount_rec_by_low_cpos(const void *a, const void *b)
+{
+	const struct ocfs2_refcount_rec *l = a, *r = b;
+	u32 l_cpos = ocfs2_get_ref_rec_low_cpos(l);
+	u32 r_cpos = ocfs2_get_ref_rec_low_cpos(r);
+
+	if (l_cpos > r_cpos)
+		return 1;
+	if (l_cpos < r_cpos)
+		return -1;
+	return 0;
+}
+
+static int cmp_refcount_rec_by_cpos(const void *a, const void *b)
+{
+	const struct ocfs2_refcount_rec *l = a, *r = b;
+	u64 l_cpos = le64_to_cpu(l->r_cpos);
+	u64 r_cpos = le64_to_cpu(r->r_cpos);
+
+	if (l_cpos > r_cpos)
+		return 1;
+	if (l_cpos < r_cpos)
+		return -1;
+	return 0;
+}
+
+static void swap_refcount_rec(void *a, void *b, int size)
+{
+	struct ocfs2_refcount_rec *l = a, *r = b, tmp;
+
+	tmp = *(struct ocfs2_refcount_rec *)l;
+	*(struct ocfs2_refcount_rec *)l =
+			*(struct ocfs2_refcount_rec *)r;
+	*(struct ocfs2_refcount_rec *)r = tmp;
+}
+
+/*
+ * The refcount cpos are ordered by their 64bit cpos,
+ * But we will use the low 32 bit to be the e_cpos in the b-tree.
+ * So we need to make sure that this pos isn't intersected with others.
+ *
+ * Note: The refcount block is already sorted by their low 32 bit cpos,
+ *       So just try the middle pos first, and we will exit when we find
+ *       the good position.
+ */
+static int ocfs2_find_refcount_split_pos(struct ocfs2_refcount_list *rl,
+					 u32 *split_pos, int *split_index)
+{
+	int num_used = le16_to_cpu(rl->rl_used);
+	int delta, middle = num_used / 2;
+
+	for (delta = 0; delta < middle; delta++) {
+		/* Let's check delta earlier than middle */
+		if (ocfs2_refcount_rec_no_intersect(
+					&rl->rl_recs[middle - delta - 1],
+					&rl->rl_recs[middle - delta])) {
+			*split_index = middle - delta;
+			break;
+		}
+
+		/* For even counts, don't walk off the end */
+		if ((middle + delta + 1) == num_used)
+			continue;
+
+		/* Now try delta past middle */
+		if (ocfs2_refcount_rec_no_intersect(
+					&rl->rl_recs[middle + delta],
+					&rl->rl_recs[middle + delta + 1])) {
+			*split_index = middle + delta + 1;
+			break;
+		}
+	}
+
+	if (delta >= middle)
+		return -ENOSPC;
+
+	*split_pos = ocfs2_get_ref_rec_low_cpos(&rl->rl_recs[*split_index]);
+	return 0;
+}
+
+static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh,
+					    struct buffer_head *new_bh,
+					    u32 *split_cpos)
+{
+	int split_index = 0, num_moved, ret;
+	u32 cpos = 0;
+	struct ocfs2_refcount_block *rb =
+			(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+	struct ocfs2_refcount_list *rl = &rb->rf_records;
+	struct ocfs2_refcount_block *new_rb =
+			(struct ocfs2_refcount_block *)new_bh->b_data;
+	struct ocfs2_refcount_list *new_rl = &new_rb->rf_records;
+
+	mlog(0, "split old leaf refcount block %llu, count = %u, used = %u\n",
+	     (unsigned long long)ref_leaf_bh->b_blocknr,
+	     le32_to_cpu(rl->rl_count), le32_to_cpu(rl->rl_used));
+
+	/*
+	 * XXX: Improvement later.
+	 * If we know all the high 32 bit cpos is the same, no need to sort.
+	 *
+	 * In order to make the whole process safe, we do:
+	 * 1. sort the entries by their low 32 bit cpos first so that we can
+	 *    find the split cpos easily.
+	 * 2. call ocfs2_insert_extent to insert the new refcount block.
+	 * 3. move the refcount rec to the new block.
+	 * 4. sort the entries by their 64 bit cpos.
+	 * 5. dirty the new_rb and rb.
+	 */
+	sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
+	     sizeof(struct ocfs2_refcount_rec),
+	     cmp_refcount_rec_by_low_cpos, swap_refcount_rec);
+
+	ret = ocfs2_find_refcount_split_pos(rl, &cpos, &split_index);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	new_rb->rf_cpos = cpu_to_le32(cpos);
+
+	/* move refcount records starting from split_index to the new block. */
+	num_moved = le16_to_cpu(rl->rl_used) - split_index;
+	memcpy(new_rl->rl_recs, &rl->rl_recs[split_index],
+	       num_moved * sizeof(struct ocfs2_refcount_rec));
+
+	/*ok, remove the entries we just moved over to the other block. */
+	memset(&rl->rl_recs[split_index], 0,
+	       num_moved * sizeof(struct ocfs2_refcount_rec));
+
+	/* change old and new rl_used accordingly. */
+	le16_add_cpu(&rl->rl_used, -num_moved);
+	new_rl->rl_used = cpu_to_le32(num_moved);
+
+	sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
+	     sizeof(struct ocfs2_refcount_rec),
+	     cmp_refcount_rec_by_cpos, swap_refcount_rec);
+
+	sort(&new_rl->rl_recs, le16_to_cpu(new_rl->rl_used),
+	     sizeof(struct ocfs2_refcount_rec),
+	     cmp_refcount_rec_by_cpos, swap_refcount_rec);
+
+	*split_cpos = cpos;
+	return 0;
+}
+
+static int ocfs2_new_leaf_refcount_block(handle_t *handle,
+					 struct ocfs2_caching_info *ci,
+					 struct buffer_head *ref_root_bh,
+					 struct buffer_head *ref_leaf_bh,
+					 struct ocfs2_alloc_context *meta_ac)
+{
+	int ret;
+	u16 suballoc_bit_start;
+	u32 num_got, new_cpos;
+	u64 blkno;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+	struct ocfs2_refcount_block *root_rb =
+			(struct ocfs2_refcount_block *)ref_root_bh->b_data;
+	struct buffer_head *new_bh = NULL;
+	struct ocfs2_refcount_block *new_rb;
+	struct ocfs2_extent_tree ref_et;
+
+	BUG_ON(!(le32_to_cpu(root_rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL));
+
+	ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1,
+				   &suballoc_bit_start, &num_got,
+				   &blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	new_bh = sb_getblk(sb, blkno);
+	if (new_bh == NULL) {
+		ret = -EIO;
+		mlog_errno(ret);
+		goto out;
+	}
+	ocfs2_set_new_buffer_uptodate(ci, new_bh);
+
+	ret = ocfs2_journal_access_rb(handle, ci, new_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* Initialize ocfs2_refcount_block. */
+	new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
+	memset(new_rb, 0, sb->s_blocksize);
+	strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
+	new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num);
+	new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+	new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
+	new_rb->rf_blkno = cpu_to_le64(blkno);
+	new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
+	new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
+	new_rb->rf_records.rl_count =
+				cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
+	new_rb->rf_generation = root_rb->rf_generation;
+
+	ret = ocfs2_divide_leaf_refcount_block(ref_leaf_bh, new_bh, &new_cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_journal_dirty(handle, ref_leaf_bh);
+	ocfs2_journal_dirty(handle, new_bh);
+
+	ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh);
+
+	mlog(0, "insert new leaf block %llu at %u\n",
+	     (unsigned long long)new_bh->b_blocknr, new_cpos);
+
+	/* Insert the new leaf block with the specific offset cpos. */
+	ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr,
+				  1, 0, meta_ac);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	brelse(new_bh);
+	return ret;
+}
+
+static int ocfs2_expand_refcount_tree(handle_t *handle,
+				      struct ocfs2_caching_info *ci,
+				      struct buffer_head *ref_root_bh,
+				      struct buffer_head *ref_leaf_bh,
+				      struct ocfs2_alloc_context *meta_ac)
+{
+	int ret;
+	struct buffer_head *expand_bh = NULL;
+
+	if (ref_root_bh == ref_leaf_bh) {
+		/*
+		 * the old root bh hasn't been expanded to a b-tree,
+		 * so expand it first.
+		 */
+		ret = ocfs2_expand_inline_ref_root(handle, ci, ref_root_bh,
+						   &expand_bh, meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	} else {
+		expand_bh = ref_leaf_bh;
+		get_bh(expand_bh);
+	}
+
+
+	/* Now add a new refcount block into the tree.*/
+	ret = ocfs2_new_leaf_refcount_block(handle, ci, ref_root_bh,
+					    expand_bh, meta_ac);
+	if (ret)
+		mlog_errno(ret);
+out:
+	brelse(expand_bh);
+	return ret;
+}
+
+/*
+ * Adjust the extent rec in b-tree representing ref_leaf_bh.
+ *
+ * Only called when we have inserted a new refcount rec at index 0
+ * which means ocfs2_extent_rec.e_cpos may need some change.
+ */
+static int ocfs2_adjust_refcount_rec(handle_t *handle,
+				     struct ocfs2_caching_info *ci,
+				     struct buffer_head *ref_root_bh,
+				     struct buffer_head *ref_leaf_bh,
+				     struct ocfs2_refcount_rec *rec)
+{
+	int ret = 0, i;
+	u32 new_cpos, old_cpos;
+	struct ocfs2_path *path = NULL;
+	struct ocfs2_extent_tree et;
+	struct ocfs2_refcount_block *rb =
+		(struct ocfs2_refcount_block *)ref_root_bh->b_data;
+	struct ocfs2_extent_list *el;
+
+	if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL))
+		goto out;
+
+	rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+	old_cpos = le32_to_cpu(rb->rf_cpos);
+	new_cpos = le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK;
+	if (old_cpos <= new_cpos)
+		goto out;
+
+	ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
+
+	path = ocfs2_new_path_from_et(&et);
+	if (!path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_find_path(ci, path, old_cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * 2 more credits, one for the leaf refcount block, one for
+	 * the extent block contains the extent rec.
+	 */
+	ret = ocfs2_extend_trans(handle, handle->h_buffer_credits + 2);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_eb(handle, ci, path_leaf_bh(path),
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* change the leaf extent block first. */
+	el = path_leaf_el(path);
+
+	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++)
+		if (le32_to_cpu(el->l_recs[i].e_cpos) == old_cpos)
+			break;
+
+	BUG_ON(i == le16_to_cpu(el->l_next_free_rec));
+
+	el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
+
+	/* change the r_cpos in the leaf block. */
+	rb->rf_cpos = cpu_to_le32(new_cpos);
+
+	ocfs2_journal_dirty(handle, path_leaf_bh(path));
+	ocfs2_journal_dirty(handle, ref_leaf_bh);
+
+out:
+	ocfs2_free_path(path);
+	return ret;
+}
+
+static int ocfs2_insert_refcount_rec(handle_t *handle,
+				     struct ocfs2_caching_info *ci,
+				     struct buffer_head *ref_root_bh,
+				     struct buffer_head *ref_leaf_bh,
+				     struct ocfs2_refcount_rec *rec,
+				     int index,
+				     struct ocfs2_alloc_context *meta_ac)
+{
+	int ret;
+	struct ocfs2_refcount_block *rb =
+			(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+	struct ocfs2_refcount_list *rf_list = &rb->rf_records;
+	struct buffer_head *new_bh = NULL;
+
+	BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
+
+	if (rf_list->rl_used == rf_list->rl_count) {
+		u64 cpos = le64_to_cpu(rec->r_cpos);
+		u32 len = le32_to_cpu(rec->r_clusters);
+
+		ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
+						 ref_leaf_bh, meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
+					     cpos, len, NULL, &index,
+					     &new_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ref_leaf_bh = new_bh;
+		rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+		rf_list = &rb->rf_records;
+	}
+
+	ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (index < le16_to_cpu(rf_list->rl_used))
+		memmove(&rf_list->rl_recs[index + 1],
+			&rf_list->rl_recs[index],
+			(le16_to_cpu(rf_list->rl_used) - index) *
+			 sizeof(struct ocfs2_refcount_rec));
+
+	mlog(0, "insert refcount record start %llu, len %u, count %u "
+	     "to leaf block %llu at index %d\n",
+	     (unsigned long long)le64_to_cpu(rec->r_cpos),
+	     le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount),
+	     (unsigned long long)ref_leaf_bh->b_blocknr, index);
+
+	rf_list->rl_recs[index] = *rec;
+
+	le16_add_cpu(&rf_list->rl_used, 1);
+
+	ocfs2_refcount_rec_merge(rb, index);
+
+	ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (index == 0) {
+		ret = ocfs2_adjust_refcount_rec(handle, ci,
+						ref_root_bh,
+						ref_leaf_bh, rec);
+		if (ret)
+			mlog_errno(ret);
+	}
+out:
+	brelse(new_bh);
+	return ret;
+}
+
+/*
+ * Split the refcount_rec indexed by "index" in ref_leaf_bh.
+ * This is much simple than our b-tree code.
+ * split_rec is the new refcount rec we want to insert.
+ * If split_rec->r_refcount > 0, we are changing the refcount(in case we
+ * increase refcount or decrease a refcount to non-zero).
+ * If split_rec->r_refcount == 0, we are punching a hole in current refcount
+ * rec( in case we decrease a refcount to zero).
+ */
+static int ocfs2_split_refcount_rec(handle_t *handle,
+				    struct ocfs2_caching_info *ci,
+				    struct buffer_head *ref_root_bh,
+				    struct buffer_head *ref_leaf_bh,
+				    struct ocfs2_refcount_rec *split_rec,
+				    int index,
+				    struct ocfs2_alloc_context *meta_ac,
+				    struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret, recs_need;
+	u32 len;
+	struct ocfs2_refcount_block *rb =
+			(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+	struct ocfs2_refcount_list *rf_list = &rb->rf_records;
+	struct ocfs2_refcount_rec *orig_rec = &rf_list->rl_recs[index];
+	struct ocfs2_refcount_rec *tail_rec = NULL;
+	struct buffer_head *new_bh = NULL;
+
+	BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
+
+	mlog(0, "original r_pos %llu, cluster %u, split %llu, cluster %u\n",
+	     le64_to_cpu(orig_rec->r_cpos), le32_to_cpu(orig_rec->r_clusters),
+	     le64_to_cpu(split_rec->r_cpos),
+	     le32_to_cpu(split_rec->r_clusters));
+
+	/*
+	 * If we just need to split the header or tail clusters,
+	 * no more recs are needed, just split is OK.
+	 * Otherwise we at least need one new recs.
+	 */
+	if (!split_rec->r_refcount &&
+	    (split_rec->r_cpos == orig_rec->r_cpos ||
+	     le64_to_cpu(split_rec->r_cpos) +
+	     le32_to_cpu(split_rec->r_clusters) ==
+	     le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
+		recs_need = 0;
+	else
+		recs_need = 1;
+
+	/*
+	 * We need one more rec if we split in the middle and the new rec have
+	 * some refcount in it.
+	 */
+	if (split_rec->r_refcount &&
+	    (split_rec->r_cpos != orig_rec->r_cpos &&
+	     le64_to_cpu(split_rec->r_cpos) +
+	     le32_to_cpu(split_rec->r_clusters) !=
+	     le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
+		recs_need++;
+
+	/* If the leaf block don't have enough record, expand it. */
+	if (le16_to_cpu(rf_list->rl_used) + recs_need > rf_list->rl_count) {
+		struct ocfs2_refcount_rec tmp_rec;
+		u64 cpos = le64_to_cpu(orig_rec->r_cpos);
+		len = le32_to_cpu(orig_rec->r_clusters);
+		ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
+						 ref_leaf_bh, meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * We have to re-get it since now cpos may be moved to
+		 * another leaf block.
+		 */
+		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
+					     cpos, len, &tmp_rec, &index,
+					     &new_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ref_leaf_bh = new_bh;
+		rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+		rf_list = &rb->rf_records;
+		orig_rec = &rf_list->rl_recs[index];
+	}
+
+	ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * We have calculated out how many new records we need and store
+	 * in recs_need, so spare enough space first by moving the records
+	 * after "index" to the end.
+	 */
+	if (index != le16_to_cpu(rf_list->rl_used) - 1)
+		memmove(&rf_list->rl_recs[index + 1 + recs_need],
+			&rf_list->rl_recs[index + 1],
+			(le16_to_cpu(rf_list->rl_used) - index - 1) *
+			 sizeof(struct ocfs2_refcount_rec));
+
+	len = (le64_to_cpu(orig_rec->r_cpos) +
+	      le32_to_cpu(orig_rec->r_clusters)) -
+	      (le64_to_cpu(split_rec->r_cpos) +
+	      le32_to_cpu(split_rec->r_clusters));
+
+	/*
+	 * If we have "len", the we will split in the tail and move it
+	 * to the end of the space we have just spared.
+	 */
+	if (len) {
+		tail_rec = &rf_list->rl_recs[index + recs_need];
+
+		memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec));
+		le64_add_cpu(&tail_rec->r_cpos,
+			     le32_to_cpu(tail_rec->r_clusters) - len);
+		tail_rec->r_clusters = le32_to_cpu(len);
+	}
+
+	/*
+	 * If the split pos isn't the same as the original one, we need to
+	 * split in the head.
+	 *
+	 * Note: We have the chance that split_rec.r_refcount = 0,
+	 * recs_need = 0 and len > 0, which means we just cut the head from
+	 * the orig_rec and in that case we have done some modification in
+	 * orig_rec above, so the check for r_cpos is faked.
+	 */
+	if (split_rec->r_cpos != orig_rec->r_cpos && tail_rec != orig_rec) {
+		len = le64_to_cpu(split_rec->r_cpos) -
+		      le64_to_cpu(orig_rec->r_cpos);
+		orig_rec->r_clusters = cpu_to_le32(len);
+		index++;
+	}
+
+	le16_add_cpu(&rf_list->rl_used, recs_need);
+
+	if (split_rec->r_refcount) {
+		rf_list->rl_recs[index] = *split_rec;
+		mlog(0, "insert refcount record start %llu, len %u, count %u "
+		     "to leaf block %llu at index %d\n",
+		     (unsigned long long)le64_to_cpu(split_rec->r_cpos),
+		     le32_to_cpu(split_rec->r_clusters),
+		     le32_to_cpu(split_rec->r_refcount),
+		     (unsigned long long)ref_leaf_bh->b_blocknr, index);
+
+		ocfs2_refcount_rec_merge(rb, index);
+	}
+
+	ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	brelse(new_bh);
+	return ret;
+}
+
+static int __ocfs2_increase_refcount(handle_t *handle,
+				     struct ocfs2_caching_info *ci,
+				     struct buffer_head *ref_root_bh,
+				     u64 cpos, u32 len,
+				     struct ocfs2_alloc_context *meta_ac,
+				     struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret = 0, index;
+	struct buffer_head *ref_leaf_bh = NULL;
+	struct ocfs2_refcount_rec rec;
+	unsigned int set_len = 0;
+
+	mlog(0, "Tree owner %llu, add refcount start %llu, len %u\n",
+	     (unsigned long long)ocfs2_metadata_cache_owner(ci),
+	     (unsigned long long)cpos, len);
+
+	while (len) {
+		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
+					     cpos, len, &rec, &index,
+					     &ref_leaf_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		set_len = le32_to_cpu(rec.r_clusters);
+
+		/*
+		 * Here we may meet with 3 situations:
+		 *
+		 * 1. If we find an already existing record, and the length
+		 *    is the same, cool, we just need to increase the r_refcount
+		 *    and it is OK.
+		 * 2. If we find a hole, just insert it with r_refcount = 1.
+		 * 3. If we are in the middle of one extent record, split
+		 *    it.
+		 */
+		if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos &&
+		    set_len <= len) {
+			mlog(0, "increase refcount rec, start %llu, len %u, "
+			     "count %u\n", (unsigned long long)cpos, set_len,
+			     le32_to_cpu(rec.r_refcount));
+			ret = ocfs2_change_refcount_rec(handle, ci,
+							ref_leaf_bh, index, 1);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+		} else if (!rec.r_refcount) {
+			rec.r_refcount = cpu_to_le32(1);
+
+			mlog(0, "insert refcount rec, start %llu, len %u\n",
+			     (unsigned long long)le64_to_cpu(rec.r_cpos),
+			     set_len);
+			ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh,
+							ref_leaf_bh,
+							&rec, index, meta_ac);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+		} else  {
+			set_len = min((u64)(cpos + len),
+				      le64_to_cpu(rec.r_cpos) + set_len) - cpos;
+			rec.r_cpos = cpu_to_le64(cpos);
+			rec.r_clusters = cpu_to_le32(set_len);
+			le32_add_cpu(&rec.r_refcount, 1);
+
+			mlog(0, "split refcount rec, start %llu, "
+			     "len %u, count %u\n",
+			     (unsigned long long)le64_to_cpu(rec.r_cpos),
+			     set_len, le32_to_cpu(rec.r_refcount));
+			ret = ocfs2_split_refcount_rec(handle, ci,
+						       ref_root_bh, ref_leaf_bh,
+						       &rec, index,
+						       meta_ac, dealloc);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+		}
+
+		cpos += set_len;
+		len -= set_len;
+		brelse(ref_leaf_bh);
+		ref_leaf_bh = NULL;
+	}
+
+out:
+	brelse(ref_leaf_bh);
+	return ret;
+}
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 14/41] ocfs2: Add support of decrementing refcount for delete.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (12 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 13/41] ocfs2: Add support for incrementing refcount in the tree Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 15/41] ocfs2: Add functions for extents refcounted Tao Ma
                   ` (27 subsequent siblings)
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

    Given a physical cpos and length, decrement the refcount
in the tree. If the refcount for any portion of the extent goes
to zero, that portion is queued for freeing.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/alloc.c        |    6 +-
 fs/ocfs2/alloc.h        |    3 +
 fs/ocfs2/refcounttree.c |  256 ++++++++++++++++++++++++++++++++++++++++++++++-
 fs/ocfs2/refcounttree.h |    5 +
 4 files changed, 265 insertions(+), 5 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 2c8ce32..9dd68cd 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6522,9 +6522,9 @@ ocfs2_find_per_slot_free_list(int type,
 	return fl;
 }
 
-static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
-				     int type, int slot, u64 blkno,
-				     unsigned int bit)
+int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+			      int type, int slot, u64 blkno,
+			      unsigned int bit)
 {
 	int ret;
 	struct ocfs2_per_slot_free_list *fl;
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 3f43489..0610ba1 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -202,6 +202,9 @@ static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
 }
 int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
 				u64 blkno, unsigned int bit);
+int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+			      int type, int slot, u64 blkno,
+			      unsigned int bit);
 static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c)
 {
 	return c->c_global_allocator != NULL;
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 4c3f5e9..1c6d425 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -1069,6 +1069,10 @@ static void ocfs2_refcount_rec_merge(struct ocfs2_refcount_block *rb,
 		ocfs2_rotate_refcount_rec_left(rb, index);
 }
 
+/*
+ * Change the refcount indexed by "index" in ref_bh.
+ * If refcount reaches 0, remove it.
+ */
 static int ocfs2_change_refcount_rec(handle_t *handle,
 				     struct ocfs2_caching_info *ci,
 				     struct buffer_head *ref_leaf_bh,
@@ -1077,7 +1081,8 @@ static int ocfs2_change_refcount_rec(handle_t *handle,
 	int ret;
 	struct ocfs2_refcount_block *rb =
 			(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
-	struct ocfs2_refcount_rec *rec = &rb->rf_records.rl_recs[index];
+	struct ocfs2_refcount_list *rl = &rb->rf_records;
+	struct ocfs2_refcount_rec *rec = &rl->rl_recs[index];
 
 	ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1090,7 +1095,18 @@ static int ocfs2_change_refcount_rec(handle_t *handle,
 	     le32_to_cpu(rec->r_refcount), change);
 	le32_add_cpu(&rec->r_refcount, change);
 
-	ocfs2_refcount_rec_merge(rb, index);
+	if (!rec->r_refcount) {
+		if (index != le16_to_cpu(rl->rl_used) - 1) {
+			memmove(rec, rec + 1,
+				(le16_to_cpu(rl->rl_used) - index - 1) *
+				sizeof(struct ocfs2_refcount_rec));
+			memset(&rl->rl_recs[le16_to_cpu(rl->rl_used) - 1],
+			       0, sizeof(struct ocfs2_refcount_rec));
+		}
+
+		le16_add_cpu(&rl->rl_used, -1);
+	} else
+		ocfs2_refcount_rec_merge(rb, index);
 
 	ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
 	if (ret)
@@ -1899,3 +1915,239 @@ out:
 	brelse(ref_leaf_bh);
 	return ret;
 }
+
+static int ocfs2_remove_refcount_extent(handle_t *handle,
+				struct ocfs2_caching_info *ci,
+				struct buffer_head *ref_root_bh,
+				struct buffer_head *ref_leaf_bh,
+				struct ocfs2_alloc_context *meta_ac,
+				struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+	struct ocfs2_refcount_block *rb =
+			(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+	struct ocfs2_extent_tree et;
+
+	BUG_ON(rb->rf_records.rl_used);
+
+	ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
+	ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos),
+				  1, meta_ac, dealloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_remove_from_cache(ci, ref_leaf_bh);
+
+	/*
+	 * add the freed block to the dealloc so that it will be freed
+	 * when we run dealloc.
+	 */
+	ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE,
+					le16_to_cpu(rb->rf_suballoc_slot),
+					le64_to_cpu(rb->rf_blkno),
+					le16_to_cpu(rb->rf_suballoc_bit));
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+
+	le32_add_cpu(&rb->rf_clusters, -1);
+
+	/*
+	 * check whether we need to restore the root refcount block if
+	 * there is no leaf extent block at atll.
+	 */
+	if (!rb->rf_list.l_next_free_rec) {
+		BUG_ON(rb->rf_clusters);
+
+		mlog(0, "reset refcount tree root %llu to be a record block.\n",
+		     (unsigned long long)ref_root_bh->b_blocknr);
+
+		rb->rf_flags = 0;
+		rb->rf_parent = 0;
+		rb->rf_cpos = 0;
+		memset(&rb->rf_records, 0, sb->s_blocksize -
+		       offsetof(struct ocfs2_refcount_block, rf_records));
+		rb->rf_records.rl_count =
+				cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
+	}
+
+	ocfs2_journal_dirty(handle, ref_root_bh);
+
+out:
+	return ret;
+}
+
+static int ocfs2_decrease_refcount_rec(handle_t *handle,
+				struct ocfs2_caching_info *ci,
+				struct buffer_head *ref_root_bh,
+				struct buffer_head *ref_leaf_bh,
+				int index, u64 cpos, unsigned int len,
+				struct ocfs2_alloc_context *meta_ac,
+				struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret;
+	struct ocfs2_refcount_block *rb =
+			(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+	struct ocfs2_refcount_rec *rec = &rb->rf_records.rl_recs[index];
+
+	BUG_ON(cpos < le64_to_cpu(rec->r_cpos));
+	BUG_ON(cpos + len >
+	       le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters));
+
+	if (cpos == le64_to_cpu(rec->r_cpos) &&
+	    len == le32_to_cpu(rec->r_clusters))
+		ret = ocfs2_change_refcount_rec(handle, ci,
+						ref_leaf_bh, index, -1);
+	else {
+		struct ocfs2_refcount_rec split = *rec;
+		split.r_cpos = cpu_to_le64(cpos);
+		split.r_clusters = cpu_to_le32(len);
+
+		le32_add_cpu(&split.r_refcount, -1);
+
+		mlog(0, "split refcount rec, start %llu, "
+		     "len %u, count %u, original start %llu, len %u\n",
+		     (unsigned long long)le64_to_cpu(split.r_cpos),
+		     len, le32_to_cpu(split.r_refcount),
+		     (unsigned long long)le64_to_cpu(rec->r_cpos),
+		     le32_to_cpu(rec->r_clusters));
+		ret = ocfs2_split_refcount_rec(handle, ci,
+					       ref_root_bh, ref_leaf_bh,
+					       &split, index,
+					       meta_ac, dealloc);
+	}
+
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* Remove the leaf refcount block if it contains no refcount record. */
+	if (!rb->rf_records.rl_used && ref_leaf_bh != ref_root_bh) {
+		ret = ocfs2_remove_refcount_extent(handle, ci, ref_root_bh,
+						   ref_leaf_bh, meta_ac,
+						   dealloc);
+		if (ret)
+			mlog_errno(ret);
+	}
+
+out:
+	return ret;
+}
+
+static int __ocfs2_decrease_refcount(handle_t *handle,
+				     struct ocfs2_caching_info *ci,
+				     struct buffer_head *ref_root_bh,
+				     u64 cpos, u32 len,
+				     struct ocfs2_alloc_context *meta_ac,
+				     struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret = 0, index = 0;
+	struct ocfs2_refcount_rec rec;
+	unsigned int r_count = 0, r_len;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+	struct buffer_head *ref_leaf_bh = NULL;
+
+	mlog(0, "Tree owner %llu, decrease refcount start %llu, len %u\n",
+	     (unsigned long long)ocfs2_metadata_cache_owner(ci),
+	     (unsigned long long)cpos, len);
+
+	while (len) {
+		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
+					     cpos, len, &rec, &index,
+					     &ref_leaf_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		r_count = le32_to_cpu(rec.r_refcount);
+		BUG_ON(r_count == 0);
+
+		r_len = min((u64)(cpos + len), le64_to_cpu(rec.r_cpos) +
+			      le32_to_cpu(rec.r_clusters)) - cpos;
+
+		ret = ocfs2_decrease_refcount_rec(handle, ci, ref_root_bh,
+						  ref_leaf_bh, index,
+						  cpos, r_len,
+						  meta_ac, dealloc);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (le32_to_cpu(rec.r_refcount) == 1) {
+			ret = ocfs2_cache_cluster_dealloc(dealloc,
+					  ocfs2_clusters_to_blocks(sb, cpos),
+							  r_len);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+		}
+
+		cpos += r_len;
+		len -= r_len;
+		brelse(ref_leaf_bh);
+		ref_leaf_bh = NULL;
+	}
+
+out:
+	brelse(ref_leaf_bh);
+	return ret;
+}
+
+/* Caller must hold refcount tree lock. */
+int ocfs2_decrease_refcount(struct inode *inode,
+			    handle_t *handle, u32 cpos, u32 len,
+			    struct ocfs2_alloc_context *meta_ac,
+			    struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret;
+	u64 ref_blkno;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct buffer_head *ref_root_bh = NULL;
+	struct ocfs2_refcount_tree *tree;
+
+	BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+
+	ret = ocfs2_get_refcount_block(inode, &ref_blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno, &tree);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
+					&ref_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = __ocfs2_decrease_refcount(handle, &tree->rf_ci, ref_root_bh,
+					cpos, len, meta_ac, dealloc);
+	if (ret)
+		mlog_errno(ret);
+out:
+	brelse(ref_root_bh);
+	return ret;
+}
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 3c38bbd..60ea02e 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -40,4 +40,9 @@ int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, u64 ref_blkno, int rw,
 void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
 				struct ocfs2_refcount_tree *tree,
 				int rw);
+
+int ocfs2_decrease_refcount(struct inode *inode,
+			    handle_t *handle, u32 cpos, u32 len,
+			    struct ocfs2_alloc_context *meta_ac,
+			    struct ocfs2_cached_dealloc_ctxt *dealloc);
 #endif /* OCFS2_REFCOUNTTREE_H */
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 15/41] ocfs2: Add functions for extents refcounted.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (13 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 14/41] ocfs2: Add support of decrementing refcount for delete Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 16/41] ocfs2: Decrement refcount when truncating refcounted extents Tao Ma
                   ` (26 subsequent siblings)
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

Add function ocfs2_mark_extent_refcounted which can mark
an extent refcounted.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/alloc.c        |   12 ++++++------
 fs/ocfs2/alloc.h        |    6 ++++++
 fs/ocfs2/ocfs2.h        |    7 +++++++
 fs/ocfs2/refcounttree.c |   39 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 58 insertions(+), 6 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 9dd68cd..96f8ca6 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5169,12 +5169,12 @@ out:
  *
  * The caller is responsible for passing down meta_ac if we'll need it.
  */
-static int ocfs2_change_extent_flag(handle_t *handle,
-				    struct ocfs2_extent_tree *et,
-				    u32 cpos, u32 len, u32 phys,
-				    struct ocfs2_alloc_context *meta_ac,
-				    struct ocfs2_cached_dealloc_ctxt *dealloc,
-				    int new_flags, int clear_flags)
+int ocfs2_change_extent_flag(handle_t *handle,
+			     struct ocfs2_extent_tree *et,
+			     u32 cpos, u32 len, u32 phys,
+			     struct ocfs2_alloc_context *meta_ac,
+			     struct ocfs2_cached_dealloc_ctxt *dealloc,
+			     int new_flags, int clear_flags)
 {
 	int ret, index;
 	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 0610ba1..19d5b88 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -128,6 +128,12 @@ int ocfs2_mark_extent_written(struct inode *inode,
 			      handle_t *handle, u32 cpos, u32 len, u32 phys,
 			      struct ocfs2_alloc_context *meta_ac,
 			      struct ocfs2_cached_dealloc_ctxt *dealloc);
+int ocfs2_change_extent_flag(handle_t *handle,
+			     struct ocfs2_extent_tree *et,
+			     u32 cpos, u32 len, u32 phys,
+			     struct ocfs2_alloc_context *meta_ac,
+			     struct ocfs2_cached_dealloc_ctxt *dealloc,
+			     int new_flags, int clear_flags);
 int ocfs2_remove_extent(handle_t *handle, struct ocfs2_extent_tree *et,
 			u32 cpos, u32 len,
 			struct ocfs2_alloc_context *meta_ac,
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index bb53573..eae4046 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -516,6 +516,13 @@ static inline void ocfs2_add_links_count(struct ocfs2_dinode *di, int n)
 	ocfs2_set_links_count(di, links);
 }
 
+static inline int ocfs2_refcount_tree(struct ocfs2_super *osb)
+{
+	if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE)
+		return 1;
+	return 0;
+}
+
 /* set / clear functions because cluster events can make these happen
  * in parallel so we want the transitions to be atomic. this also
  * means that any future flags osb_flags must be protected by spinlock
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 1c6d425..bdc86c9 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2151,3 +2151,42 @@ out:
 	brelse(ref_root_bh);
 	return ret;
 }
+
+/*
+ * Mark the already-existing extent at cpos as refcounted for len clusters.
+ * This adds the refcount extent flag.
+ *
+ * If the existing extent is larger than the request, initiate a
+ * split. An attempt will be made at merging with adjacent extents.
+ *
+ * The caller is responsible for passing down meta_ac if we'll need it.
+ */
+static int ocfs2_mark_extent_refcounted(struct inode *inode,
+				struct ocfs2_extent_tree *et,
+				handle_t *handle, u32 cpos,
+				u32 len, u32 phys,
+				struct ocfs2_alloc_context *meta_ac,
+				struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret;
+
+	mlog(0, "Inode %lu refcount tree cpos %u, len %u, phys cluster %u\n",
+	     inode->i_ino, cpos, len, phys);
+
+	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
+		ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
+			    "tree, but the feature bit is not set in the "
+			    "super block.", inode->i_ino);
+		ret = -EROFS;
+		goto out;
+	}
+
+	ret = ocfs2_change_extent_flag(handle, et, cpos,
+				       len, phys, meta_ac, dealloc,
+				       OCFS2_EXT_REFCOUNTED, 0);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	return ret;
+}
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 16/41] ocfs2: Decrement refcount when truncating refcounted extents.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (14 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 15/41] ocfs2: Add functions for extents refcounted Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 17/41] ocfs2: Add CoW support Tao Ma
                   ` (25 subsequent siblings)
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

Add 'Decrement refcount for delete' in to the normal truncate
process. So for a refcounted extent record, call refcount rec
decrementation instead of cluster free.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/alloc.c        |   76 +++++++++++++++--
 fs/ocfs2/journal.h      |    3 +
 fs/ocfs2/refcounttree.c |  212 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/refcounttree.h |    6 ++
 4 files changed, 290 insertions(+), 7 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 96f8ca6..03438a6 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -49,6 +49,7 @@
 #include "super.h"
 #include "uptodate.h"
 #include "xattr.h"
+#include "refcounttree.h"
 
 #include "buffer_head_io.h"
 
@@ -6673,7 +6674,7 @@ out:
  */
 static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
 			   handle_t *handle, struct ocfs2_truncate_context *tc,
-			   u32 clusters_to_del, u64 *delete_start)
+			   u32 clusters_to_del, u64 *delete_start, u8 *flags)
 {
 	int ret, i, index = path->p_tree_depth;
 	u32 new_edge = 0;
@@ -6683,6 +6684,7 @@ static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
 	struct ocfs2_extent_rec *rec;
 
 	*delete_start = 0;
+	*flags = 0;
 
 	while (index >= 0) {
 		bh = path->p_node[index].bh;
@@ -6770,6 +6772,7 @@ find_tail_record:
 			*delete_start = le64_to_cpu(rec->e_blkno)
 				+ ocfs2_clusters_to_blocks(inode->i_sb,
 					le16_to_cpu(rec->e_leaf_clusters));
+			*flags = rec->e_flags;
 
 			/*
 			 * If it's now empty, remove this record.
@@ -6869,7 +6872,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
 			     struct buffer_head *fe_bh,
 			     handle_t *handle,
 			     struct ocfs2_truncate_context *tc,
-			     struct ocfs2_path *path)
+			     struct ocfs2_path *path,
+			     struct ocfs2_alloc_context *meta_ac)
 {
 	int status;
 	struct ocfs2_dinode *fe;
@@ -6877,6 +6881,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
 	struct ocfs2_extent_list *el;
 	struct buffer_head *last_eb_bh = NULL;
 	u64 delete_blk = 0;
+	u8 rec_flags;
 
 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
@@ -6932,7 +6937,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
 	inode->i_blocks = ocfs2_inode_sector_count(inode);
 
 	status = ocfs2_trim_tree(inode, path, handle, tc,
-				 clusters_to_del, &delete_blk);
+				 clusters_to_del, &delete_blk, &rec_flags);
 	if (status) {
 		mlog_errno(status);
 		goto bail;
@@ -6964,8 +6969,16 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
 	}
 
 	if (delete_blk) {
-		status = ocfs2_truncate_log_append(osb, handle, delete_blk,
-						   clusters_to_del);
+		if (rec_flags & OCFS2_EXT_REFCOUNTED)
+			status = ocfs2_decrease_refcount(inode, handle,
+					ocfs2_blocks_to_clusters(osb->sb,
+								 delete_blk),
+					clusters_to_del, meta_ac,
+					&tc->tc_dealloc);
+		else
+			status = ocfs2_truncate_log_append(osb, handle,
+							   delete_blk,
+							   clusters_to_del);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -7383,11 +7396,14 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
 {
 	int status, i, credits, tl_sem = 0;
 	u32 clusters_to_del, new_highest_cpos, range;
+	u64 blkno = 0;
 	struct ocfs2_extent_list *el;
 	handle_t *handle = NULL;
 	struct inode *tl_inode = osb->osb_tl_inode;
 	struct ocfs2_path *path = NULL;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_refcount_tree *ref_tree = NULL;
 
 	mlog_entry_void();
 
@@ -7413,6 +7429,8 @@ start:
 		goto bail;
 	}
 
+	credits = 0;
+
 	/*
 	 * Truncate always works against the rightmost tree branch.
 	 */
@@ -7453,10 +7471,15 @@ start:
 		clusters_to_del = 0;
 	} else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
 		clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
+		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
 	} else if (range > new_highest_cpos) {
 		clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
 				   le32_to_cpu(el->l_recs[i].e_cpos)) -
 				  new_highest_cpos;
+		blkno = le64_to_cpu(el->l_recs[i].e_blkno) +
+			ocfs2_clusters_to_blocks(inode->i_sb,
+				ocfs2_rec_clusters(el, &el->l_recs[i]) -
+				clusters_to_del);
 	} else {
 		status = 0;
 		goto bail;
@@ -7465,6 +7488,29 @@ start:
 	mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
 	     clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
 
+	if (el->l_recs[i].e_flags & OCFS2_EXT_REFCOUNTED && clusters_to_del) {
+		BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
+			 OCFS2_HAS_REFCOUNT_FL));
+
+		status = ocfs2_lock_refcount_tree(osb,
+						le64_to_cpu(di->i_refcount_loc),
+						1, &ref_tree, NULL);
+		if (status) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		status = ocfs2_prepare_refcount_change_for_del(inode, fe_bh,
+							       blkno,
+							       clusters_to_del,
+							       &credits,
+							       &meta_ac);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
 	mutex_lock(&tl_inode->i_mutex);
 	tl_sem = 1;
 	/* ocfs2_truncate_log_needs_flush guarantees us at least one
@@ -7478,7 +7524,7 @@ start:
 		}
 	}
 
-	credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
+	credits += ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
 						(struct ocfs2_dinode *)fe_bh->b_data,
 						el);
 	handle = ocfs2_start_trans(osb, credits);
@@ -7490,7 +7536,7 @@ start:
 	}
 
 	status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
-				   tc, path);
+				   tc, path, meta_ac);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -7504,6 +7550,16 @@ start:
 
 	ocfs2_reinit_path(path, 1);
 
+	if (meta_ac) {
+		ocfs2_free_alloc_context(meta_ac);
+		meta_ac = NULL;
+	}
+
+	if (ref_tree) {
+		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+		ref_tree = NULL;
+	}
+
 	/*
 	 * The check above will catch the case where we've truncated
 	 * away all allocation.
@@ -7520,6 +7576,12 @@ bail:
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
 
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	if (ref_tree)
+		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+
 	ocfs2_run_deallocs(osb, &tc->tc_dealloc);
 
 	ocfs2_free_path(path);
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index bd88c8b..3f74e09 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -504,6 +504,9 @@ static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb)
  */
 #define OCFS2_REFCOUNT_TREE_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
 
+/* 2 metadata alloc, 2 new blocks and root refcount block */
+#define OCFS2_EXPAND_REFCOUNT_TREE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + 3)
+
 /*
  * Please note that the caller must make sure that root_el is the root
  * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index bdc86c9..6dee650 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2190,3 +2190,215 @@ static int ocfs2_mark_extent_refcounted(struct inode *inode,
 out:
 	return ret;
 }
+
+/*
+ * Given some contiguous physical clusters, calculate what we need
+ * for modifying their refcount.
+ */
+static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
+					    struct ocfs2_caching_info *ci,
+					    struct buffer_head *ref_root_bh,
+					    u64 start_cpos,
+					    u32 clusters,
+					    int *meta_add,
+					    int *credits)
+{
+	int ret = 0, index, ref_blocks = 0, recs_add = 0;
+	u64 cpos = start_cpos;
+	struct ocfs2_refcount_block *rb;
+	struct ocfs2_refcount_rec rec;
+	struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL;
+	u32 len;
+
+	mlog(0, "start_cpos %llu, clusters %u\n",
+	     (unsigned long long)start_cpos, clusters);
+	while (clusters) {
+		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
+					     cpos, clusters, &rec,
+					     &index, &ref_leaf_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (ref_leaf_bh != prev_bh) {
+			/*
+			 * Now we encounter a new leaf block, so calculate
+			 * whether we need to extend the old leaf.
+			 */
+			if (prev_bh) {
+				rb = (struct ocfs2_refcount_block *)
+							prev_bh->b_data;
+
+				if (le64_to_cpu(rb->rf_records.rl_used) +
+				    recs_add >
+				    le16_to_cpu(rb->rf_records.rl_count))
+					ref_blocks++;
+			}
+
+			recs_add = 0;
+			*credits += 1;
+			brelse(prev_bh);
+			prev_bh = ref_leaf_bh;
+			get_bh(prev_bh);
+		}
+
+		rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+
+		mlog(0, "recs_add %d,cpos %llu, clusters %u, rec->r_cpos %llu,"
+		     "rec->r_clusters %u, rec->r_refcount %u, index %d\n",
+		     recs_add, (unsigned long long)cpos, clusters,
+		     (unsigned long long)le64_to_cpu(rec.r_cpos),
+		     le32_to_cpu(rec.r_clusters),
+		     le32_to_cpu(rec.r_refcount), index);
+
+		len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) +
+			  le32_to_cpu(rec.r_clusters)) - cpos;
+		/*
+		 * If the refcount rec already exist, cool. We just need
+		 * to check whether there is a split. Otherwise we just need
+		 * to increase the refcount.
+		 * If we will insert one, increases recs_add.
+		 *
+		 * We record all the records which will be inserted to the
+		 * same refcount block, so that we can tell exactly whether
+		 * we need a new refcount block or not.
+		 */
+		if (rec.r_refcount) {
+			/* Check whether we need a split at the beginning. */
+			if (cpos == start_cpos &&
+			    cpos != le64_to_cpu(rec.r_cpos))
+				recs_add++;
+
+			/* Check whether we need a split in the end. */
+			if (cpos + clusters < le64_to_cpu(rec.r_cpos) +
+			    le32_to_cpu(rec.r_clusters))
+				recs_add++;
+		} else
+			recs_add++;
+
+		brelse(ref_leaf_bh);
+		ref_leaf_bh = NULL;
+		clusters -= len;
+		cpos += len;
+	}
+
+	if (prev_bh) {
+		rb = (struct ocfs2_refcount_block *)prev_bh->b_data;
+
+		if (le64_to_cpu(rb->rf_records.rl_used) + recs_add >
+		    le16_to_cpu(rb->rf_records.rl_count))
+			ref_blocks++;
+
+		*credits += 1;
+	}
+
+	if (!ref_blocks)
+		goto out;
+
+	mlog(0, "we need ref_blocks %d\n", ref_blocks);
+	*meta_add += ref_blocks;
+	*credits += ref_blocks;
+
+	/*
+	 * So we may need ref_blocks to insert into the tree.
+	 * That also means we need to change the b-tree and add that number
+	 * of records since we never merge them.
+	 * We need one more block for expansion since the new created leaf
+	 * block is also full and needs split.
+	 */
+	rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+	if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) {
+		struct ocfs2_extent_tree et;
+
+		ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
+		*meta_add += ocfs2_extend_meta_needed(et.et_root_el);
+		*credits += ocfs2_calc_extend_credits(sb,
+						      et.et_root_el,
+						      ref_blocks);
+	} else {
+		*credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
+		*meta_add += 1;
+	}
+
+out:
+	brelse(ref_leaf_bh);
+	brelse(prev_bh);
+	return ret;
+}
+
+/*
+ * For refcount tree, we will decrease some contiguous clusters
+ * refcount count, so just go through it to see how many blocks
+ * we gonna touch and whether we need to create new blocks.
+ *
+ * Normally the refcount blocks store these refcount should be
+ * continguous also, so that we can get the number easily.
+ * As for meta_ac, we will at most add split 2 refcount record and
+ * 2 more refcount block, so just check it in a rough way.
+ *
+ * Caller must hold refcount tree lock.
+ */
+int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
+					  struct buffer_head *di_bh,
+					  u64 phys_blkno,
+					  u32 clusters,
+					  int *credits,
+					  struct ocfs2_alloc_context **meta_ac)
+{
+	int ret, ref_blocks = 0;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct buffer_head *ref_root_bh = NULL;
+	struct ocfs2_refcount_tree *tree;
+	u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno);
+
+	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
+		ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
+			    "tree, but the feature bit is not set in the "
+			    "super block.", inode->i_ino);
+		ret = -EROFS;
+		goto out;
+	}
+
+	BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+
+	ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
+				      le64_to_cpu(di->i_refcount_loc), &tree);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_read_refcount_block(&tree->rf_ci,
+					le64_to_cpu(di->i_refcount_loc),
+					&ref_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
+					       &tree->rf_ci,
+					       ref_root_bh,
+					       start_cpos, clusters,
+					       &ref_blocks, credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	mlog(0, "reserve new metadata %d, credits = %d\n",
+	     ref_blocks, *credits);
+
+	if (ref_blocks) {
+		ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
+							ref_blocks, meta_ac);
+		if (ret)
+			mlog_errno(ret);
+	}
+
+out:
+	brelse(ref_root_bh);
+	return ret;
+}
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 60ea02e..bd2b1ee 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -45,4 +45,10 @@ int ocfs2_decrease_refcount(struct inode *inode,
 			    handle_t *handle, u32 cpos, u32 len,
 			    struct ocfs2_alloc_context *meta_ac,
 			    struct ocfs2_cached_dealloc_ctxt *dealloc);
+int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
+					  struct buffer_head *di_bh,
+					  u64 phys_blkno,
+					  u32 clusters,
+					  int *credits,
+					  struct ocfs2_alloc_context **meta_ac);
 #endif /* OCFS2_REFCOUNTTREE_H */
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 17/41] ocfs2: Add CoW support.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (15 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 16/41] ocfs2: Decrement refcount when truncating refcounted extents Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-21  0:59   ` Joel Becker
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 18/41] ocfs2: CoW refcount tree improvement Tao Ma
                   ` (24 subsequent siblings)
  41 siblings, 1 reply; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

This patch try CoW support for a refcounted record.

the whole process will be:
1. Calculate how much clusters we need to CoW and where we start.
   Currently we will CoW 1MB at most.
2. Do CoW for the clusters with the help of page cache.
3. Change the b-tree structure with the new allocated clusters.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/alloc.c        |   25 ++-
 fs/ocfs2/alloc.h        |    5 +
 fs/ocfs2/aops.c         |    4 +-
 fs/ocfs2/aops.h         |    2 +
 fs/ocfs2/refcounttree.c |  677 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/refcounttree.h |    2 +
 6 files changed, 704 insertions(+), 11 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 03438a6..b8fc95d 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6998,9 +6998,9 @@ static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
 	return 0;
 }
 
-static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
-				     unsigned int from, unsigned int to,
-				     struct page *page, int zero, u64 *phys)
+void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
+			      unsigned int from, unsigned int to,
+			      struct page *page, int zero, u64 *phys)
 {
 	int ret, partial = 0;
 
@@ -7068,20 +7068,16 @@ out:
 		ocfs2_unlock_and_free_pages(pages, numpages);
 }
 
-static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
-				struct page **pages, int *num)
+int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
+		     struct page **pages, int *num)
 {
 	int numpages, ret = 0;
-	struct super_block *sb = inode->i_sb;
 	struct address_space *mapping = inode->i_mapping;
 	unsigned long index;
 	loff_t last_page_bytes;
 
 	BUG_ON(start > end);
 
-	BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
-	       (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
-
 	numpages = 0;
 	last_page_bytes = PAGE_ALIGN(end);
 	index = start >> PAGE_CACHE_SHIFT;
@@ -7109,6 +7105,17 @@ out:
 	return ret;
 }
 
+static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
+				struct page **pages, int *num)
+{
+	struct super_block *sb = inode->i_sb;
+
+	BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
+	       (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
+
+	return ocfs2_grab_pages(inode, start, end, pages, num);
+}
+
 /*
  * Zero the area past i_size but still within an allocated
  * cluster. This avoids exposing nonzero data on subsequent file
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 19d5b88..9c122d5 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -271,6 +271,11 @@ static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
 	return !rec->e_leaf_clusters;
 }
 
+int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
+		     struct page **pages, int *num);
+void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
+			      unsigned int from, unsigned int to,
+			      struct page *page, int zero, u64 *phys);
 /*
  * Structures which describe a path through a btree, and functions to
  * manipulate them.
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 7208ea9..65e17b3 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -126,8 +126,8 @@ bail:
 	return err;
 }
 
-static int ocfs2_get_block(struct inode *inode, sector_t iblock,
-			   struct buffer_head *bh_result, int create)
+int ocfs2_get_block(struct inode *inode, sector_t iblock,
+		    struct buffer_head *bh_result, int create)
 {
 	int err = 0;
 	unsigned int ext_flags;
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 503e492..c48e93f 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -57,6 +57,8 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
 			   struct buffer_head *di_bh);
 int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size);
 
+int ocfs2_get_block(struct inode *inode, sector_t iblock,
+		    struct buffer_head *bh_result, int create);
 /* all ocfs2_dio_end_io()'s fault */
 #define ocfs2_iocb_is_rw_locked(iocb) \
 	test_bit(0, (unsigned long *)&iocb->private)
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 6dee650..5adcd6a 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -31,6 +31,27 @@
 #include "sysfile.h"
 #include "dlmglue.h"
 #include "extent_map.h"
+#include "aops.h"
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include <linux/swap.h>
+
+struct ocfs2_cow_context {
+	struct inode *inode;
+	u32 cow_start;
+	u32 cow_len;
+	struct ocfs2_extent_tree di_et;
+	struct ocfs2_caching_info *ref_ci;
+	struct buffer_head *ref_root_bh;
+	struct ocfs2_alloc_context *meta_ac;
+	struct ocfs2_alloc_context *data_ac;
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+};
 
 static inline struct ocfs2_refcount_tree *
 cache_info_to_refcount(struct ocfs2_caching_info *ci)
@@ -2402,3 +2423,659 @@ out:
 	brelse(ref_root_bh);
 	return ret;
 }
+
+#define	MAX_COW_BYTES	1048576
+/*
+ * Calculate out the start and number of virtual clusters we need to to CoW.
+ *
+ * cpos is vitual start cluster position we want to do CoW in a
+ * file and write_len is the cluster length.
+ *
+ * Normal we will start CoW from the beginning of extent record cotaining cpos.
+ * And We will try to Cow as much clusters as we can until we reach
+ * MAX_COW_BYTES. If the write_len is larger than MAX_COW_BYTES, we will
+ * use that value as the maximum clusters.
+ */
+static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
+					   struct buffer_head *di_bh,
+					   u32 cpos,
+					   u32 write_len,
+					   u32 *cow_start,
+					   u32 *cow_len)
+{
+	int ret = 0;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
+	struct ocfs2_extent_list *el = &di->id2.i_list;
+	int tree_height = le16_to_cpu(el->l_tree_depth), i;
+	struct buffer_head *eb_bh = NULL;
+	struct ocfs2_extent_block *eb = NULL;
+	struct ocfs2_extent_rec *rec;
+	int max_clusters = ocfs2_clusters_for_bytes(inode->i_sb, MAX_COW_BYTES);
+	int leaf_clusters, rec_end = 0;
+
+	max_clusters = max_clusters < write_len ? write_len : max_clusters;
+	if (tree_height > 0) {
+		ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+
+		if (el->l_tree_depth) {
+			ocfs2_error(inode->i_sb,
+				    "Inode %lu has non zero tree depth in "
+				    "leaf block %llu\n", inode->i_ino,
+				    (unsigned long long)eb_bh->b_blocknr);
+			ret = -EROFS;
+			goto out;
+		}
+	}
+
+	*cow_len = 0;
+	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+		rec = &el->l_recs[i];
+
+		if (ocfs2_is_empty_extent(rec)) {
+			mlog_bug_on_msg(i != 0, "Inode %lu has empty record in "
+					"index %d\n", inode->i_ino, i);
+			continue;
+		}
+
+		if (le32_to_cpu(rec->e_cpos) +
+		    le16_to_cpu(rec->e_leaf_clusters) <= cpos)
+			continue;
+
+		if (*cow_len == 0) {
+			/*
+			 * We should find a refcounted record in the
+			 * first pass.
+			 */
+			BUG_ON(!(rec->e_flags & OCFS2_EXT_REFCOUNTED));
+			*cow_start = le32_to_cpu(rec->e_cpos);
+		}
+
+		/*
+		 * If we encounter a hole or a non-refcounted record,
+		 * stop the search.
+		 */
+		if ((!(rec->e_flags & OCFS2_EXT_REFCOUNTED)) ||
+		    (*cow_len && rec_end != le32_to_cpu(rec->e_cpos)))
+			break;
+
+		leaf_clusters = le16_to_cpu(rec->e_leaf_clusters);
+		rec_end = le32_to_cpu(rec->e_cpos) + leaf_clusters;
+
+		if (*cow_len + leaf_clusters >= max_clusters) {
+			if (*cow_len == 0) {
+				/*
+				 * cpos is in a very large extent record.
+				 * So just split max_clusters from the
+				 * extent record.
+				 */
+				if ((rec_end - cpos) <= max_clusters) {
+					/*
+					 * We can take max_clusters off
+					 * the end and cover all of our
+					 * write.
+					 */
+					*cow_start = rec_end - max_clusters;
+				} else if ((*cow_start + max_clusters) >
+					   (cpos + write_len)) {
+					/*
+					 * We can take max_clusters off
+					 * the front and cover all of
+					 * our write.
+					 */
+					/* NOOP, *cow_start is already set */
+				} else {
+					/*
+					 * We're CoWing more data than
+					 * write_len for contiguousness,
+					 * but it doesn't fit at the
+					 * front or end of this extent.
+					 * Let's try to slice the extent
+					 * up nicely.  Optimally, our
+					 * CoW region starts at a
+					 * multiple of max_clusters.  If
+					 * that doesn't fit, we give up
+					 * and just CoW at cpos.
+					 */
+					*cow_start +=
+						(cpos - *cow_start) &
+							~(max_clusters - 1);
+					if ((*cow_start + max_clusters) <
+					    (cpos + write_len))
+						*cow_start = cpos;
+				}
+			}
+			*cow_len = max_clusters;
+			break;
+		} else
+			*cow_len += leaf_clusters;
+
+		/*
+		 * If we reach the end of the extent block and don't get enough
+		 * clusters, continue with the next extent block if possible.
+		 */
+		if (i + 1 == le16_to_cpu(el->l_next_free_rec) &&
+		    eb && eb->h_next_leaf_blk) {
+			brelse(eb_bh);
+			eb_bh = NULL;
+
+			ret = ocfs2_read_extent_block(INODE_CACHE(inode),
+					       le64_to_cpu(eb->h_next_leaf_blk),
+					       &eb_bh);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+			el = &eb->h_list;
+			i = -1;
+		}
+	}
+
+out:
+	brelse(eb_bh);
+	return ret;
+}
+
+/*
+ * Prepare meta_ac, data_ac and calculate credits when we want to add some
+ * num_clusters in data_tree "et" and change the refcount for the old
+ * clusters(starting form p_cluster) in the refcount tree.
+ *
+ * Note:
+ * 1. since we may split the old tree, so we at most will need num_clusters + 2
+ *    more new leaf records.
+ * 2. In some case, we may not need to reserve new clusters(e.g, reflink), so
+ *    just give data_ac = NULL.
+ */
+static int ocfs2_lock_refcount_allocators(struct super_block *sb,
+					u32 p_cluster, u32 num_clusters,
+					struct ocfs2_extent_tree *et,
+					struct ocfs2_caching_info *ref_ci,
+					struct buffer_head *ref_root_bh,
+					struct ocfs2_alloc_context **meta_ac,
+					struct ocfs2_alloc_context **data_ac,
+					int *credits)
+{
+	int ret = 0, meta_add = 0;
+	int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et);
+
+	if (num_free_extents < 0) {
+		ret = num_free_extents;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (num_free_extents < num_clusters + 2)
+		meta_add =
+			ocfs2_extend_meta_needed(et->et_root_el);
+
+	*credits += ocfs2_calc_extend_credits(sb, et->et_root_el,
+					      num_clusters + 2);
+
+	ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh,
+					       p_cluster, num_clusters,
+					       &meta_add, credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	mlog(0, "reserve new metadata %d, clusters %u, credits = %d\n",
+	     meta_add, num_clusters, *credits);
+	ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add,
+						meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (data_ac) {
+		ret = ocfs2_reserve_clusters(OCFS2_SB(sb), num_clusters,
+					     data_ac);
+		if (ret)
+			mlog_errno(ret);
+	}
+
+out:
+	if (ret) {
+		if (*meta_ac) {
+			ocfs2_free_alloc_context(*meta_ac);
+			*meta_ac = NULL;
+		}
+	}
+
+	return ret;
+}
+
+static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
+{
+	BUG_ON(buffer_dirty(bh));
+
+	clear_buffer_mapped(bh);
+
+	return 0;
+}
+
+static int ocfs2_duplicate_clusters(handle_t *handle,
+				    struct ocfs2_cow_context *context,
+				    u32 cpos, u32 old_cluster,
+				    u32 new_cluster, u32 new_len)
+{
+	int ret = 0, partial;
+	struct ocfs2_caching_info *ci = context->di_et.et_ci;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+	u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
+	struct page *page;
+	pgoff_t page_index;
+	unsigned int from, to;
+	loff_t offset, end, map_end;
+	struct address_space *mapping = context->inode->i_mapping;
+
+	mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster,
+	     new_cluster, new_len, cpos);
+
+	offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
+	end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
+
+	while (offset < end) {
+		page_index = offset >> PAGE_CACHE_SHIFT;
+		map_end = (page_index + 1) << PAGE_CACHE_SHIFT;
+		if (map_end > end)
+			map_end = end;
+
+		/* from, to is the offset within the page. */
+		from = offset & (PAGE_CACHE_SIZE - 1);
+		to = PAGE_CACHE_SIZE;
+		if (map_end & (PAGE_CACHE_SIZE - 1))
+			to = map_end & (PAGE_CACHE_SIZE - 1);
+
+		page = grab_cache_page(mapping, page_index);
+
+		/* This page can't be dirtied before we CoW it out. */
+		BUG_ON(PageDirty(page));
+
+		if (!PageUptodate(page)) {
+			ret = block_read_full_page(page, ocfs2_get_block);
+			if (ret) {
+				mlog_errno(ret);
+				goto unlock;
+			}
+			lock_page(page);
+		}
+
+		if (page_has_buffers(page)) {
+			ret = walk_page_buffers(handle, page_buffers(page),
+						from, to, &partial,
+						ocfs2_clear_cow_buffer);
+			if (ret) {
+				mlog_errno(ret);
+				goto unlock;
+			}
+		}
+
+		ocfs2_map_and_dirty_page(context->inode,
+					 handle, from, to,
+					 page, 0, &new_block);
+		mark_page_accessed(page);
+unlock:
+		unlock_page(page);
+		page_cache_release(page);
+		page = NULL;
+		offset = map_end;
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+static int ocfs2_clear_ext_refcount(handle_t *handle,
+				    struct ocfs2_extent_tree *et,
+				    u32 cpos, u32 p_cluster, u32 len,
+				    unsigned int ext_flags,
+				    struct ocfs2_alloc_context *meta_ac,
+				    struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret, index;
+	struct ocfs2_extent_rec replace_rec;
+	struct ocfs2_path *path = NULL;
+	struct ocfs2_extent_list *el;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
+	u64 ino = ocfs2_metadata_cache_owner(et->et_ci);
+
+	mlog(0, "inode %llu cpos %u, len %u, p_cluster %u, ext_flags %u\n",
+	     (unsigned long long)ino, cpos, len, p_cluster, ext_flags);
+
+	memset(&replace_rec, 0, sizeof(replace_rec));
+	replace_rec.e_cpos = cpu_to_le32(cpos);
+	replace_rec.e_leaf_clusters = cpu_to_le16(len);
+	replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(sb,
+								   p_cluster));
+	replace_rec.e_flags = ext_flags;
+	replace_rec.e_flags &= ~OCFS2_EXT_REFCOUNTED;
+
+	path = ocfs2_new_path_from_et(et);
+	if (!path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_find_path(et->et_ci, path, cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	el = path_leaf_el(path);
+
+	index = ocfs2_search_extent_list(el, cpos);
+	if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+		ocfs2_error(sb,
+			    "Inode %llu has an extent at cpos %u which can no "
+			    "longer be found.\n",
+			    (unsigned long long)ino, cpos);
+		ret = -EROFS;
+		goto out;
+	}
+
+	ret = ocfs2_split_extent(handle, et, path, index,
+				 &replace_rec, meta_ac, dealloc);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	ocfs2_free_path(path);
+	return ret;
+}
+
+static int ocfs2_replace_clusters(handle_t *handle,
+				  struct ocfs2_cow_context *context,
+				  u32 cpos, u32 old,
+				  u32 new, u32 len,
+				  unsigned int ext_flags)
+{
+	int ret;
+	struct ocfs2_caching_info *ci = context->di_et.et_ci;
+	u64 ino = ocfs2_metadata_cache_owner(ci);
+
+	mlog(0, "inode %llu, cpos %u, old %u, new %u, len %u, ext_flags %u\n",
+	     (unsigned long long)ino, cpos, old, new, len, ext_flags);
+
+	/*If the old clusters is unwritten, no need to duplicate. */
+	if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
+		ret = ocfs2_duplicate_clusters(handle, context, cpos,
+					       old, new, len);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	ret = ocfs2_clear_ext_refcount(handle, &context->di_et,
+				       cpos, new, len, ext_flags,
+				       context->meta_ac, &context->dealloc);
+	if (ret)
+		mlog_errno(ret);
+out:
+	return ret;
+}
+
+static int ocfs2_cow_sync_writeback(struct super_block *sb,
+				    struct ocfs2_cow_context *context,
+				    u32 cpos, u32 num_clusters)
+{
+	int ret = 0;
+	loff_t offset, end, map_end;
+	pgoff_t page_index;
+	struct page *page;
+
+	if (ocfs2_should_order_data(context->inode))
+		return 0;
+
+	offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
+	end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits);
+
+	ret = filemap_fdatawrite_range(context->inode->i_mapping,
+				       offset, end - 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	while (offset < end) {
+		page_index = offset >> PAGE_CACHE_SHIFT;
+		map_end = (page_index + 1) << PAGE_CACHE_SHIFT;
+		if (map_end > end)
+			map_end = end;
+
+		page = grab_cache_page(context->inode->i_mapping, page_index);
+		BUG_ON(!page);
+
+		wait_on_page_writeback(page);
+		if (PageError(page)) {
+			ret = -EIO;
+			mlog_errno(ret);
+		} else
+			mark_page_accessed(page);
+
+		unlock_page(page);
+		page_cache_release(page);
+		page = NULL;
+		offset = map_end;
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+static int ocfs2_make_clusters_writable(struct super_block *sb,
+					struct ocfs2_cow_context *context,
+					u32 cpos, u32 p_cluster,
+					u32 num_clusters, unsigned int e_flags)
+{
+	int ret, credits =  0;
+	u32 new_bit, new_len;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	handle_t *handle;
+
+	ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters,
+					     &context->di_et,
+					     context->ref_ci,
+					     context->ref_root_bh,
+					     &context->meta_ac,
+					     &context->data_ac, &credits);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	while (num_clusters) {
+		ret = __ocfs2_claim_clusters(osb, handle, context->data_ac,
+					     1, num_clusters,
+					     &new_bit, &new_len);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		ret = ocfs2_replace_clusters(handle, context,
+					     cpos, p_cluster, new_bit,
+					     new_len, e_flags);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		cpos += new_len;
+		p_cluster += new_len;
+		num_clusters -= new_len;
+	}
+
+	ret = __ocfs2_decrease_refcount(handle, context->ref_ci,
+					context->ref_root_bh,
+					p_cluster, num_clusters,
+					context->meta_ac,
+					&context->dealloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	/*
+	 * Here we should write the new page out first if we are
+	 * in write-back mode.
+	 */
+	ret = ocfs2_cow_sync_writeback(sb, context, cpos, num_clusters);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out:
+	if (context->data_ac) {
+		ocfs2_free_alloc_context(context->data_ac);
+		context->data_ac = NULL;
+	}
+	if (context->meta_ac) {
+		ocfs2_free_alloc_context(context->meta_ac);
+		context->meta_ac = NULL;
+	}
+
+	return ret;
+}
+
+static int ocfs2_replace_cow(struct inode *inode,
+			     struct buffer_head *di_bh,
+			     struct buffer_head *ref_root_bh,
+			     struct ocfs2_caching_info *ref_ci,
+			     u32 cow_start, u32 cow_len)
+{
+	int ret;
+	u32 p_cluster, num_clusters, start = cow_start;
+	unsigned int ext_flags;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_cow_context *context;
+
+	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
+		ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
+			    "tree, but the feature bit is not set in the "
+			    "super block.", inode->i_ino);
+		return -EROFS;
+	}
+
+	context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
+	if (!context) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		return ret;
+	}
+
+	context->inode = inode;
+	context->cow_start = cow_start;
+	context->cow_len = cow_len;
+	context->ref_ci = ref_ci;
+	context->ref_root_bh = ref_root_bh;
+
+	ocfs2_init_dealloc_ctxt(&context->dealloc);
+	ocfs2_init_dinode_extent_tree(&context->di_et,
+				      INODE_CACHE(inode), di_bh);
+
+	while (cow_len) {
+		ret = ocfs2_get_clusters(inode, cow_start, &p_cluster,
+					 &num_clusters, &ext_flags);
+
+		BUG_ON(!(ext_flags & OCFS2_EXT_REFCOUNTED));
+
+		if (cow_len < num_clusters)
+			num_clusters = cow_len;
+
+		ret = ocfs2_make_clusters_writable(inode->i_sb, context,
+						   cow_start, p_cluster,
+						   num_clusters, ext_flags);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		cow_len -= num_clusters;
+		cow_start += num_clusters;
+	}
+
+
+	/*
+	 * truncate the extent map here since no matter whether we meet with
+	 * any error during the action, we shouldn't trust cached extent map
+	 * any more.
+	 */
+	ocfs2_extent_map_trunc(inode, start);
+
+	if (ocfs2_dealloc_has_cluster(&context->dealloc)) {
+		ocfs2_schedule_truncate_log_flush(osb, 1);
+		ocfs2_run_deallocs(osb, &context->dealloc);
+	}
+
+	kfree(context);
+	return ret;
+}
+
+int ocfs2_refcount_cow(struct inode *inode,
+		       struct buffer_head *di_bh,
+		       u32 cpos, u32 write_len)
+{
+	int ret;
+	u32 cow_start = 0, cow_len = 0;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct buffer_head *ref_root_bh = NULL;
+	struct ocfs2_refcount_tree *ref_tree;
+
+	BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+
+	ret = ocfs2_refcount_cal_cow_clusters(inode, di_bh, cpos, write_len,
+					      &cow_start, &cow_len);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	mlog(0, "CoW inode %lu, cpos %u, write_len %u, cow_start %u, "
+	     "cow_len %u\n", inode->i_ino,
+	     cpos, write_len, cow_start, cow_len);
+
+	BUG_ON(cow_len == 0);
+
+	ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
+				       1, &ref_tree, &ref_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_replace_cow(inode, di_bh, ref_root_bh, &ref_tree->rf_ci,
+				cow_start, cow_len);
+	if (ret)
+		mlog_errno(ret);
+
+	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+	brelse(ref_root_bh);
+out:
+	return ret;
+}
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index bd2b1ee..949ff45 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -51,4 +51,6 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
 					  u32 clusters,
 					  int *credits,
 					  struct ocfs2_alloc_context **meta_ac);
+int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh,
+		       u32 cpos, u32 write_len);
 #endif /* OCFS2_REFCOUNTTREE_H */
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 18/41] ocfs2: CoW refcount tree improvement.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (16 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 17/41] ocfs2: Add CoW support Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 19/41] ocfs2: Integrate CoW in file write Tao Ma
                   ` (23 subsequent siblings)
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

During CoW, if the old extent record is refcounted, we allocate
som new clusters and do CoW. Actually we can have some improvement
here. If the old extent has refcount=1, that means now it is only
used by this file. So we don't need to allocate new clusters, just
remove the refcounted flag and it is OK. We also have to remove
it from the refcount tree while not deleting it.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/alloc.c        |    2 +-
 fs/ocfs2/refcounttree.c |  104 +++++++++++++++++++++++++++++++++++------------
 fs/ocfs2/refcounttree.h |    3 +-
 3 files changed, 81 insertions(+), 28 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index b8fc95d..7c879fc 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6974,7 +6974,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
 					ocfs2_blocks_to_clusters(osb->sb,
 								 delete_blk),
 					clusters_to_del, meta_ac,
-					&tc->tc_dealloc);
+					&tc->tc_dealloc, 1);
 		else
 			status = ocfs2_truncate_log_append(osb, handle,
 							   delete_blk,
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 5adcd6a..a3aeaea 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2074,7 +2074,8 @@ static int __ocfs2_decrease_refcount(handle_t *handle,
 				     struct buffer_head *ref_root_bh,
 				     u64 cpos, u32 len,
 				     struct ocfs2_alloc_context *meta_ac,
-				     struct ocfs2_cached_dealloc_ctxt *dealloc)
+				     struct ocfs2_cached_dealloc_ctxt *dealloc,
+				     int delete)
 {
 	int ret = 0, index = 0;
 	struct ocfs2_refcount_rec rec;
@@ -2082,9 +2083,10 @@ static int __ocfs2_decrease_refcount(handle_t *handle,
 	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
 	struct buffer_head *ref_leaf_bh = NULL;
 
-	mlog(0, "Tree owner %llu, decrease refcount start %llu, len %u\n",
+	mlog(0, "Tree owner %llu, decrease refcount start %llu, "
+	     "len %u, delete %u\n",
 	     (unsigned long long)ocfs2_metadata_cache_owner(ci),
-	     (unsigned long long)cpos, len);
+	     (unsigned long long)cpos, len, delete);
 
 	while (len) {
 		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
@@ -2097,6 +2099,8 @@ static int __ocfs2_decrease_refcount(handle_t *handle,
 
 		r_count = le32_to_cpu(rec.r_refcount);
 		BUG_ON(r_count == 0);
+		if (!delete)
+			BUG_ON(r_count > 1);
 
 		r_len = min((u64)(cpos + len), le64_to_cpu(rec.r_cpos) +
 			      le32_to_cpu(rec.r_clusters)) - cpos;
@@ -2110,7 +2114,7 @@ static int __ocfs2_decrease_refcount(handle_t *handle,
 			goto out;
 		}
 
-		if (le32_to_cpu(rec.r_refcount) == 1) {
+		if (le32_to_cpu(rec.r_refcount) == 1 && delete) {
 			ret = ocfs2_cache_cluster_dealloc(dealloc,
 					  ocfs2_clusters_to_blocks(sb, cpos),
 							  r_len);
@@ -2135,7 +2139,8 @@ out:
 int ocfs2_decrease_refcount(struct inode *inode,
 			    handle_t *handle, u32 cpos, u32 len,
 			    struct ocfs2_alloc_context *meta_ac,
-			    struct ocfs2_cached_dealloc_ctxt *dealloc)
+			    struct ocfs2_cached_dealloc_ctxt *dealloc,
+			    int delete)
 {
 	int ret;
 	u64 ref_blkno;
@@ -2165,7 +2170,7 @@ int ocfs2_decrease_refcount(struct inode *inode,
 	}
 
 	ret = __ocfs2_decrease_refcount(handle, &tree->rf_ci, ref_root_bh,
-					cpos, len, meta_ac, dealloc);
+					cpos, len, meta_ac, dealloc, delete);
 	if (ret)
 		mlog_errno(ret);
 out:
@@ -2883,10 +2888,16 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
 					u32 cpos, u32 p_cluster,
 					u32 num_clusters, unsigned int e_flags)
 {
-	int ret, credits =  0;
+	int ret, delete, index, credits =  0;
 	u32 new_bit, new_len;
+	unsigned int set_len;
 	struct ocfs2_super *osb = OCFS2_SB(sb);
 	handle_t *handle;
+	struct buffer_head *ref_leaf_bh = NULL;
+	struct ocfs2_refcount_rec rec;
+
+	mlog(0, "cpos %u, p_cluster %u, num_clusters %u, e_flags %u\n",
+	     cpos, p_cluster, num_clusters, e_flags);
 
 	ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters,
 					     &context->di_et,
@@ -2907,35 +2918,75 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
 	}
 
 	while (num_clusters) {
-		ret = __ocfs2_claim_clusters(osb, handle, context->data_ac,
-					     1, num_clusters,
-					     &new_bit, &new_len);
+		ret = ocfs2_get_refcount_rec(context->ref_ci,
+					     context->ref_root_bh,
+					     p_cluster, num_clusters,
+					     &rec, &index, &ref_leaf_bh);
 		if (ret) {
 			mlog_errno(ret);
 			goto out_commit;
 		}
 
-		ret = ocfs2_replace_clusters(handle, context,
-					     cpos, p_cluster, new_bit,
-					     new_len, e_flags);
+		BUG_ON(!rec.r_refcount);
+		set_len = min((u64)p_cluster + num_clusters,
+			      le64_to_cpu(rec.r_cpos) +
+			      le32_to_cpu(rec.r_clusters)) - p_cluster;
+
+		/*
+		 * There are many different situation here.
+		 * 1. If refcount == 1, remove the flag and don't COW.
+		 * 2. If refcount > 1, allocate clusters.
+		 *    Here we may not allocate r_len once at a time, so continue
+		 *    until we reach num_clusters.
+		 */
+		if (le32_to_cpu(rec.r_refcount) == 1) {
+			delete = 0;
+			ret = ocfs2_clear_ext_refcount(handle, &context->di_et,
+						       cpos, p_cluster,
+						       set_len, e_flags,
+						       context->meta_ac,
+						       &context->dealloc);
+			if (ret) {
+				mlog_errno(ret);
+				goto out_commit;
+			}
+		} else {
+			delete = 1;
+
+			ret = __ocfs2_claim_clusters(osb, handle,
+						     context->data_ac,
+						     1, set_len,
+						     &new_bit, &new_len);
+			if (ret) {
+				mlog_errno(ret);
+				goto out_commit;
+			}
+
+			ret = ocfs2_replace_clusters(handle, context,
+						     cpos, p_cluster, new_bit,
+						     new_len, e_flags);
+			if (ret) {
+				mlog_errno(ret);
+				goto out_commit;
+			}
+			set_len = new_len;
+		}
+
+		ret = __ocfs2_decrease_refcount(handle, context->ref_ci,
+						context->ref_root_bh,
+						p_cluster, set_len,
+						context->meta_ac,
+						&context->dealloc, delete);
 		if (ret) {
 			mlog_errno(ret);
 			goto out_commit;
 		}
 
-		cpos += new_len;
-		p_cluster += new_len;
-		num_clusters -= new_len;
-	}
-
-	ret = __ocfs2_decrease_refcount(handle, context->ref_ci,
-					context->ref_root_bh,
-					p_cluster, num_clusters,
-					context->meta_ac,
-					&context->dealloc);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
+		cpos += set_len;
+		p_cluster += set_len;
+		num_clusters -= set_len;
+		brelse(ref_leaf_bh);
+		ref_leaf_bh = NULL;
 	}
 
 	/*
@@ -2958,6 +3009,7 @@ out:
 		ocfs2_free_alloc_context(context->meta_ac);
 		context->meta_ac = NULL;
 	}
+	brelse(ref_leaf_bh);
 
 	return ret;
 }
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 949ff45..bc40af1 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -44,7 +44,8 @@ void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
 int ocfs2_decrease_refcount(struct inode *inode,
 			    handle_t *handle, u32 cpos, u32 len,
 			    struct ocfs2_alloc_context *meta_ac,
-			    struct ocfs2_cached_dealloc_ctxt *dealloc);
+			    struct ocfs2_cached_dealloc_ctxt *dealloc,
+			    int delete);
 int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
 					  struct buffer_head *di_bh,
 					  u64 phys_blkno,
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 19/41] ocfs2: Integrate CoW in file write.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (17 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 18/41] ocfs2: CoW refcount tree improvement Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-21  1:04   ` Joel Becker
  2009-08-21 21:12   ` Joel Becker
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 20/41] ocfs2: CoW a reflinked cluster when it is truncated Tao Ma
                   ` (22 subsequent siblings)
  41 siblings, 2 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

In ocfs2_write_begin_nolock, when we find the cluster we
want to write is a reflinked cluster, do CoW first.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/aops.c |   49 ++++++++++++++++++++++++++++++++++++++++++++++---
 1 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 65e17b3..02244bb 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -44,6 +44,7 @@
 #include "suballoc.h"
 #include "super.h"
 #include "symlink.h"
+#include "refcounttree.h"
 
 #include "buffer_head_io.h"
 
@@ -1411,18 +1412,29 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
 	}
 }
 
+static inline void ocfs2_clear_write_desc(struct ocfs2_write_ctxt *wc)
+{
+	memset(&wc->w_desc, 0,
+	       sizeof(struct ocfs2_write_cluster_desc) * wc->w_clen);
+}
 /*
  * Populate each single-cluster write descriptor in the write context
  * with information about the i/o to be done.
+ * If we encountered a refcounted cluster, break the process and return
+ * the refcounted start cpos.
  *
  * Returns the number of clusters that will have to be allocated, as
  * well as a worst case estimate of the number of extent records that
  * would have to be created during a write to an unwritten region.
+ *
+ * If we find a refcounted record, return directly with refcounted_cpos
+ * set as the position.
  */
 static int ocfs2_populate_write_desc(struct inode *inode,
 				     struct ocfs2_write_ctxt *wc,
 				     unsigned int *clusters_to_alloc,
-				     unsigned int *extents_to_split)
+				     unsigned int *extents_to_split,
+				     unsigned int *refcounted_cpos)
 {
 	int ret;
 	struct ocfs2_write_cluster_desc *desc;
@@ -1433,6 +1445,7 @@ static int ocfs2_populate_write_desc(struct inode *inode,
 
 	*clusters_to_alloc = 0;
 	*extents_to_split = 0;
+	*refcounted_cpos = UINT_MAX;
 
 	for (i = 0; i < wc->w_clen; i++) {
 		desc = &wc->w_desc[i];
@@ -1449,6 +1462,11 @@ static int ocfs2_populate_write_desc(struct inode *inode,
 				goto out;
 			}
 
+			if (ext_flags & OCFS2_EXT_REFCOUNTED) {
+				*refcounted_cpos = desc->c_cpos;
+				ret = -ETXTBSY;
+				goto out;
+			}
 			/*
 			 * Assume worst case - that we're writing in
 			 * the middle of the extent.
@@ -1674,6 +1692,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 	struct ocfs2_alloc_context *meta_ac = NULL;
 	handle_t *handle;
 	struct ocfs2_extent_tree et;
+	unsigned int refcounted_cpos, cow_len;
 
 	ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
 	if (ret) {
@@ -1701,12 +1720,36 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 	}
 
 	ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
-					&extents_to_split);
-	if (ret) {
+					&extents_to_split, &refcounted_cpos);
+	if (ret && ret != -ETXTBSY) {
 		mlog_errno(ret);
 		goto out;
 	}
 
+	if (ret == -ETXTBSY) {
+		BUG_ON(refcounted_cpos == UINT_MAX);
+		cow_len = wc->w_clen - (refcounted_cpos - wc->w_cpos);
+
+		ret = ocfs2_refcount_cow(inode, di_bh,
+					 refcounted_cpos, cow_len);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/* reinitialize write_desc and populate it again. */
+		ocfs2_clear_write_desc(wc);
+		ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
+						&extents_to_split,
+						&refcounted_cpos);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		BUG_ON(refcounted_cpos != UINT_MAX);
+	}
+
 	di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
 
 	/*
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 20/41] ocfs2: CoW a reflinked cluster when it is truncated.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (18 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 19/41] ocfs2: Integrate CoW in file write Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 21/41] ocfs2: Add normal functions for reflink a normal file's extents Tao Ma
                   ` (21 subsequent siblings)
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

When we truncate a file to a specific size which resides in a reflinked
cluster, we need to CoW it since ocfs2_zero_range_for_truncate will
zero the space after the size(just another type of write).

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/aops.c         |    2 +-
 fs/ocfs2/file.c         |   45 +++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/refcounttree.c |   12 ++++++++++--
 fs/ocfs2/refcounttree.h |    2 +-
 4 files changed, 57 insertions(+), 4 deletions(-)

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 02244bb..c94febc 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1731,7 +1731,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 		cow_len = wc->w_clen - (refcounted_cpos - wc->w_cpos);
 
 		ret = ocfs2_refcount_cow(inode, di_bh,
-					 refcounted_cpos, cow_len);
+					 refcounted_cpos, cow_len, UINT_MAX);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 4921b4e..1254d72 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -59,6 +59,7 @@
 #include "xattr.h"
 #include "acl.h"
 #include "quota.h"
+#include "refcounttree.h"
 
 #include "buffer_head_io.h"
 
@@ -334,6 +335,39 @@ out:
 	return ret;
 }
 
+static int ocfs2_cow_file_pos(struct inode *inode,
+			      struct buffer_head *fe_bh,
+			      u64 offset)
+{
+	int status;
+	u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+	unsigned int num_clusters = 0;
+	unsigned int ext_flags = 0;
+
+	/*
+	 * If the new offset is aligned to the range of the cluster, there is
+	 * no space for ocfs2_zero_range_for_truncate to fill, so no need to
+	 * CoW either.
+	 */
+	if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0)
+		return 0;
+
+	status = ocfs2_get_clusters(inode, cpos, &phys,
+				    &num_clusters, &ext_flags);
+	if (status) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
+		goto out;
+
+	return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1);
+
+out:
+	return status;
+}
+
 static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 				     struct inode *inode,
 				     struct buffer_head *fe_bh,
@@ -346,6 +380,17 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 
 	mlog_entry_void();
 
+	/*
+	 * We need to CoW the cluster contains the offset if it is reflinked
+	 * since we will call ocfs2_zero_range_for_truncate later which will
+	 * write "0" from offset to the end of the cluster.
+	 */
+	status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size);
+	if (status) {
+		mlog_errno(status);
+		return status;
+	}
+
 	/* TODO: This needs to actually orphan the inode in this
 	 * transaction. */
 
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index a3aeaea..0231f17 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2435,6 +2435,7 @@ out:
  *
  * cpos is vitual start cluster position we want to do CoW in a
  * file and write_len is the cluster length.
+ * max_cpos is the place where we want to stop CoW intentionally.
  *
  * Normal we will start CoW from the beginning of extent record cotaining cpos.
  * And We will try to Cow as much clusters as we can until we reach
@@ -2445,6 +2446,7 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
 					   struct buffer_head *di_bh,
 					   u32 cpos,
 					   u32 write_len,
+					   u32 max_cpos,
 					   u32 *cow_start,
 					   u32 *cow_len)
 {
@@ -2512,6 +2514,10 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
 
 		leaf_clusters = le16_to_cpu(rec->e_leaf_clusters);
 		rec_end = le32_to_cpu(rec->e_cpos) + leaf_clusters;
+		if (rec_end > max_cpos) {
+			rec_end = max_cpos;
+			leaf_clusters = rec_end - le32_to_cpu(rec->e_cpos);
+		}
 
 		if (*cow_len + leaf_clusters >= max_clusters) {
 			if (*cow_len == 0) {
@@ -3090,7 +3096,7 @@ static int ocfs2_replace_cow(struct inode *inode,
 
 int ocfs2_refcount_cow(struct inode *inode,
 		       struct buffer_head *di_bh,
-		       u32 cpos, u32 write_len)
+		       u32 cpos, u32 write_len, u32 max_cpos)
 {
 	int ret;
 	u32 cow_start = 0, cow_len = 0;
@@ -3102,12 +3108,14 @@ int ocfs2_refcount_cow(struct inode *inode,
 
 	BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
 
-	ret = ocfs2_refcount_cal_cow_clusters(inode, di_bh, cpos, write_len,
+	ret = ocfs2_refcount_cal_cow_clusters(inode, di_bh,
+					      cpos, write_len, max_cpos,
 					      &cow_start, &cow_len);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
+
 	mlog(0, "CoW inode %lu, cpos %u, write_len %u, cow_start %u, "
 	     "cow_len %u\n", inode->i_ino,
 	     cpos, write_len, cow_start, cow_len);
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index bc40af1..5539211 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -53,5 +53,5 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
 					  int *credits,
 					  struct ocfs2_alloc_context **meta_ac);
 int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh,
-		       u32 cpos, u32 write_len);
+		       u32 cpos, u32 write_len, u32 max_cpos);
 #endif /* OCFS2_REFCOUNTTREE_H */
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 21/41] ocfs2: Add normal functions for reflink a normal file's extents.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (19 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 20/41] ocfs2: CoW a reflinked cluster when it is truncated Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 22/41] ocfs2: handle file attributes issue for reflink Tao Ma
                   ` (20 subsequent siblings)
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

2 major functions are added in this patch.

ocfs2_attach_refcount_tree will create a new refcount tree to the
old file if it doesn't have one and insert all the extent records
to the tree if they are not refcounted.

ocfs2_create_reflink_node will:
1. set the refcount tree to the new file.
2. call ocfs2_duplicate_extent_list which will iterate all the
   extents for the old file, insert it to the new file and increase
   the corresponding referennce count.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/refcounttree.c |  286 +++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 286 insertions(+), 0 deletions(-)

diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 0231f17..ec07518 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -3139,3 +3139,289 @@ int ocfs2_refcount_cow(struct inode *inode,
 out:
 	return ret;
 }
+
+/*
+ * Insert a new extent into refcount tree and mark a extent rec
+ * as refcounted in the dinode tree.
+ */
+int ocfs2_add_refcount_flag(struct inode *inode,
+			    struct ocfs2_extent_tree *data_et,
+			    struct ocfs2_caching_info *ref_ci,
+			    struct buffer_head *ref_root_bh,
+			    u32 cpos, u32 p_cluster, u32 num_clusters,
+			    struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret;
+	handle_t *handle;
+	int credits = 1, ref_blocks = 0;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_alloc_context *meta_ac = NULL;
+
+	ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
+					       ref_ci, ref_root_bh,
+					       p_cluster, num_clusters,
+					       &ref_blocks, &credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	mlog(0, "reserve new metadata %d, credits = %d\n",
+	     ref_blocks, credits);
+
+	if (ref_blocks) {
+		ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
+							ref_blocks, &meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_mark_extent_refcounted(inode, data_et, handle,
+					   cpos, num_clusters, p_cluster,
+					   meta_ac, dealloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
+					p_cluster, num_clusters,
+					meta_ac, dealloc);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+	return ret;
+}
+
+static int ocfs2_attach_refcount_tree(struct inode *inode,
+				      struct buffer_head *di_bh)
+{
+	int ret;
+	struct buffer_head *ref_root_bh = NULL;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_refcount_tree *ref_tree;
+	unsigned int ext_flags;
+	loff_t size;
+	u32 cpos, num_clusters, clusters, p_cluster;
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+	struct ocfs2_extent_tree di_et;
+
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
+	if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) {
+		ret = ocfs2_create_refcount_tree(inode, di_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	BUG_ON(!di->i_refcount_loc);
+	ret = ocfs2_lock_refcount_tree(osb,
+				       le64_to_cpu(di->i_refcount_loc), 1,
+				       &ref_tree, &ref_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), di_bh);
+
+	size = i_size_read(inode);
+	clusters = ocfs2_clusters_for_bytes(inode->i_sb, size);
+
+	cpos = 0;
+	while (cpos < clusters) {
+		ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
+					 &num_clusters, &ext_flags);
+
+		if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) {
+			ret = ocfs2_add_refcount_flag(inode, &di_et,
+						      &ref_tree->rf_ci,
+						      ref_root_bh, cpos,
+						      p_cluster, num_clusters,
+						      &dealloc);
+			if (ret) {
+				mlog_errno(ret);
+				break;
+			}
+		}
+		cpos += num_clusters;
+	}
+
+	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+	brelse(ref_root_bh);
+
+	if (!ret && ocfs2_dealloc_has_cluster(&dealloc)) {
+		ocfs2_schedule_truncate_log_flush(osb, 1);
+		ocfs2_run_deallocs(osb, &dealloc);
+	}
+out:
+	/*
+	 * Empty the extent map so that we may get the right extent
+	 * record from the disk.
+	 */
+	ocfs2_extent_map_trunc(inode, 0);
+
+	return ret;
+}
+
+static int ocfs2_add_refcounted_extent(struct inode *inode,
+				   struct ocfs2_extent_tree *et,
+				   struct ocfs2_caching_info *ref_ci,
+				   struct buffer_head *ref_root_bh,
+				   u32 cpos, u32 p_cluster, u32 num_clusters,
+				   unsigned int ext_flags,
+				   struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret;
+	handle_t *handle;
+	int credits = 0;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_alloc_context *meta_ac = NULL;
+
+	ret = ocfs2_lock_refcount_allocators(inode->i_sb,
+					     p_cluster, num_clusters,
+					     et, ref_ci,
+					     ref_root_bh, &meta_ac,
+					     NULL, &credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_insert_extent(handle, et, cpos,
+			cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb,
+							     p_cluster)),
+			num_clusters, ext_flags, meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
+					p_cluster, num_clusters,
+					meta_ac, dealloc);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+	return ret;
+}
+
+static int ocfs2_duplicate_extent_list(struct inode *s_inode,
+				struct inode *t_inode,
+				struct buffer_head *t_bh,
+				struct ocfs2_caching_info *ref_ci,
+				struct buffer_head *ref_root_bh,
+				struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret = 0;
+	u32 p_cluster, num_clusters, clusters, cpos;
+	loff_t size;
+	unsigned int ext_flags;
+	struct ocfs2_extent_tree et;
+
+	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(t_inode), t_bh);
+
+	size = i_size_read(s_inode);
+	clusters = ocfs2_clusters_for_bytes(s_inode->i_sb, size);
+
+	cpos = 0;
+	while (cpos < clusters) {
+		ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster,
+					 &num_clusters, &ext_flags);
+
+		if (p_cluster) {
+			ret = ocfs2_add_refcounted_extent(t_inode, &et,
+							  ref_ci, ref_root_bh,
+							  cpos, p_cluster,
+							  num_clusters,
+							  ext_flags,
+							  dealloc);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+		}
+
+		cpos += num_clusters;
+	}
+
+out:
+	return ret;
+}
+
+static int ocfs2_create_reflink_node(struct inode *s_inode,
+				     struct buffer_head *s_bh,
+				     struct inode *t_inode,
+				     struct buffer_head *t_bh)
+{
+	int ret;
+	struct buffer_head *ref_root_bh = NULL;
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+	struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
+	struct ocfs2_refcount_block *rb;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data;
+	struct ocfs2_refcount_tree *ref_tree;
+
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
+	ret = ocfs2_set_refcount_tree(t_inode, t_bh,
+				      le64_to_cpu(di->i_refcount_loc));
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
+				       1, &ref_tree, &ref_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+
+	ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh,
+					  &ref_tree->rf_ci, ref_root_bh,
+					  &dealloc);
+	if (ret)
+		mlog_errno(ret);
+
+	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+	brelse(ref_root_bh);
+out:
+	if (ocfs2_dealloc_has_cluster(&dealloc)) {
+		ocfs2_schedule_truncate_log_flush(osb, 1);
+		ocfs2_run_deallocs(osb, &dealloc);
+	}
+
+	return ret;
+}
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 22/41] ocfs2: handle file attributes issue for reflink.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (20 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 21/41] ocfs2: Add normal functions for reflink a normal file's extents Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 23/41] ocfs2: Return extent flags for xattr value tree Tao Ma
                   ` (19 subsequent siblings)
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

A reflink creates a snapshot of a file, that means the attributes
must be identical except for three exceptions - nlink, ino, and ctime.

As for time changes, Here is a brief description:

1. Source file:
   1) atime: Ignore. Let the lazy atime code handle that.
   2) mtime: don't touch.
   3) ctime: If we change the tree (adding REFCOUNTED to at least one
             extent), update it.
2. Destination file:
   1) atime: ignore.
   2) mtime: we want it to appear identical to the source.
   3) ctime: update.

The idea here is that an ls -l will show the same time for the
src and target - it shows mtime.  Backup software like rsync and tar
will treat the new file correctly too.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/refcounttree.c |  122 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 120 insertions(+), 2 deletions(-)

diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index ec07518..efcccb4 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -3207,10 +3207,44 @@ out:
 	return ret;
 }
 
+static int ocfs2_change_ctime(struct inode *inode,
+			      struct buffer_head *di_bh)
+{
+	int ret;
+	handle_t *handle;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
+				   OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	inode->i_ctime = CURRENT_TIME;
+	di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+	di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+
+	ocfs2_journal_dirty(handle, di_bh);
+
+out_commit:
+	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out:
+	return ret;
+}
+
 static int ocfs2_attach_refcount_tree(struct inode *inode,
 				      struct buffer_head *di_bh)
 {
-	int ret;
+	int ret, data_changed = 0;
 	struct buffer_head *ref_root_bh = NULL;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
@@ -3259,12 +3293,21 @@ static int ocfs2_attach_refcount_tree(struct inode *inode,
 						      &dealloc);
 			if (ret) {
 				mlog_errno(ret);
-				break;
+				goto unlock;
 			}
+
+			data_changed = 1;
 		}
 		cpos += num_clusters;
 	}
 
+	if (data_changed) {
+		ret = ocfs2_change_ctime(inode, di_bh);
+		if (ret)
+			mlog_errno(ret);
+	}
+
+unlock:
 	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
 	brelse(ref_root_bh);
 
@@ -3379,6 +3422,74 @@ out:
 	return ret;
 }
 
+/*
+ * change the new file's attributes to the src.
+ *
+ * reflink creates a snapshot of a file, that means the attributes
+ * must be identical except for three exceptions - nlink, ino, and ctime.
+ */
+static int ocfs2_complete_reflink(struct inode *s_inode,
+				  struct buffer_head *s_bh,
+				  struct inode *t_inode,
+				  struct buffer_head *t_bh)
+{
+	int ret;
+	handle_t *handle;
+	struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)t_bh->b_data;
+	loff_t size = i_size_read(s_inode);
+
+	handle = ocfs2_start_trans(OCFS2_SB(t_inode->i_sb),
+				   OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		return ret;
+	}
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	spin_lock(&OCFS2_I(t_inode)->ip_lock);
+	OCFS2_I(t_inode)->ip_clusters = OCFS2_I(s_inode)->ip_clusters;
+	OCFS2_I(t_inode)->ip_attr = OCFS2_I(s_inode)->ip_attr;
+	OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features;
+	spin_unlock(&OCFS2_I(t_inode)->ip_lock);
+	i_size_write(t_inode, size);
+
+	di->i_xattr_inline_size = s_di->i_xattr_inline_size;
+	di->i_clusters = s_di->i_clusters;
+	di->i_size = s_di->i_size;
+	di->i_dyn_features = s_di->i_dyn_features;
+	di->i_attr = s_di->i_attr;
+	di->i_uid = s_di->i_uid;
+	di->i_gid = s_di->i_gid;
+	di->i_mode = s_di->i_mode;
+
+	/*
+	 * update time.
+	 * we want mtime to appear identical to the source and update ctime.
+	 */
+	t_inode->i_ctime = CURRENT_TIME;
+
+	di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec);
+	di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec);
+
+	t_inode->i_mtime = s_inode->i_mtime;
+	di->i_mtime = s_di->i_mtime;
+	di->i_mtime_nsec = s_di->i_mtime_nsec;
+
+	ocfs2_journal_dirty(handle, t_bh);
+
+out_commit:
+	ocfs2_commit_trans(OCFS2_SB(t_inode->i_sb), handle);
+	return ret;
+}
+
 static int ocfs2_create_reflink_node(struct inode *s_inode,
 				     struct buffer_head *s_bh,
 				     struct inode *t_inode,
@@ -3412,9 +3523,16 @@ static int ocfs2_create_reflink_node(struct inode *s_inode,
 	ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh,
 					  &ref_tree->rf_ci, ref_root_bh,
 					  &dealloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_unlock_refcount;
+	}
+
+	ret = ocfs2_complete_reflink(s_inode, s_bh, t_inode, t_bh);
 	if (ret)
 		mlog_errno(ret);
 
+out_unlock_refcount:
 	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
 	brelse(ref_root_bh);
 out:
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 23/41] ocfs2: Return extent flags for xattr value tree.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (21 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 22/41] ocfs2: handle file attributes issue for reflink Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 24/41] ocfs2: Abstract duplicate clusters process in CoW Tao Ma
                   ` (18 subsequent siblings)
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

With the new refcount tree, xattr value can also be refcounted
among multiple files. So return the appropriate extent flags
so that CoW can used it later.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/extent_map.c |    6 +++++-
 fs/ocfs2/extent_map.h |    3 ++-
 fs/ocfs2/xattr.c      |    7 ++++---
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 40b5105..843db64 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -541,7 +541,8 @@ static void ocfs2_relative_extent_offsets(struct super_block *sb,
 
 int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
 			     u32 *p_cluster, u32 *num_clusters,
-			     struct ocfs2_extent_list *el)
+			     struct ocfs2_extent_list *el,
+			     unsigned int *extent_flags)
 {
 	int ret = 0, i;
 	struct buffer_head *eb_bh = NULL;
@@ -593,6 +594,9 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
 		*p_cluster = *p_cluster + coff;
 		if (num_clusters)
 			*num_clusters = ocfs2_rec_clusters(el, rec) - coff;
+
+		if (extent_flags)
+			*extent_flags = rec->e_flags;
 	}
 out:
 	if (eb_bh)
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index 9942f47..e79d41c 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -55,7 +55,8 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 
 int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
 			     u32 *p_cluster, u32 *num_clusters,
-			     struct ocfs2_extent_list *el);
+			     struct ocfs2_extent_list *el,
+			     unsigned int *extent_flags);
 
 int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
 			   struct buffer_head *bhs[], int flags,
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 1bf12c4..dda49c0 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -704,7 +704,7 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
 	while (trunc_len) {
 		ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos,
 					       &alloc_size,
-					       &vb->vb_xv->xr_list);
+					       &vb->vb_xv->xr_list, NULL);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -959,7 +959,7 @@ static int ocfs2_xattr_get_value_outside(struct inode *inode,
 	cpos = 0;
 	while (cpos < clusters) {
 		ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
-					       &num_clusters, el);
+					       &num_clusters, el, NULL);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -1198,7 +1198,8 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
 
 	while (cpos < clusters) {
 		ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
-					       &num_clusters, &xv->xr_list);
+					       &num_clusters, &xv->xr_list,
+					       NULL);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 24/41] ocfs2: Abstract duplicate clusters process in CoW.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (22 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 23/41] ocfs2: Return extent flags for xattr value tree Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 25/41] ocfs2: Add CoW support for xattr Tao Ma
                   ` (17 subsequent siblings)
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

We currently use pagecache to duplicate clusters in CoW,
but it isn't suitable for xattr case. So abstract it out
so that the caller can decide which method it use.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/refcounttree.c |  127 ++++++++++++++++++++++++++---------------------
 1 files changed, 71 insertions(+), 56 deletions(-)

diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index efcccb4..6029f52 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -45,12 +45,20 @@ struct ocfs2_cow_context {
 	struct inode *inode;
 	u32 cow_start;
 	u32 cow_len;
-	struct ocfs2_extent_tree di_et;
-	struct ocfs2_caching_info *ref_ci;
+	struct ocfs2_extent_tree data_et;
+	struct ocfs2_refcount_tree *ref_tree;
 	struct buffer_head *ref_root_bh;
 	struct ocfs2_alloc_context *meta_ac;
 	struct ocfs2_alloc_context *data_ac;
 	struct ocfs2_cached_dealloc_ctxt dealloc;
+	int (*get_clusters)(struct ocfs2_cow_context *context,
+			    u32 v_cluster, u32 *p_cluster,
+			    u32 *num_clusters,
+			    unsigned int *extent_flags);
+	int (*cow_duplicate_clusters)(handle_t *handle,
+				      struct ocfs2_cow_context *context,
+				      u32 cpos, u32 old_cluster,
+				      u32 new_cluster, u32 new_len);
 };
 
 static inline struct ocfs2_refcount_tree *
@@ -2443,7 +2451,7 @@ out:
  * use that value as the maximum clusters.
  */
 static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
-					   struct buffer_head *di_bh,
+					   struct ocfs2_extent_list *el,
 					   u32 cpos,
 					   u32 write_len,
 					   u32 max_cpos,
@@ -2451,8 +2459,6 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
 					   u32 *cow_len)
 {
 	int ret = 0;
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
-	struct ocfs2_extent_list *el = &di->id2.i_list;
 	int tree_height = le16_to_cpu(el->l_tree_depth), i;
 	struct buffer_head *eb_bh = NULL;
 	struct ocfs2_extent_block *eb = NULL;
@@ -2675,13 +2681,13 @@ static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
 	return 0;
 }
 
-static int ocfs2_duplicate_clusters(handle_t *handle,
-				    struct ocfs2_cow_context *context,
-				    u32 cpos, u32 old_cluster,
-				    u32 new_cluster, u32 new_len)
+static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
+					    struct ocfs2_cow_context *context,
+					    u32 cpos, u32 old_cluster,
+					    u32 new_cluster, u32 new_len)
 {
 	int ret = 0, partial;
-	struct ocfs2_caching_info *ci = context->di_et.et_ci;
+	struct ocfs2_caching_info *ci = context->data_et.et_ci;
 	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
 	u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
 	struct page *page;
@@ -2815,7 +2821,7 @@ static int ocfs2_replace_clusters(handle_t *handle,
 				  unsigned int ext_flags)
 {
 	int ret;
-	struct ocfs2_caching_info *ci = context->di_et.et_ci;
+	struct ocfs2_caching_info *ci = context->data_et.et_ci;
 	u64 ino = ocfs2_metadata_cache_owner(ci);
 
 	mlog(0, "inode %llu, cpos %u, old %u, new %u, len %u, ext_flags %u\n",
@@ -2823,15 +2829,15 @@ static int ocfs2_replace_clusters(handle_t *handle,
 
 	/*If the old clusters is unwritten, no need to duplicate. */
 	if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
-		ret = ocfs2_duplicate_clusters(handle, context, cpos,
-					       old, new, len);
+		ret = context->cow_duplicate_clusters(handle, context, cpos,
+						      old, new, len);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 	}
 
-	ret = ocfs2_clear_ext_refcount(handle, &context->di_et,
+	ret = ocfs2_clear_ext_refcount(handle, &context->data_et,
 				       cpos, new, len, ext_flags,
 				       context->meta_ac, &context->dealloc);
 	if (ret)
@@ -2889,6 +2895,15 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb,
 	return ret;
 }
 
+static int ocfs2_di_get_clusters(struct ocfs2_cow_context *context,
+				 u32 v_cluster, u32 *p_cluster,
+				 u32 *num_clusters,
+				 unsigned int *extent_flags)
+{
+	return ocfs2_get_clusters(context->inode, v_cluster, p_cluster,
+				  num_clusters, extent_flags);
+}
+
 static int ocfs2_make_clusters_writable(struct super_block *sb,
 					struct ocfs2_cow_context *context,
 					u32 cpos, u32 p_cluster,
@@ -2900,14 +2915,15 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
 	struct ocfs2_super *osb = OCFS2_SB(sb);
 	handle_t *handle;
 	struct buffer_head *ref_leaf_bh = NULL;
+	struct ocfs2_caching_info *ref_ci = &context->ref_tree->rf_ci;
 	struct ocfs2_refcount_rec rec;
 
 	mlog(0, "cpos %u, p_cluster %u, num_clusters %u, e_flags %u\n",
 	     cpos, p_cluster, num_clusters, e_flags);
 
 	ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters,
-					     &context->di_et,
-					     context->ref_ci,
+					     &context->data_et,
+					     ref_ci,
 					     context->ref_root_bh,
 					     &context->meta_ac,
 					     &context->data_ac, &credits);
@@ -2924,8 +2940,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
 	}
 
 	while (num_clusters) {
-		ret = ocfs2_get_refcount_rec(context->ref_ci,
-					     context->ref_root_bh,
+		ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh,
 					     p_cluster, num_clusters,
 					     &rec, &index, &ref_leaf_bh);
 		if (ret) {
@@ -2947,7 +2962,8 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
 		 */
 		if (le32_to_cpu(rec.r_refcount) == 1) {
 			delete = 0;
-			ret = ocfs2_clear_ext_refcount(handle, &context->di_et,
+			ret = ocfs2_clear_ext_refcount(handle,
+						       &context->data_et,
 						       cpos, p_cluster,
 						       set_len, e_flags,
 						       context->meta_ac,
@@ -2978,7 +2994,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
 			set_len = new_len;
 		}
 
-		ret = __ocfs2_decrease_refcount(handle, context->ref_ci,
+		ret = __ocfs2_decrease_refcount(handle, ref_ci,
 						context->ref_root_bh,
 						p_cluster, set_len,
 						context->meta_ac,
@@ -3020,17 +3036,14 @@ out:
 	return ret;
 }
 
-static int ocfs2_replace_cow(struct inode *inode,
-			     struct buffer_head *di_bh,
-			     struct buffer_head *ref_root_bh,
-			     struct ocfs2_caching_info *ref_ci,
-			     u32 cow_start, u32 cow_len)
+static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
 {
 	int ret;
-	u32 p_cluster, num_clusters, start = cow_start;
+	struct inode *inode = context->inode;
+	u32 cow_start = context->cow_start, cow_len = context->cow_len;
+	u32 p_cluster, num_clusters;
 	unsigned int ext_flags;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct ocfs2_cow_context *context;
 
 	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
 		ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
@@ -3039,26 +3052,11 @@ static int ocfs2_replace_cow(struct inode *inode,
 		return -EROFS;
 	}
 
-	context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
-	if (!context) {
-		ret = -ENOMEM;
-		mlog_errno(ret);
-		return ret;
-	}
-
-	context->inode = inode;
-	context->cow_start = cow_start;
-	context->cow_len = cow_len;
-	context->ref_ci = ref_ci;
-	context->ref_root_bh = ref_root_bh;
-
 	ocfs2_init_dealloc_ctxt(&context->dealloc);
-	ocfs2_init_dinode_extent_tree(&context->di_et,
-				      INODE_CACHE(inode), di_bh);
 
 	while (cow_len) {
-		ret = ocfs2_get_clusters(inode, cow_start, &p_cluster,
-					 &num_clusters, &ext_flags);
+		ret = context->get_clusters(context, cow_start, &p_cluster,
+					    &num_clusters, &ext_flags);
 
 		BUG_ON(!(ext_flags & OCFS2_EXT_REFCOUNTED));
 
@@ -3077,20 +3075,11 @@ static int ocfs2_replace_cow(struct inode *inode,
 		cow_start += num_clusters;
 	}
 
-
-	/*
-	 * truncate the extent map here since no matter whether we meet with
-	 * any error during the action, we shouldn't trust cached extent map
-	 * any more.
-	 */
-	ocfs2_extent_map_trunc(inode, start);
-
 	if (ocfs2_dealloc_has_cluster(&context->dealloc)) {
 		ocfs2_schedule_truncate_log_flush(osb, 1);
 		ocfs2_run_deallocs(osb, &context->dealloc);
 	}
 
-	kfree(context);
 	return ret;
 }
 
@@ -3105,10 +3094,11 @@ int ocfs2_refcount_cow(struct inode *inode,
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 	struct buffer_head *ref_root_bh = NULL;
 	struct ocfs2_refcount_tree *ref_tree;
+	struct ocfs2_cow_context *context = NULL;
 
 	BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
 
-	ret = ocfs2_refcount_cal_cow_clusters(inode, di_bh,
+	ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list,
 					      cpos, write_len, max_cpos,
 					      &cow_start, &cow_len);
 	if (ret) {
@@ -3122,6 +3112,13 @@ int ocfs2_refcount_cow(struct inode *inode,
 
 	BUG_ON(cow_len == 0);
 
+	context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
+	if (!context) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
 	ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
 				       1, &ref_tree, &ref_root_bh);
 	if (ret) {
@@ -3129,14 +3126,32 @@ int ocfs2_refcount_cow(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_replace_cow(inode, di_bh, ref_root_bh, &ref_tree->rf_ci,
-				cow_start, cow_len);
+	context->inode = inode;
+	context->cow_start = cow_start;
+	context->cow_len = cow_len;
+	context->ref_tree = ref_tree;
+	context->ref_root_bh = ref_root_bh;
+	context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
+	context->get_clusters = ocfs2_di_get_clusters;
+
+	ocfs2_init_dinode_extent_tree(&context->data_et,
+				      INODE_CACHE(inode), di_bh);
+
+	ret = ocfs2_replace_cow(context);
 	if (ret)
 		mlog_errno(ret);
 
+	/*
+	 * truncate the extent map here since no matter whether we meet with
+	 * any error during the action, we shouldn't trust cached extent map
+	 * any more.
+	 */
+	ocfs2_extent_map_trunc(inode, cow_start);
+
 	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
 	brelse(ref_root_bh);
 out:
+	kfree(context);
 	return ret;
 }
 
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 25/41] ocfs2: Add CoW support for xattr.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (23 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 24/41] ocfs2: Abstract duplicate clusters process in CoW Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 26/41] ocfs2: Remove inode from ocfs2_xattr_bucket_get_name_value Tao Ma
                   ` (16 subsequent siblings)
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

In order to make 2 transcation(xattr and cow) independent with each other,
we CoW the whole xattr out in case we are setting them.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/refcounttree.c |  246 ++++++++++++++++++++++++++++++++++++++++++++++-
 fs/ocfs2/refcounttree.h |   29 ++++++
 fs/ocfs2/xattr.c        |  234 ++++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 494 insertions(+), 15 deletions(-)

diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 6029f52..d6db689 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -32,6 +32,7 @@
 #include "dlmglue.h"
 #include "extent_map.h"
 #include "aops.h"
+#include "xattr.h"
 
 #include <linux/bio.h>
 #include <linux/blkdev.h>
@@ -51,6 +52,9 @@ struct ocfs2_cow_context {
 	struct ocfs2_alloc_context *meta_ac;
 	struct ocfs2_alloc_context *data_ac;
 	struct ocfs2_cached_dealloc_ctxt dealloc;
+	void *cow_object;
+	struct ocfs2_post_refcount *post_refcount;
+	int extra_credits;
 	int (*get_clusters)(struct ocfs2_cow_context *context,
 			    u32 v_cluster, u32 *p_cluster,
 			    u32 *num_clusters,
@@ -2754,6 +2758,65 @@ unlock:
 	return ret;
 }
 
+static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
+					   struct ocfs2_cow_context *context,
+					   u32 cpos, u32 old_cluster,
+					   u32 new_cluster, u32 new_len)
+{
+	int ret = 0;
+	struct super_block *sb = context->inode->i_sb;
+	struct ocfs2_caching_info *ci = context->data_et.et_ci;
+	int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
+	u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster);
+	u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	struct buffer_head *old_bh = NULL;
+	struct buffer_head *new_bh = NULL;
+
+	mlog(0, "old_cluster %u, new %u, len %u\n", old_cluster,
+	     new_cluster, new_len);
+
+	for (i = 0; i < blocks; i++, old_block++, new_block++) {
+		new_bh = sb_getblk(osb->sb, new_block);
+		if (new_bh == NULL) {
+			ret = -EIO;
+			mlog_errno(ret);
+			break;
+		}
+
+		ocfs2_set_new_buffer_uptodate(ci, new_bh);
+
+		ret = ocfs2_read_block(ci, old_block, &old_bh, NULL);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		ret = ocfs2_journal_access(handle, ci, new_bh,
+					   OCFS2_JOURNAL_ACCESS_CREATE);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize);
+		ret = ocfs2_journal_dirty(handle, new_bh);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		brelse(new_bh);
+		brelse(old_bh);
+		new_bh = NULL;
+		old_bh = NULL;
+	}
+
+	brelse(new_bh);
+	brelse(old_bh);
+	return ret;
+}
+
 static int ocfs2_clear_ext_refcount(handle_t *handle,
 				    struct ocfs2_extent_tree *et,
 				    u32 cpos, u32 p_cluster, u32 len,
@@ -2932,6 +2995,10 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
 		return ret;
 	}
 
+	if (context->post_refcount)
+		credits += context->post_refcount->credits;
+
+	credits += context->extra_credits;
 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
@@ -3011,13 +3078,25 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
 		ref_leaf_bh = NULL;
 	}
 
+	/* handle any post_cow action. */
+	if (context->post_refcount && context->post_refcount->func) {
+		ret = context->post_refcount->func(context->inode, handle,
+						context->post_refcount->para);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+	}
+
 	/*
 	 * Here we should write the new page out first if we are
 	 * in write-back mode.
 	 */
-	ret = ocfs2_cow_sync_writeback(sb, context, cpos, num_clusters);
-	if (ret)
-		mlog_errno(ret);
+	if (context->get_clusters == ocfs2_di_get_clusters) {
+		ret = ocfs2_cow_sync_writeback(sb, context, cpos, num_clusters);
+		if (ret)
+			mlog_errno(ret);
+	}
 
 out_commit:
 	ocfs2_commit_trans(osb, handle);
@@ -3155,6 +3234,167 @@ out:
 	return ret;
 }
 
+static int ocfs2_xattr_value_get_clusters(struct ocfs2_cow_context *context,
+					  u32 v_cluster, u32 *p_cluster,
+					  u32 *num_clusters,
+					  unsigned int *extent_flags)
+{
+	struct inode *inode = context->inode;
+	struct ocfs2_xattr_value_root *xv = context->cow_object;
+
+	return ocfs2_xattr_get_clusters(inode, v_cluster, p_cluster,
+					num_clusters, &xv->xr_list,
+					extent_flags);
+}
+
+/*
+ * Given a xattr value root, calculate the most meta/credits we need for
+ * refcount tree change if we truncate it to 0.
+ */
+int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
+				       struct ocfs2_caching_info *ref_ci,
+				       struct buffer_head *ref_root_bh,
+				       struct ocfs2_xattr_value_root *xv,
+				       int *meta_add, int *credits)
+{
+	int ret = 0, index, ref_blocks = 0;
+	u32 p_cluster, num_clusters;
+	u32 cpos = 0, clusters = le32_to_cpu(xv->xr_clusters);
+	struct ocfs2_refcount_block *rb;
+	struct ocfs2_refcount_rec rec;
+	struct buffer_head *ref_leaf_bh = NULL;
+
+	while (cpos < clusters) {
+		ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
+					       &num_clusters, &xv->xr_list,
+					       NULL);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		cpos += num_clusters;
+
+		while (num_clusters) {
+			ret = ocfs2_get_refcount_rec(ref_ci, ref_root_bh,
+						     p_cluster, num_clusters,
+						     &rec, &index,
+						     &ref_leaf_bh);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			BUG_ON(!rec.r_refcount);
+
+			rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+
+			/*
+			 * We really don't know whether the other clusters is in
+			 * this refcount block or not, so just take the worst
+			 * case that all the clusters are in this block and each
+			 * one will split a refcount rec, so totally we need
+			 * clusters * 2 new refcount rec.
+			 */
+			if (le64_to_cpu(rb->rf_records.rl_used) + clusters * 2 >
+			    le16_to_cpu(rb->rf_records.rl_count))
+				ref_blocks++;
+
+			*credits += 1;
+			brelse(ref_leaf_bh);
+			ref_leaf_bh = NULL;
+
+			if (num_clusters <= le32_to_cpu(rec.r_clusters))
+				break;
+			else
+				num_clusters -= le32_to_cpu(rec.r_clusters);
+			p_cluster += num_clusters;
+		}
+	}
+
+	*meta_add += ref_blocks;
+	if (!ref_blocks)
+		goto out;
+
+	rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+	if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)
+		*credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
+	else {
+		struct ocfs2_extent_tree et;
+
+		ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh);
+		*credits += ocfs2_calc_extend_credits(inode->i_sb,
+						      et.et_root_el,
+						      ref_blocks);
+	}
+
+out:
+	brelse(ref_leaf_bh);
+	return ret;
+}
+
+/*
+ * Do CoW for xattr.
+ */
+int ocfs2_refcount_cow_xattr(struct inode *inode,
+			     struct ocfs2_dinode *di,
+			     struct ocfs2_xattr_value_buf *vb,
+			     struct ocfs2_refcount_tree *ref_tree,
+			     struct buffer_head *ref_root_bh,
+			     u32 cpos, u32 write_len,
+			     struct ocfs2_post_refcount *post)
+{
+	int ret;
+	struct ocfs2_xattr_value_root *xv = vb->vb_xv;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_cow_context *context = NULL;
+	u32 cow_start, cow_len;
+
+	BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+
+	ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list,
+					      cpos, write_len, UINT_MAX,
+					      &cow_start, &cow_len);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	BUG_ON(cow_len == 0);
+
+	context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
+	if (!context) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	context->inode = inode;
+	context->cow_start = cow_start;
+	context->cow_len = cow_len;
+	context->ref_tree = ref_tree;
+	context->ref_root_bh = ref_root_bh;;
+	context->cow_object = xv;
+
+	context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd;
+	/* We need the extra credits for duplicate_clusters by jbd. */
+	context->extra_credits =
+		ocfs2_clusters_to_blocks(inode->i_sb, 1) * cow_len;
+	context->get_clusters = ocfs2_xattr_value_get_clusters;
+	context->post_refcount = post;
+
+	ocfs2_init_xattr_value_extent_tree(&context->data_et,
+					   INODE_CACHE(inode), vb);
+
+	ret = ocfs2_replace_cow(context);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	kfree(context);
+	return ret;
+}
+
 /*
  * Insert a new extent into refcount tree and mark a extent rec
  * as refcounted in the dinode tree.
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 5539211..cd62eeb 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -54,4 +54,33 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
 					  struct ocfs2_alloc_context **meta_ac);
 int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh,
 		       u32 cpos, u32 write_len, u32 max_cpos);
+
+typedef int (ocfs2_post_refcount_func)(struct inode *inode,
+				       handle_t *handle,
+				       void *para);
+/*
+ * Some refcount caller need to do more work after we modify the data b-tree
+ * during refcount operation(including CoW and add refcount flag), and make the
+ * transaction complete. So it must give us this structure so that we can do it
+ * within our transaction.
+ *
+ */
+struct ocfs2_post_refcount {
+	int credits;			/* credits it need for journal. */
+	ocfs2_post_refcount_func *func;	/* real function. */
+	void *para;
+};
+
+int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
+				       struct ocfs2_caching_info *ref_ci,
+				       struct buffer_head *ref_root_bh,
+				       struct ocfs2_xattr_value_root *xv,
+				       int *meta_add, int *credits);
+int ocfs2_refcount_cow_xattr(struct inode *inode,
+			     struct ocfs2_dinode *di,
+			     struct ocfs2_xattr_value_buf *vb,
+			     struct ocfs2_refcount_tree *ref_tree,
+			     struct buffer_head *ref_root_bh,
+			     u32 cpos, u32 write_len,
+			     struct ocfs2_post_refcount *post);
 #endif /* OCFS2_REFCOUNTTREE_H */
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index dda49c0..a538ceb 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -55,7 +55,7 @@
 #include "buffer_head_io.h"
 #include "super.h"
 #include "xattr.h"
-
+#include "refcounttree.h"
 
 struct ocfs2_xattr_def_value_root {
 	struct ocfs2_xattr_value_root	xv;
@@ -176,6 +176,14 @@ static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
 				  u64 src_blk, u64 last_blk, u64 to_blk,
 				  unsigned int start_bucket,
 				  u32 *first_hash);
+static int ocfs2_prepare_refcount_xattr(struct inode *inode,
+					struct ocfs2_dinode *di,
+					struct ocfs2_xattr_info *xi,
+					struct ocfs2_xattr_search *xis,
+					struct ocfs2_xattr_search *xbs,
+					struct ocfs2_refcount_tree **ref_tree,
+					int *meta_need,
+					int *credits);
 
 static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
 {
@@ -647,6 +655,7 @@ leave:
 static int __ocfs2_remove_xattr_range(struct inode *inode,
 				      struct ocfs2_xattr_value_buf *vb,
 				      u32 cpos, u32 phys_cpos, u32 len,
+				      unsigned int ext_flags,
 				      struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	int ret;
@@ -678,7 +687,14 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc, phys_blkno, len);
+	if (ext_flags & OCFS2_EXT_REFCOUNTED)
+		ret = ocfs2_decrease_refcount(inode, handle,
+					ocfs2_blocks_to_clusters(inode->i_sb,
+								 phys_blkno),
+					len, ctxt->meta_ac, &ctxt->dealloc, 1);
+	else
+		ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc,
+						  phys_blkno, len);
 	if (ret)
 		mlog_errno(ret);
 
@@ -693,6 +709,7 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
 				   struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	int ret = 0;
+	unsigned int ext_flags;
 	u32 trunc_len, cpos, phys_cpos, alloc_size;
 	u64 block;
 
@@ -704,7 +721,7 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
 	while (trunc_len) {
 		ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos,
 					       &alloc_size,
-					       &vb->vb_xv->xr_list, NULL);
+					       &vb->vb_xv->xr_list, &ext_flags);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -715,7 +732,7 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
 
 		ret = __ocfs2_remove_xattr_range(inode, vb, cpos,
 						 phys_cpos, alloc_size,
-						 ctxt);
+						 ext_flags, ctxt);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -1182,7 +1199,7 @@ static int ocfs2_xattr_get(struct inode *inode,
 
 static int __ocfs2_xattr_set_value_outside(struct inode *inode,
 					   handle_t *handle,
-					   struct ocfs2_xattr_value_root *xv,
+					   struct ocfs2_xattr_value_buf *vb,
 					   const void *value,
 					   int value_len)
 {
@@ -1193,18 +1210,22 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
 	u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len);
 	u64 blkno;
 	struct buffer_head *bh = NULL;
+	unsigned int ext_flags;
+	struct ocfs2_xattr_value_root *xv = vb->vb_xv;
 
 	BUG_ON(clusters > le32_to_cpu(xv->xr_clusters));
 
 	while (cpos < clusters) {
 		ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
 					       &num_clusters, &xv->xr_list,
-					       NULL);
+					       &ext_flags);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 
+		BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
+
 		blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
 
 		for (i = 0; i < num_clusters * bpc; i++, blkno++) {
@@ -1356,7 +1377,7 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode,
 		mlog_errno(ret);
 		return ret;
 	}
-	ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb->vb_xv,
+	ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb,
 					      xi->value, xi->value_len);
 	if (ret < 0)
 		mlog_errno(ret);
@@ -1595,7 +1616,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 
 				ret = __ocfs2_xattr_set_value_outside(inode,
 								handle,
-								vb.vb_xv,
+								&vb,
 								xi->value,
 								xi->value_len);
 				if (ret < 0)
@@ -2431,6 +2452,7 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
 				     struct ocfs2_xattr_search *xis,
 				     struct ocfs2_xattr_search *xbs,
 				     struct ocfs2_xattr_set_ctxt *ctxt,
+				     int extra_meta,
 				     int *credits)
 {
 	int clusters_add, meta_add, ret;
@@ -2447,6 +2469,7 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
 		return ret;
 	}
 
+	meta_add += extra_meta;
 	mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, "
 	     "credits = %d\n", xi->name, meta_add, clusters_add, *credits);
 
@@ -2714,10 +2737,11 @@ int ocfs2_xattr_set(struct inode *inode,
 {
 	struct buffer_head *di_bh = NULL;
 	struct ocfs2_dinode *di;
-	int ret, credits;
+	int ret, credits, ref_meta = 0, ref_credits = 0;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct inode *tl_inode = osb->osb_tl_inode;
 	struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
+	struct ocfs2_refcount_tree *ref_tree = NULL;
 
 	struct ocfs2_xattr_info xi = {
 		.name_index = name_index,
@@ -2782,6 +2806,17 @@ int ocfs2_xattr_set(struct inode *inode,
 			goto cleanup;
 	}
 
+	/* Check whether the value is refcounted and do some prepartion. */
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL &&
+	    (!xis.not_found || !xbs.not_found)) {
+		ret = ocfs2_prepare_refcount_xattr(inode, di, &xi,
+						   &xis, &xbs, &ref_tree,
+						   &ref_meta, &ref_credits);
+		if (ret) {
+			mlog_errno(ret);
+			goto cleanup;
+		}
+	}
 
 	mutex_lock(&tl_inode->i_mutex);
 
@@ -2796,7 +2831,7 @@ int ocfs2_xattr_set(struct inode *inode,
 	mutex_unlock(&tl_inode->i_mutex);
 
 	ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis,
-					&xbs, &ctxt, &credits);
+					&xbs, &ctxt, ref_meta, &credits);
 	if (ret) {
 		mlog_errno(ret);
 		goto cleanup;
@@ -2804,7 +2839,7 @@ int ocfs2_xattr_set(struct inode *inode,
 
 	/* we need to update inode's ctime field, so add credit for it. */
 	credits += OCFS2_INODE_UPDATE_CREDITS;
-	ctxt.handle = ocfs2_start_trans(osb, credits);
+	ctxt.handle = ocfs2_start_trans(osb, credits + ref_credits);
 	if (IS_ERR(ctxt.handle)) {
 		ret = PTR_ERR(ctxt.handle);
 		mlog_errno(ret);
@@ -2823,6 +2858,8 @@ int ocfs2_xattr_set(struct inode *inode,
 		ocfs2_schedule_truncate_log_flush(osb, 1);
 	ocfs2_run_deallocs(osb, &ctxt.dealloc);
 cleanup:
+	if (ref_tree)
+		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
 	up_write(&OCFS2_I(inode)->ip_xattr_sem);
 	ocfs2_inode_unlock(inode, 1);
 cleanup_nolock:
@@ -4802,6 +4839,9 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
 	struct ocfs2_xattr_entry *xe = xs->here;
 	struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
 	void *base;
+	struct ocfs2_xattr_value_buf vb = {
+		.vb_access = ocfs2_journal_access,
+	};
 
 	BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe));
 
@@ -4818,8 +4858,10 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
 	xv = (struct ocfs2_xattr_value_root *)(base + offset +
 		 OCFS2_XATTR_SIZE(xe->xe_name_len));
 
+	vb.vb_xv = xv;
+	vb.vb_bh = xs->bucket->bu_bhs[block_off];
 	ret = __ocfs2_xattr_set_value_outside(inode, handle,
-					      xv, val, value_len);
+					      &vb, val, value_len);
 	if (ret)
 		mlog_errno(ret);
 out:
@@ -5311,6 +5353,174 @@ out:
 }
 
 /*
+ * Whenever we modify a xattr value root in the bucket(e.g, CoW
+ * or change the extent record flag), we need to recalculate
+ * the metaecc for the whole bucket. So it is done here.
+ *
+ * Note:
+ * We have to give the extra credits for the caller.
+ */
+static int ocfs2_xattr_bucket_post_refcount(struct inode *inode,
+					    handle_t *handle,
+					    void *para)
+{
+	int ret;
+	struct ocfs2_xattr_bucket *bucket =
+			(struct ocfs2_xattr_bucket *)para;
+
+	ret = ocfs2_xattr_bucket_journal_access(handle, bucket,
+						OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	ocfs2_xattr_bucket_journal_dirty(handle, bucket);
+
+	return 0;
+}
+
+/*
+ * Special action we need if the xattr value is refcounted.
+ *
+ * 1. If the xattr is refcounted, lock the tree.
+ * 2. CoW the xattr if we are setting the new value and the value
+ *    will be stored outside.
+ * 3. In other case, decrease_refcount will work for us, so just
+ *    lock the refcount tree, calculate the meta and credits is OK.
+ *
+ * We have to do CoW before ocfs2_init_xattr_set_ctxt since
+ * currently CoW is a completed transaction, while this function
+ * will also lock the allocators and let us deadlock. So we will
+ * CoW the whole xattr value.
+ */
+static int ocfs2_prepare_refcount_xattr(struct inode *inode,
+					struct ocfs2_dinode *di,
+					struct ocfs2_xattr_info *xi,
+					struct ocfs2_xattr_search *xis,
+					struct ocfs2_xattr_search *xbs,
+					struct ocfs2_refcount_tree **ref_tree,
+					int *meta_add,
+					int *credits)
+{
+	int ret = 0;
+	struct ocfs2_xattr_block *xb;
+	struct ocfs2_xattr_entry *xe;
+	char *base;
+	u32 p_cluster, num_clusters;
+	unsigned int ext_flags;
+	int name_offset, name_len;
+	struct ocfs2_xattr_value_buf vb;
+	struct ocfs2_xattr_bucket *bucket = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_post_refcount refcount;
+	struct ocfs2_post_refcount *p = NULL;
+	struct buffer_head *ref_root_bh = NULL;
+
+	if (!xis->not_found) {
+		xe = xis->here;
+		name_offset = le16_to_cpu(xe->xe_name_offset);
+		name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+		base = xis->base;
+		vb.vb_bh = xis->inode_bh;
+		vb.vb_access = ocfs2_journal_access_di;
+	} else {
+		int i, block_off = 0;
+		xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
+		xe = xbs->here;
+		name_offset = le16_to_cpu(xe->xe_name_offset);
+		name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+		i = xbs->here - xbs->header->xh_entries;
+
+		if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
+			ret = ocfs2_xattr_bucket_get_name_value(inode,
+							bucket_xh(xbs->bucket),
+							i, &block_off,
+							&name_offset);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+			base = bucket_block(xbs->bucket, block_off);
+			vb.vb_bh = xbs->bucket->bu_bhs[block_off];
+			vb.vb_access = ocfs2_journal_access;
+
+			if (ocfs2_meta_ecc(osb)) {
+				/*create parameters for ocfs2_post_refcount. */
+				bucket = xbs->bucket;
+				refcount.credits = bucket->bu_blocks;
+				refcount.para = bucket;
+				refcount.func =
+					ocfs2_xattr_bucket_post_refcount;
+				p = &refcount;
+			}
+		} else {
+			base = xbs->base;
+			vb.vb_bh = xbs->xattr_bh;
+			vb.vb_access = ocfs2_journal_access_xb;
+		}
+	}
+
+	if (ocfs2_xattr_is_local(xe))
+		goto out;
+
+	vb.vb_xv = (struct ocfs2_xattr_value_root *)
+				(base + name_offset + name_len);
+
+	ret = ocfs2_xattr_get_clusters(inode, 0, &p_cluster,
+				       &num_clusters, &vb.vb_xv->xr_list,
+				       &ext_flags);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * We just need to check the 1st extent record, since we always
+	 * CoW the whole xattr. So there shouldn't be a xattr with
+	 * some REFCOUNT extent recs after the 1st one.
+	 */
+	if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
+		goto out;
+
+	ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
+				       1, ref_tree, &ref_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * If we are deleting the xattr or the new size will be stored inside,
+	 * cool, leave it there, the xattr truncate process will remove them
+	 * for us(it still needs the refcount tree lock and the meta, credits).
+	 * And the worse case is that every cluster truncate will split the
+	 * refcount tree, and make the original extent become 3. So we will need
+	 * 2 * cluster more extent recs at most.
+	 */
+	if (!xi->value || xi->value_len <= OCFS2_XATTR_INLINE_SIZE) {
+
+		ret = ocfs2_refcounted_xattr_delete_need(inode,
+							 &(*ref_tree)->rf_ci,
+							 ref_root_bh, vb.vb_xv,
+							 meta_add, credits);
+		if (ret)
+			mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_refcount_cow_xattr(inode, di, &vb,
+				       *ref_tree, ref_root_bh, 0,
+				       le32_to_cpu(vb.vb_xv->xr_clusters), p);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	brelse(ref_root_bh);
+	return ret;
+}
+
+/*
  * 'security' attributes support
  */
 static size_t ocfs2_xattr_security_list(struct inode *inode, char *list,
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 26/41] ocfs2: Remove inode from ocfs2_xattr_bucket_get_name_value.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (24 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 25/41] ocfs2: Add CoW support for xattr Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 27/41] ocfs2: Abstract the creation of xattr block Tao Ma
                   ` (15 subsequent siblings)
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

In ocfs2_xattr_bucket_get_name_value, actually we only use
super_block. So use it.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/xattr.c |   20 ++++++++++----------
 1 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index a538ceb..eeb5b7c 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -140,7 +140,7 @@ struct ocfs2_xattr_search {
 	int not_found;
 };
 
-static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
+static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb,
 					     struct ocfs2_xattr_header *xh,
 					     int index,
 					     int *block_off,
@@ -1101,7 +1101,7 @@ static int ocfs2_xattr_block_get(struct inode *inode,
 		i = xs->here - xs->header->xh_entries;
 
 		if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
-			ret = ocfs2_xattr_bucket_get_name_value(inode,
+			ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
 								bucket_xh(xs->bucket),
 								i,
 								&block_off,
@@ -2297,7 +2297,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
 		old_in_xb = 1;
 
 		if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
-			ret = ocfs2_xattr_bucket_get_name_value(inode,
+			ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
 							bucket_xh(xbs->bucket),
 							i, &block_off,
 							&name_offset);
@@ -2972,7 +2972,7 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode,
 		if (cmp)
 			continue;
 
-		ret = ocfs2_xattr_bucket_get_name_value(inode,
+		ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
 							xh,
 							i,
 							&block_off,
@@ -3216,7 +3216,7 @@ struct ocfs2_xattr_tree_list {
 	size_t result;
 };
 
-static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
+static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb,
 					     struct ocfs2_xattr_header *xh,
 					     int index,
 					     int *block_off,
@@ -3229,8 +3229,8 @@ static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
 
 	name_offset = le16_to_cpu(xh->xh_entries[index].xe_name_offset);
 
-	*block_off = name_offset >> inode->i_sb->s_blocksize_bits;
-	*new_offset = name_offset % inode->i_sb->s_blocksize;
+	*block_off = name_offset >> sb->s_blocksize_bits;
+	*new_offset = name_offset % sb->s_blocksize;
 
 	return 0;
 }
@@ -3250,7 +3250,7 @@ static int ocfs2_list_xattr_bucket(struct inode *inode,
 		prefix = ocfs2_xattr_prefix(type);
 
 		if (prefix) {
-			ret = ocfs2_xattr_bucket_get_name_value(inode,
+			ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
 								bucket_xh(bucket),
 								i,
 								&block_off,
@@ -4845,7 +4845,7 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
 
 	BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe));
 
-	ret = ocfs2_xattr_bucket_get_name_value(inode, xh,
+	ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, xh,
 						xe - xh->xh_entries,
 						&block_off,
 						&offset);
@@ -5433,7 +5433,7 @@ static int ocfs2_prepare_refcount_xattr(struct inode *inode,
 		i = xbs->here - xbs->header->xh_entries;
 
 		if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
-			ret = ocfs2_xattr_bucket_get_name_value(inode,
+			ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
 							bucket_xh(xbs->bucket),
 							i, &block_off,
 							&name_offset);
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 27/41] ocfs2: Abstract the creation of xattr block.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (25 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 26/41] ocfs2: Remove inode from ocfs2_xattr_bucket_get_name_value Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-21  1:22   ` Joel Becker
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 28/41] ocfs2: Abstract ocfs2 xattr tree extend rec iteration process Tao Ma
                   ` (14 subsequent siblings)
  41 siblings, 1 reply; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

In xattr reflink, we also need to create xattr block, so
abstract the process out.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/xattr.c |  115 +++++++++++++++++++++++++++++++++---------------------
 1 files changed, 70 insertions(+), 45 deletions(-)

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index eeb5b7c..a9339eb 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2105,6 +2105,72 @@ cleanup:
 	return ret;
 }
 
+static int ocfs2_create_xattr_block(handle_t *handle,
+				    struct inode *inode,
+				    struct buffer_head *inode_bh,
+				    struct ocfs2_alloc_context *meta_ac,
+				    struct buffer_head **ret_bh)
+{
+	int ret;
+	u16 suballoc_bit_start;
+	u32 num_got;
+	u64 first_blkno;
+	struct ocfs2_dinode *di =  (struct ocfs2_dinode *)inode_bh->b_data;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *new_bh = NULL;
+	struct ocfs2_xattr_block *xblk;
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), inode_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto end;
+	}
+
+	ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
+				   &suballoc_bit_start, &num_got,
+				   &first_blkno);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto end;
+	}
+
+	new_bh = sb_getblk(inode->i_sb, first_blkno);
+	ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
+
+	ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode),
+				      new_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto end;
+	}
+
+	/* Initialize ocfs2_xattr_block */
+	xblk = (struct ocfs2_xattr_block *)new_bh->b_data;
+	memset(xblk, 0, inode->i_sb->s_blocksize);
+	strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
+	xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num);
+	xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+	xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
+	xblk->xb_blkno = cpu_to_le64(first_blkno);
+
+	ret = ocfs2_journal_dirty(handle, new_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto end;
+	}
+	di->i_xattr_loc = cpu_to_le64(first_blkno);
+	ocfs2_journal_dirty(handle, inode_bh);
+
+	*ret_bh = new_bh;
+	new_bh = NULL;
+
+end:
+	brelse(new_bh);
+	return ret;
+}
+
 /*
  * ocfs2_xattr_block_set()
  *
@@ -2117,65 +2183,24 @@ static int ocfs2_xattr_block_set(struct inode *inode,
 				 struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	struct buffer_head *new_bh = NULL;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct ocfs2_dinode *di =  (struct ocfs2_dinode *)xs->inode_bh->b_data;
 	handle_t *handle = ctxt->handle;
 	struct ocfs2_xattr_block *xblk = NULL;
-	u16 suballoc_bit_start;
-	u32 num_got;
-	u64 first_blkno;
 	int ret;
 
 	if (!xs->xattr_bh) {
-		ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
-					      xs->inode_bh,
-					      OCFS2_JOURNAL_ACCESS_CREATE);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto end;
-		}
-
-		ret = ocfs2_claim_metadata(osb, handle, ctxt->meta_ac, 1,
-					   &suballoc_bit_start, &num_got,
-					   &first_blkno);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto end;
-		}
-
-		new_bh = sb_getblk(inode->i_sb, first_blkno);
-		ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
-
-		ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode),
-					      new_bh,
-					      OCFS2_JOURNAL_ACCESS_CREATE);
-		if (ret < 0) {
+		ret = ocfs2_create_xattr_block(handle, inode, xs->inode_bh,
+					       ctxt->meta_ac, &new_bh);
+		if (ret) {
 			mlog_errno(ret);
 			goto end;
 		}
 
-		/* Initialize ocfs2_xattr_block */
 		xs->xattr_bh = new_bh;
-		xblk = (struct ocfs2_xattr_block *)new_bh->b_data;
-		memset(xblk, 0, inode->i_sb->s_blocksize);
-		strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
-		xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num);
-		xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
-		xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
-		xblk->xb_blkno = cpu_to_le64(first_blkno);
-
+		xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
 		xs->header = &xblk->xb_attrs.xb_header;
 		xs->base = (void *)xs->header;
 		xs->end = (void *)xblk + inode->i_sb->s_blocksize;
 		xs->here = xs->header->xh_entries;
-
-		ret = ocfs2_journal_dirty(handle, new_bh);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto end;
-		}
-		di->i_xattr_loc = cpu_to_le64(first_blkno);
-		ocfs2_journal_dirty(handle, xs->inode_bh);
 	} else
 		xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
 
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 28/41] ocfs2: Abstract ocfs2 xattr tree extend rec iteration process.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (26 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 27/41] ocfs2: Abstract the creation of xattr block Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 29/41] ocfs2: Attach xattr clusters to refcount tree Tao Ma
                   ` (13 subsequent siblings)
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

Currently we have ocfs2_iterate_xattr_buckets which can receive
a para and a callback to iterate a series of bucket. It is good.
But actually the 2 callers ocfs2_xattr_tree_list_index_block and
ocfs2_delete_xattr_index_block are almost the same. The only
difference is that the latter need to handle the extent record
also. So add a new function named ocfs2_iterate_xattr_index_block.
It can be given func callback which are used for exten record.
So now we only have one iteration function for the xattr index
block. Ane what's more, it is useful for our future reflink
operations.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/xattr.c |  147 ++++++++++++++++++++++++++++--------------------------
 1 files changed, 76 insertions(+), 71 deletions(-)

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index a9339eb..bfa7ee2 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -157,7 +157,7 @@ static int ocfs2_xattr_index_block_find(struct inode *inode,
 					struct ocfs2_xattr_search *xs);
 
 static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
-					struct ocfs2_xattr_tree_root *xt,
+					struct buffer_head *blk_bh,
 					char *buffer,
 					size_t buffer_size);
 
@@ -170,8 +170,23 @@ static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
 					     struct ocfs2_xattr_search *xs,
 					     struct ocfs2_xattr_set_ctxt *ctxt);
 
-static int ocfs2_delete_xattr_index_block(struct inode *inode,
-					  struct buffer_head *xb_bh);
+typedef int (xattr_tree_rec_func)(struct inode *inode,
+				  struct buffer_head *root_bh,
+				  u64 blkno, u32 cpos, u32 len, void *para);
+static int ocfs2_iterate_xattr_index_block(struct inode *inode,
+					   struct buffer_head *root_bh,
+					   xattr_tree_rec_func *rec_func,
+					   void *para);
+static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
+					struct ocfs2_xattr_bucket *bucket,
+					void *para);
+static int ocfs2_rm_xattr_cluster(struct inode *inode,
+				  struct buffer_head *root_bh,
+				  u64 blkno,
+				  u32 cpos,
+				  u32 len,
+				  void *para);
+
 static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
 				  u64 src_blk, u64 last_blk, u64 to_blk,
 				  unsigned int start_bucket,
@@ -870,11 +885,9 @@ static int ocfs2_xattr_block_list(struct inode *inode,
 		struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
 		ret = ocfs2_xattr_list_entries(inode, header,
 					       buffer, buffer_size);
-	} else {
-		struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
-		ret = ocfs2_xattr_tree_list_index_block(inode, xt,
+	} else
+		ret = ocfs2_xattr_tree_list_index_block(inode, blk_bh,
 						   buffer, buffer_size);
-	}
 
 	brelse(blk_bh);
 
@@ -1801,7 +1814,10 @@ static int ocfs2_xattr_block_remove(struct inode *inode,
 		struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header);
 		ret = ocfs2_remove_value_outside(inode, &vb, header);
 	} else
-		ret = ocfs2_delete_xattr_index_block(inode, blk_bh);
+		ret = ocfs2_iterate_xattr_index_block(inode,
+						blk_bh,
+						ocfs2_rm_xattr_cluster,
+						NULL);
 
 	return ret;
 }
@@ -3298,22 +3314,19 @@ static int ocfs2_list_xattr_bucket(struct inode *inode,
 	return ret;
 }
 
-static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
-					     struct ocfs2_xattr_tree_root *xt,
-					     char *buffer,
-					     size_t buffer_size)
+static int ocfs2_iterate_xattr_index_block(struct inode *inode,
+					   struct buffer_head *blk_bh,
+					   xattr_tree_rec_func *rec_func,
+					   void *para)
 {
-	struct ocfs2_extent_list *el = &xt->xt_list;
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)blk_bh->b_data;
+	struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
 	int ret = 0;
 	u32 name_hash = UINT_MAX, e_cpos = 0, num_clusters = 0;
 	u64 p_blkno = 0;
-	struct ocfs2_xattr_tree_list xl = {
-		.buffer = buffer,
-		.buffer_size = buffer_size,
-		.result = 0,
-	};
 
-	if (le16_to_cpu(el->l_next_free_rec) == 0)
+	if (!el->l_next_free_rec || !rec_func)
 		return 0;
 
 	while (name_hash > 0) {
@@ -3321,16 +3334,15 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
 					  &e_cpos, &num_clusters, el);
 		if (ret) {
 			mlog_errno(ret);
-			goto out;
+			break;
 		}
 
-		ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters,
-						  ocfs2_list_xattr_bucket,
-						  &xl);
+		ret = rec_func(inode, blk_bh, p_blkno, e_cpos,
+			       num_clusters, para);
 		if (ret) {
 			if (ret != -ERANGE)
 				mlog_errno(ret);
-			goto out;
+			break;
 		}
 
 		if (e_cpos == 0)
@@ -3339,6 +3351,37 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
 		name_hash = e_cpos - 1;
 	}
 
+	return ret;
+
+}
+
+static int ocfs2_list_xattr_tree_rec(struct inode *inode,
+				     struct buffer_head *root_bh,
+				     u64 blkno, u32 cpos, u32 len, void *para)
+{
+	return ocfs2_iterate_xattr_buckets(inode, blkno, len,
+					   ocfs2_list_xattr_bucket, para);
+}
+
+static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
+					     struct buffer_head *blk_bh,
+					     char *buffer,
+					     size_t buffer_size)
+{
+	int ret;
+	struct ocfs2_xattr_tree_list xl = {
+		.buffer = buffer,
+		.buffer_size = buffer_size,
+		.result = 0,
+	};
+
+	ret = ocfs2_iterate_xattr_index_block(inode, blk_bh,
+					      ocfs2_list_xattr_tree_rec, &xl);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
 	ret = xl.result;
 out:
 	return ret;
@@ -4897,7 +4940,8 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
 				  struct buffer_head *root_bh,
 				  u64 blkno,
 				  u32 cpos,
-				  u32 len)
+				  u32 len,
+				  void *para)
 {
 	int ret;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -4909,6 +4953,13 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
 	struct ocfs2_cached_dealloc_ctxt dealloc;
 	struct ocfs2_extent_tree et;
 
+	ret = ocfs2_iterate_xattr_buckets(inode, blkno, len,
+					  ocfs2_delete_xattr_in_bucket, NULL);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
 	ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh);
 
 	ocfs2_init_dealloc_ctxt(&dealloc);
@@ -5331,52 +5382,6 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
 	return ret;
 }
 
-static int ocfs2_delete_xattr_index_block(struct inode *inode,
-					  struct buffer_head *xb_bh)
-{
-	struct ocfs2_xattr_block *xb =
-			(struct ocfs2_xattr_block *)xb_bh->b_data;
-	struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
-	int ret = 0;
-	u32 name_hash = UINT_MAX, e_cpos, num_clusters;
-	u64 p_blkno;
-
-	if (le16_to_cpu(el->l_next_free_rec) == 0)
-		return 0;
-
-	while (name_hash > 0) {
-		ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno,
-					  &e_cpos, &num_clusters, el);
-		if (ret) {
-			mlog_errno(ret);
-			goto out;
-		}
-
-		ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters,
-						  ocfs2_delete_xattr_in_bucket,
-						  NULL);
-		if (ret) {
-			mlog_errno(ret);
-			goto out;
-		}
-
-		ret = ocfs2_rm_xattr_cluster(inode, xb_bh,
-					     p_blkno, e_cpos, num_clusters);
-		if (ret) {
-			mlog_errno(ret);
-			break;
-		}
-
-		if (e_cpos == 0)
-			break;
-
-		name_hash = e_cpos - 1;
-	}
-
-out:
-	return ret;
-}
-
 /*
  * Whenever we modify a xattr value root in the bucket(e.g, CoW
  * or change the extent record flag), we need to recalculate
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 29/41] ocfs2: Attach xattr clusters to refcount tree.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (27 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 28/41] ocfs2: Abstract ocfs2 xattr tree extend rec iteration process Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 30/41] ocfs2: Call refcount tree remove process properly Tao Ma
                   ` (12 subsequent siblings)
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

In ocfs2, when xattr's value is larger than OCFS2_XATTR_INLINE_SIZE,
it will be kept outside of the blocks we store xattr entry. And they
are stored in a b-tree also. So this patch try to attach all these
clusters to refcount tree also.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/refcounttree.c |   29 ++++-
 fs/ocfs2/refcounttree.h |    7 +
 fs/ocfs2/xattr.c        |  291 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/xattr.h        |    6 +-
 4 files changed, 329 insertions(+), 4 deletions(-)

diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index d6db689..9638e46 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -3404,7 +3404,8 @@ int ocfs2_add_refcount_flag(struct inode *inode,
 			    struct ocfs2_caching_info *ref_ci,
 			    struct buffer_head *ref_root_bh,
 			    u32 cpos, u32 p_cluster, u32 num_clusters,
-			    struct ocfs2_cached_dealloc_ctxt *dealloc)
+			    struct ocfs2_cached_dealloc_ctxt *dealloc,
+			    struct ocfs2_post_refcount *post)
 {
 	int ret;
 	handle_t *handle;
@@ -3433,6 +3434,9 @@ int ocfs2_add_refcount_flag(struct inode *inode,
 		}
 	}
 
+	if (post)
+		credits += post->credits;
+
 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
@@ -3451,8 +3455,16 @@ int ocfs2_add_refcount_flag(struct inode *inode,
 	ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
 					p_cluster, num_clusters,
 					meta_ac, dealloc);
-	if (ret)
+	if (ret) {
 		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	if (post && post->func) {
+		ret = post->func(inode, handle, post->para);
+		if (ret)
+			mlog_errno(ret);
+	}
 
 out_commit:
 	ocfs2_commit_trans(osb, handle);
@@ -3545,7 +3557,7 @@ static int ocfs2_attach_refcount_tree(struct inode *inode,
 						      &ref_tree->rf_ci,
 						      ref_root_bh, cpos,
 						      p_cluster, num_clusters,
-						      &dealloc);
+						      &dealloc, NULL);
 			if (ret) {
 				mlog_errno(ret);
 				goto unlock;
@@ -3556,6 +3568,17 @@ static int ocfs2_attach_refcount_tree(struct inode *inode,
 		cpos += num_clusters;
 	}
 
+	if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
+		ret = ocfs2_xattr_attach_refcount_tree(inode, di_bh,
+						       &ref_tree->rf_ci,
+						       ref_root_bh,
+						       &dealloc);
+		if (ret) {
+			mlog_errno(ret);
+			goto unlock;
+		}
+	}
+
 	if (data_changed) {
 		ret = ocfs2_change_ctime(inode, di_bh);
 		if (ret)
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index cd62eeb..7796d1a 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -83,4 +83,11 @@ int ocfs2_refcount_cow_xattr(struct inode *inode,
 			     struct buffer_head *ref_root_bh,
 			     u32 cpos, u32 write_len,
 			     struct ocfs2_post_refcount *post);
+int ocfs2_add_refcount_flag(struct inode *inode,
+			    struct ocfs2_extent_tree *data_et,
+			    struct ocfs2_caching_info *ref_ci,
+			    struct buffer_head *ref_root_bh,
+			    u32 cpos, u32 p_cluster, u32 num_clusters,
+			    struct ocfs2_cached_dealloc_ctxt *dealloc,
+			    struct ocfs2_post_refcount *post);
 #endif /* OCFS2_REFCOUNTTREE_H */
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index bfa7ee2..9c3be01 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -5551,6 +5551,297 @@ out:
 }
 
 /*
+ * Add the REFCOUNTED flags for all the extent rec in ocfs2_xattr_value_root.
+ * The physical clusters will be added to refcount tree.
+ */
+static int ocfs2_xattr_value_attach_refcount(struct inode *inode,
+				struct ocfs2_xattr_value_root *xv,
+				struct ocfs2_extent_tree *value_et,
+				struct ocfs2_caching_info *ref_ci,
+				struct buffer_head *ref_root_bh,
+				struct ocfs2_cached_dealloc_ctxt *dealloc,
+				struct ocfs2_post_refcount *refcount)
+{
+	int ret = 0;
+	u32 clusters = le32_to_cpu(xv->xr_clusters);
+	u32 cpos, p_cluster, num_clusters;
+	struct ocfs2_extent_list *el = &xv->xr_list;
+	unsigned int ext_flags;
+
+	cpos = 0;
+	while (cpos < clusters) {
+		ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
+					       &num_clusters, el, &ext_flags);
+
+		cpos += num_clusters;
+		if ((ext_flags & OCFS2_EXT_REFCOUNTED))
+			continue;
+
+		BUG_ON(!p_cluster);
+
+		ret = ocfs2_add_refcount_flag(inode, value_et,
+					      ref_ci, ref_root_bh,
+					      cpos - num_clusters,
+					      p_cluster, num_clusters,
+					      dealloc, refcount);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Given a normal ocfs2_xattr_header, refcount all the entries which
+ * have value stored outside.
+ * Used for xattrs stored in inode and ocfs2_xattr_block.
+ */
+static int ocfs2_xattr_attach_refcount_normal(struct inode *inode,
+				struct ocfs2_xattr_value_buf *vb,
+				struct ocfs2_xattr_header *header,
+				struct ocfs2_caching_info *ref_ci,
+				struct buffer_head *ref_root_bh,
+				struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+
+	struct ocfs2_xattr_entry *xe;
+	struct ocfs2_xattr_value_root *xv;
+	struct ocfs2_extent_tree et;
+	int i, ret = 0;
+
+	for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
+		xe = &header->xh_entries[i];
+
+		if (ocfs2_xattr_is_local(xe))
+			continue;
+
+		xv = (struct ocfs2_xattr_value_root *)((void *)header +
+			le16_to_cpu(xe->xe_name_offset) +
+			OCFS2_XATTR_SIZE(xe->xe_name_len));
+
+		vb->vb_xv = xv;
+		ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
+
+		ret = ocfs2_xattr_value_attach_refcount(inode, xv, &et,
+							ref_ci, ref_root_bh,
+							dealloc, NULL);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static int ocfs2_xattr_inline_attach_refcount(struct inode *inode,
+				struct buffer_head *fe_bh,
+				struct ocfs2_caching_info *ref_ci,
+				struct buffer_head *ref_root_bh,
+				struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
+	struct ocfs2_xattr_header *header = (struct ocfs2_xattr_header *)
+				(fe_bh->b_data + inode->i_sb->s_blocksize -
+				le16_to_cpu(di->i_xattr_inline_size));
+	struct ocfs2_xattr_value_buf vb = {
+		.vb_bh = fe_bh,
+		.vb_access = ocfs2_journal_access_di,
+	};
+
+	return ocfs2_xattr_attach_refcount_normal(inode, &vb, header,
+						  ref_ci, ref_root_bh, dealloc);
+}
+
+struct ocfs2_xattr_tree_value_refcount_para {
+	struct ocfs2_caching_info *ref_ci;
+	struct buffer_head *ref_root_bh;
+	struct ocfs2_cached_dealloc_ctxt *dealloc;
+};
+
+static int ocfs2_get_xattr_tree_value_root(struct super_block *sb,
+					   struct ocfs2_xattr_bucket *bucket,
+					   int offset,
+					   struct ocfs2_xattr_value_root **xv,
+					   struct buffer_head **bh)
+{
+	int ret, block_off, name_offset;
+	struct ocfs2_xattr_header *xh = bucket_xh(bucket);
+	struct ocfs2_xattr_entry *xe = &xh->xh_entries[offset];
+	void *base;
+
+	ret = ocfs2_xattr_bucket_get_name_value(sb,
+						bucket_xh(bucket),
+						offset,
+						&block_off,
+						&name_offset);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	base = bucket_block(bucket, block_off);
+
+	*xv = (struct ocfs2_xattr_value_root *)(base + name_offset +
+			 OCFS2_XATTR_SIZE(xe->xe_name_len));
+
+	if (bh)
+		*bh = bucket->bu_bhs[block_off];
+out:
+	return ret;
+}
+
+/*
+ * For a given xattr bucket, refcount all the entries which
+ * have value stored outside.
+ */
+static int ocfs2_xattr_bucket_value_refcount(struct inode *inode,
+					     struct ocfs2_xattr_bucket *bucket,
+					     void *para)
+{
+	int i, ret = 0;
+	struct ocfs2_extent_tree et;
+	struct ocfs2_xattr_tree_value_refcount_para *ref =
+			(struct ocfs2_xattr_tree_value_refcount_para *)para;
+	struct ocfs2_xattr_header *xh =
+			(struct ocfs2_xattr_header *)bucket->bu_bhs[0]->b_data;
+	struct ocfs2_xattr_entry *xe;
+	struct ocfs2_xattr_value_buf vb = {
+		.vb_access = ocfs2_journal_access,
+	};
+	struct ocfs2_post_refcount refcount = {
+		.credits = bucket->bu_blocks,
+		.para = bucket,
+		.func = ocfs2_xattr_bucket_post_refcount,
+	};
+	struct ocfs2_post_refcount *p = NULL;
+
+	/* We only need post_refcount if we support metaecc. */
+	if (ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)))
+		p = &refcount;
+
+	mlog(0, "refcount bucket %llu, count = %u\n",
+	     (unsigned long long)bucket_blkno(bucket),
+	     le16_to_cpu(xh->xh_count));
+	for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+		xe = &xh->xh_entries[i];
+
+		if (ocfs2_xattr_is_local(xe))
+			continue;
+
+		ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket, i,
+						      &vb.vb_xv, &vb.vb_bh);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		ocfs2_init_xattr_value_extent_tree(&et,
+						   INODE_CACHE(inode), &vb);
+
+		ret = ocfs2_xattr_value_attach_refcount(inode, vb.vb_xv,
+							&et, ref->ref_ci,
+							ref->ref_root_bh,
+							ref->dealloc, p);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+	}
+
+	return ret;
+
+}
+
+static int ocfs2_refcount_xattr_tree_rec(struct inode *inode,
+				     struct buffer_head *root_bh,
+				     u64 blkno, u32 cpos, u32 len, void *para)
+{
+	return ocfs2_iterate_xattr_buckets(inode, blkno, len,
+					   ocfs2_xattr_bucket_value_refcount,
+					   para);
+}
+
+static int ocfs2_xattr_block_attach_refcount(struct inode *inode,
+				struct buffer_head *blk_bh,
+				struct ocfs2_caching_info *ref_ci,
+				struct buffer_head *ref_root_bh,
+				struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret = 0;
+	struct ocfs2_xattr_block *xb =
+				(struct ocfs2_xattr_block *)blk_bh->b_data;
+
+	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+		struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
+		struct ocfs2_xattr_value_buf vb = {
+			.vb_bh = blk_bh,
+			.vb_access = ocfs2_journal_access_xb,
+		};
+
+		ret = ocfs2_xattr_attach_refcount_normal(inode, &vb, header,
+							 ref_ci, ref_root_bh,
+							 dealloc);
+	} else {
+		struct ocfs2_xattr_tree_value_refcount_para para = {
+			.ref_ci = ref_ci,
+			.ref_root_bh = ref_root_bh,
+			.dealloc = dealloc,
+		};
+
+		ret = ocfs2_iterate_xattr_index_block(inode, blk_bh,
+						ocfs2_refcount_xattr_tree_rec,
+						&para);
+	}
+
+	return ret;
+}
+
+int ocfs2_xattr_attach_refcount_tree(struct inode *inode,
+				     struct buffer_head *fe_bh,
+				     struct ocfs2_caching_info *ref_ci,
+				     struct buffer_head *ref_root_bh,
+				     struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
+	struct buffer_head *blk_bh = NULL;
+
+	if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
+		ret = ocfs2_xattr_inline_attach_refcount(inode, fe_bh,
+							 ref_ci, ref_root_bh,
+							 dealloc);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (!di->i_xattr_loc)
+		goto out;
+
+	ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc),
+				     &blk_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_xattr_block_attach_refcount(inode, blk_bh, ref_ci,
+						ref_root_bh, dealloc);
+	if (ret)
+		mlog_errno(ret);
+
+	brelse(blk_bh);
+out:
+
+	return ret;
+}
+
+/*
  * 'security' attributes support
  */
 static size_t ocfs2_xattr_security_list(struct inode *inode, char *list,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 1ca7e9a..a3295d7 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -83,5 +83,9 @@ struct ocfs2_xattr_value_buf {
 	struct ocfs2_xattr_value_root	*vb_xv;
 };
 
-
+int ocfs2_xattr_attach_refcount_tree(struct inode *inode,
+				     struct buffer_head *fe_bh,
+				     struct ocfs2_caching_info *ref_ci,
+				     struct buffer_head *ref_root_bh,
+				     struct ocfs2_cached_dealloc_ctxt *dealloc);
 #endif /* OCFS2_XATTR_H */
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 30/41] ocfs2: Call refcount tree remove process properly.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (28 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 29/41] ocfs2: Attach xattr clusters to refcount tree Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 31/41] ocfs2: Create an xattr indexed block if needed Tao Ma
                   ` (11 subsequent siblings)
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

Now with xattr refcount support, we need to check whether
we have xattr refcounted before we remove the refcount tree.

Now the mechanism is:
1) Check whether i_clusters == 0, if no, exit.
2) check whether we have i_xattr_loc in dinode. if yes, exit.
2) Check whether we have inline xattr stored outside, if yes, exit.
4) Remove the tree.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/file.c         |    2 ++
 fs/ocfs2/inode.c        |    7 +++++++
 fs/ocfs2/refcounttree.c |   36 ++++++++++++++++++++++++++++++++++++
 fs/ocfs2/refcounttree.h |    3 +++
 fs/ocfs2/xattr.c        |   23 +++++++++++++++++++++++
 fs/ocfs2/xattr.h        |    2 ++
 6 files changed, 73 insertions(+), 0 deletions(-)

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 1254d72..ccbcc82 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -531,6 +531,8 @@ bail_unlock_sem:
 	up_write(&OCFS2_I(inode)->ip_alloc_sem);
 
 bail:
+	if (!status && OCFS2_I(inode)->ip_clusters == 0)
+		status = ocfs2_try_remove_refcount_tree(inode, di_bh);
 
 	mlog_exit(status);
 	return status;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index e82ceb3..0297fb8 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -53,6 +53,7 @@
 #include "sysfile.h"
 #include "uptodate.h"
 #include "xattr.h"
+#include "refcounttree.h"
 
 #include "buffer_head_io.h"
 
@@ -782,6 +783,12 @@ static int ocfs2_wipe_inode(struct inode *inode,
 		goto bail_unlock_dir;
 	}
 
+	status = ocfs2_remove_refcount_tree(inode, di_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_unlock_dir;
+	}
+
 	status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode,
 				    orphan_dir_bh);
 	if (status < 0)
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 9638e46..aec8bf1 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -925,6 +925,42 @@ out:
 }
 
 /*
+ * Try to remove refcount tree. The mechanism is:
+ * 1) Check whether i_clusters == 0, if no, exit.
+ * 2) check whether we have i_xattr_loc in dinode. if yes, exit.
+ * 3) Check whether we have inline xattr stored outside, if yes, exit.
+ * 4) Remove the tree.
+ */
+int ocfs2_try_remove_refcount_tree(struct inode *inode,
+				   struct buffer_head *di_bh)
+{
+	int ret;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	down_write(&oi->ip_xattr_sem);
+	down_write(&oi->ip_alloc_sem);
+
+	if (oi->ip_clusters)
+		goto out;
+
+	if ((oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) && di->i_xattr_loc)
+		goto out;
+
+	if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL &&
+	    ocfs2_has_inline_xattr_value_outside(inode, di))
+		goto out;
+
+	ret = ocfs2_remove_refcount_tree(inode, di_bh);
+	if (ret)
+		mlog_errno(ret);
+out:
+	up_write(&oi->ip_alloc_sem);
+	up_write(&oi->ip_xattr_sem);
+	return 0;
+}
+
+/*
  * Given a cpos and len, try to find the refcount record which contains cpos.
  * 1. If cpos can be found in one refcount record, return the record.
  * 2. If cpos can't be found, return a fake record which start from cpos
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 7796d1a..e46406a 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -90,4 +90,7 @@ int ocfs2_add_refcount_flag(struct inode *inode,
 			    u32 cpos, u32 p_cluster, u32 num_clusters,
 			    struct ocfs2_cached_dealloc_ctxt *dealloc,
 			    struct ocfs2_post_refcount *post);
+int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh);
+int ocfs2_try_remove_refcount_tree(struct inode *inode,
+				   struct buffer_head *di_bh);
 #endif /* OCFS2_REFCOUNTTREE_H */
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 9c3be01..7f9c3ac 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -840,6 +840,23 @@ static int ocfs2_xattr_list_entries(struct inode *inode,
 	return result;
 }
 
+int ocfs2_has_inline_xattr_value_outside(struct inode *inode,
+					 struct ocfs2_dinode *di)
+{
+	struct ocfs2_xattr_header *xh;
+	int i;
+
+	xh = (struct ocfs2_xattr_header *)
+		 ((void *)di + inode->i_sb->s_blocksize -
+		 le16_to_cpu(di->i_xattr_inline_size));
+
+	for (i = 0; i < le16_to_cpu(xh->xh_count); i++)
+		if (!ocfs2_xattr_is_local(&xh->xh_entries[i]))
+			return 1;
+
+	return 0;
+}
+
 static int ocfs2_xattr_ibody_list(struct inode *inode,
 				  struct ocfs2_dinode *di,
 				  char *buffer,
@@ -2898,10 +2915,16 @@ int ocfs2_xattr_set(struct inode *inode,
 	if (ocfs2_dealloc_has_cluster(&ctxt.dealloc))
 		ocfs2_schedule_truncate_log_flush(osb, 1);
 	ocfs2_run_deallocs(osb, &ctxt.dealloc);
+
 cleanup:
 	if (ref_tree)
 		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
 	up_write(&OCFS2_I(inode)->ip_xattr_sem);
+	if (!value && !ret) {
+		ret = ocfs2_try_remove_refcount_tree(inode, di_bh);
+		if (ret)
+			mlog_errno(ret);
+	}
 	ocfs2_inode_unlock(inode, 1);
 cleanup_nolock:
 	brelse(di_bh);
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index a3295d7..e74703f 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -55,6 +55,8 @@ int ocfs2_xattr_set_handle(handle_t *, struct inode *, struct buffer_head *,
 			   int, const char *, const void *, size_t, int,
 			   struct ocfs2_alloc_context *,
 			   struct ocfs2_alloc_context *);
+int ocfs2_has_inline_xattr_value_outside(struct inode *inode,
+					 struct ocfs2_dinode *di);
 int ocfs2_xattr_remove(struct inode *, struct buffer_head *);
 int ocfs2_init_security_get(struct inode *, struct inode *,
 			    struct ocfs2_security_xattr_info *);
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 31/41] ocfs2: Create an xattr indexed block if needed.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (29 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 30/41] ocfs2: Call refcount tree remove process properly Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 32/41] ocfs2: Add reflink support for xattr Tao Ma
                   ` (10 subsequent siblings)
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

With reflink, there is a need that we create a new xattr indexed
block from the very beginning. So add a new parameter for
ocfs2_create_xattr_block.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/xattr.c |   16 ++++++++++++++--
 1 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 7f9c3ac..8621038 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2142,7 +2142,8 @@ static int ocfs2_create_xattr_block(handle_t *handle,
 				    struct inode *inode,
 				    struct buffer_head *inode_bh,
 				    struct ocfs2_alloc_context *meta_ac,
-				    struct buffer_head **ret_bh)
+				    struct buffer_head **ret_bh,
+				    int indexed)
 {
 	int ret;
 	u16 suballoc_bit_start;
@@ -2188,6 +2189,17 @@ static int ocfs2_create_xattr_block(handle_t *handle,
 	xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
 	xblk->xb_blkno = cpu_to_le64(first_blkno);
 
+	if (indexed) {
+		struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root;
+		xr->xt_clusters = cpu_to_le32(1);
+		xr->xt_last_eb_blk = 0;
+		xr->xt_list.l_tree_depth = 0;
+		xr->xt_list.l_count = cpu_to_le16(
+					ocfs2_xattr_recs_per_xb(inode->i_sb));
+		xr->xt_list.l_next_free_rec = cpu_to_le16(1);
+		xblk->xb_flags = cpu_to_le16(OCFS2_XATTR_INDEXED);
+	}
+
 	ret = ocfs2_journal_dirty(handle, new_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
@@ -2222,7 +2234,7 @@ static int ocfs2_xattr_block_set(struct inode *inode,
 
 	if (!xs->xattr_bh) {
 		ret = ocfs2_create_xattr_block(handle, inode, xs->inode_bh,
-					       ctxt->meta_ac, &new_bh);
+					       ctxt->meta_ac, &new_bh, 0);
 		if (ret) {
 			mlog_errno(ret);
 			goto end;
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 32/41] ocfs2: Add reflink support for xattr.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (30 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 31/41] ocfs2: Create an xattr indexed block if needed Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 33/41] ocfs2: Modify removing xattr process for refcount Tao Ma
                   ` (9 subsequent siblings)
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/refcounttree.c |   24 +-
 fs/ocfs2/refcounttree.h |    6 +
 fs/ocfs2/xattr.c        |  923 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/xattr.h        |    4 +
 4 files changed, 945 insertions(+), 12 deletions(-)

diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index aec8bf1..b80fa18 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -1892,12 +1892,12 @@ out:
 	return ret;
 }
 
-static int __ocfs2_increase_refcount(handle_t *handle,
-				     struct ocfs2_caching_info *ci,
-				     struct buffer_head *ref_root_bh,
-				     u64 cpos, u32 len,
-				     struct ocfs2_alloc_context *meta_ac,
-				     struct ocfs2_cached_dealloc_ctxt *dealloc)
+int ocfs2_increase_refcount(handle_t *handle,
+			    struct ocfs2_caching_info *ci,
+			    struct buffer_head *ref_root_bh,
+			    u64 cpos, u32 len,
+			    struct ocfs2_alloc_context *meta_ac,
+			    struct ocfs2_cached_dealloc_ctxt *dealloc)
 {
 	int ret = 0, index;
 	struct buffer_head *ref_leaf_bh = NULL;
@@ -3488,9 +3488,9 @@ int ocfs2_add_refcount_flag(struct inode *inode,
 		goto out_commit;
 	}
 
-	ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
-					p_cluster, num_clusters,
-					meta_ac, dealloc);
+	ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
+				      p_cluster, num_clusters,
+				      meta_ac, dealloc);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -3679,9 +3679,9 @@ static int ocfs2_add_refcounted_extent(struct inode *inode,
 		goto out_commit;
 	}
 
-	ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
-					p_cluster, num_clusters,
-					meta_ac, dealloc);
+	ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
+				      p_cluster, num_clusters,
+				      meta_ac, dealloc);
 	if (ret)
 		mlog_errno(ret);
 
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index e46406a..7b769b0 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -93,4 +93,10 @@ int ocfs2_add_refcount_flag(struct inode *inode,
 int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh);
 int ocfs2_try_remove_refcount_tree(struct inode *inode,
 				   struct buffer_head *di_bh);
+int ocfs2_increase_refcount(handle_t *handle,
+			    struct ocfs2_caching_info *ci,
+			    struct buffer_head *ref_root_bh,
+			    u64 cpos, u32 len,
+			    struct ocfs2_alloc_context *meta_ac,
+			    struct ocfs2_cached_dealloc_ctxt *dealloc);
 #endif /* OCFS2_REFCOUNTTREE_H */
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 8621038..c32ef2c 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -5877,6 +5877,929 @@ out:
 }
 
 /*
+ * Store the information we need in xattr reflink.
+ * old_bh and new_bh are inode bh for the old and new inode.
+ */
+struct ocfs2_xattr_reflink {
+	struct inode *old_inode;
+	struct inode *new_inode;
+	struct buffer_head *old_bh;
+	struct buffer_head *new_bh;
+	struct ocfs2_caching_info *ref_ci;
+	struct buffer_head *ref_root_bh;
+	struct ocfs2_cached_dealloc_ctxt *dealloc;
+};
+
+/*
+ * Given a xattr header and xe offset,
+ * return the proper xv and the corresponding bh.
+ * xattr in inode, block and xattr tree have different implementaions.
+ */
+typedef int (get_xattr_value_root)(struct super_block *sb,
+				   struct buffer_head *bh,
+				   struct ocfs2_xattr_header *xh,
+				   int offset,
+				   struct ocfs2_xattr_value_root **xv,
+				   struct buffer_head **ret_bh,
+				   void *para);
+
+/*
+ * Calculate all the xattr value root metadata stored in this xattr header and
+ * credits we need if we create them from the scratch.
+ * We use get_xattr_value_root so that all types of xattr container can use it.
+ */
+static int ocfs2_value_metas_in_xattr_header(struct super_block *sb,
+					     struct buffer_head *bh,
+					     struct ocfs2_xattr_header *xh,
+					     int *metas, int *credits,
+					     int *num_recs,
+					     get_xattr_value_root *func,
+					     void *para)
+{
+	int i, ret = 0;
+	struct ocfs2_xattr_value_root *xv;
+	struct ocfs2_xattr_entry *xe;
+
+	for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+		xe = &xh->xh_entries[i];
+		if (ocfs2_xattr_is_local(xe))
+			continue;
+
+		ret = func(sb, bh, xh, i, &xv, NULL, para);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		*metas += le16_to_cpu(xv->xr_list.l_tree_depth) *
+			  le16_to_cpu(xv->xr_list.l_next_free_rec);
+
+		*credits += ocfs2_calc_extend_credits(sb,
+						&def_xv.xv.xr_list,
+						le32_to_cpu(xv->xr_clusters));
+
+		/*
+		 * If the value is a tree with depth > 1, We don't go deep
+		 * to the extent block, so just calculate a maximum record num.
+		 */
+		if (!xv->xr_list.l_tree_depth)
+			*num_recs += xv->xr_list.l_next_free_rec;
+		else
+			*num_recs += ocfs2_clusters_for_bytes(sb,
+							      XATTR_SIZE_MAX);
+	}
+
+	return ret;
+}
+
+/* Used by xattr inode and block to return the right xv and buffer_head. */
+static int ocfs2_get_xattr_value_root(struct super_block *sb,
+				      struct buffer_head *bh,
+				      struct ocfs2_xattr_header *xh,
+				      int offset,
+				      struct ocfs2_xattr_value_root **xv,
+				      struct buffer_head **ret_bh,
+				      void *para)
+{
+	struct ocfs2_xattr_entry *xe = &xh->xh_entries[offset];
+
+	*xv = (struct ocfs2_xattr_value_root *)((void *)xh +
+		le16_to_cpu(xe->xe_name_offset) +
+		OCFS2_XATTR_SIZE(xe->xe_name_len));
+
+	if (ret_bh)
+		*ret_bh = bh;
+
+	return 0;
+}
+
+/*
+ * Lock the meta_ac and caculate how much credits we need for reflink xattrs.
+ * It is only used for inline xattr and xattr block.
+ */
+static int ocfs2_reflink_lock_xattr_allocators(struct ocfs2_super *osb,
+					struct ocfs2_xattr_header *xh,
+					struct buffer_head *ref_root_bh,
+					int *credits,
+					struct ocfs2_alloc_context **meta_ac)
+{
+	int ret, meta_add = 0, num_recs = 0;
+	struct ocfs2_refcount_block *rb =
+			(struct ocfs2_refcount_block *)ref_root_bh->b_data;
+
+	*credits = 0;
+
+	ret = ocfs2_value_metas_in_xattr_header(osb->sb, NULL, xh,
+						&meta_add, credits, &num_recs,
+						ocfs2_get_xattr_value_root,
+						NULL);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * We need to add/modify num_recs in refcount tree, so just calculate
+	 * an approximate number we need for refcount tree change.
+	 * Sometimes we need to split the tree, and after split,  half recs
+	 * will be moved to the new block, and a new block can only provide
+	 * half number of recs. So we multiple new blocks by 2.
+	 */
+	num_recs = num_recs / ocfs2_refcount_recs_per_rb(osb->sb) * 2;
+	meta_add += num_recs;
+	*credits += num_recs + num_recs * OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
+	if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)
+		*credits += le16_to_cpu(rb->rf_list.l_tree_depth) *
+			    le16_to_cpu(rb->rf_list.l_next_free_rec) + 1;
+	else
+		*credits += 1;
+
+	ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add, meta_ac);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	return ret;
+}
+
+/*
+ * Given a xattr header, reflink all the xattrs in this container.
+ * It can be used for inode, block and bucket.
+ *
+ * NOTE:
+ * Before we call this function, the caller has memcpy the xattr in
+ * old_xh to the new_xh.
+ */
+static int ocfs2_reflink_xattr_header(handle_t *handle,
+				      struct ocfs2_xattr_reflink *args,
+				      struct buffer_head *old_bh,
+				      struct ocfs2_xattr_header *xh,
+				      struct buffer_head *new_bh,
+				      struct ocfs2_xattr_header *new_xh,
+				      struct ocfs2_xattr_value_buf *vb,
+				      struct ocfs2_alloc_context *meta_ac,
+				      get_xattr_value_root *func,
+				      void *para)
+{
+	int ret = 0, i;
+	struct super_block *sb = args->old_inode->i_sb;
+	struct buffer_head *value_bh;
+	struct ocfs2_xattr_entry *xe;
+	struct ocfs2_xattr_value_root *xv, *new_xv;
+	struct ocfs2_extent_tree data_et;
+	u32 clusters, cpos, p_cluster, num_clusters;
+	unsigned int ext_flags = 0;
+
+	mlog(0, "reflink xattr in container %llu, count = %u\n",
+	     (unsigned long long)old_bh->b_blocknr, le16_to_cpu(xh->xh_count));
+	for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+		xe = &xh->xh_entries[i];
+
+		if (ocfs2_xattr_is_local(xe))
+			continue;
+
+		ret = func(sb, old_bh, xh, i, &xv, NULL, para);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		ret = func(sb, new_bh, new_xh, i, &new_xv, &value_bh, para);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		/*
+		 * For the xattr which has l_tree_depth = 0, all the extent
+		 * recs have already be copied to the new xh with the
+		 * propriate OCFS2_EXT_REFCOUNTED flag we just need to
+		 * increase the refount count int the refcount tree.
+		 *
+		 * For the xattr which has l_tree_depth > 0, we need
+		 * to initialize it to the empty default value root,
+		 * and then insert the extents one by one.
+		 */
+		if (xv->xr_list.l_tree_depth) {
+			memcpy(new_xv, &def_xv, sizeof(def_xv));
+			vb->vb_xv = new_xv;
+			vb->vb_bh = value_bh;
+			ocfs2_init_xattr_value_extent_tree(&data_et,
+					INODE_CACHE(args->new_inode), vb);
+		}
+
+		clusters = le32_to_cpu(xv->xr_clusters);
+		cpos = 0;
+		while (cpos < clusters) {
+			ret = ocfs2_xattr_get_clusters(args->old_inode,
+						       cpos,
+						       &p_cluster,
+						       &num_clusters,
+						       &xv->xr_list,
+						       &ext_flags);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			BUG_ON(!p_cluster);
+
+			if (xv->xr_list.l_tree_depth) {
+				ret = ocfs2_insert_extent(handle,
+						&data_et, cpos,
+						ocfs2_clusters_to_blocks(
+							args->old_inode->i_sb,
+							p_cluster),
+						num_clusters, ext_flags,
+						meta_ac);
+				if (ret) {
+					mlog_errno(ret);
+					goto out;
+				}
+			}
+
+			ret = ocfs2_increase_refcount(handle, args->ref_ci,
+						      args->ref_root_bh,
+						      p_cluster, num_clusters,
+						      meta_ac, args->dealloc);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			cpos += num_clusters;
+		}
+	}
+
+out:
+	return ret;
+}
+
+static int ocfs2_reflink_xattr_inline(struct ocfs2_xattr_reflink *args)
+{
+	int ret = 0, credits = 0;
+	handle_t *handle;
+	struct ocfs2_super *osb = OCFS2_SB(args->old_inode->i_sb);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)args->old_bh->b_data;
+	int inline_size = le16_to_cpu(di->i_xattr_inline_size);
+	int header_off = osb->sb->s_blocksize - inline_size;
+	struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)
+					(args->old_bh->b_data + header_off);
+	struct ocfs2_xattr_header *new_xh = (struct ocfs2_xattr_header *)
+					(args->new_bh->b_data + header_off);
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_inode_info *new_oi;
+	struct ocfs2_dinode *new_di;
+	struct ocfs2_xattr_value_buf vb = {
+		.vb_bh = args->new_bh,
+		.vb_access = ocfs2_journal_access_di,
+	};
+
+	ret = ocfs2_reflink_lock_xattr_allocators(osb, xh, args->ref_root_bh,
+						  &credits, &meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(args->new_inode),
+				      args->new_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	memcpy(args->new_bh->b_data + header_off,
+	       args->old_bh->b_data + header_off, inline_size);
+
+	new_di = (struct ocfs2_dinode *)args->new_bh->b_data;
+	new_di->i_xattr_inline_size = cpu_to_le16(inline_size);
+
+	ret = ocfs2_reflink_xattr_header(handle, args, args->old_bh, xh,
+					 args->new_bh, new_xh, &vb, meta_ac,
+					 ocfs2_get_xattr_value_root, NULL);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	new_oi = OCFS2_I(args->new_inode);
+	spin_lock(&new_oi->ip_lock);
+	new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL | OCFS2_INLINE_XATTR_FL;
+	new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features);
+	spin_unlock(&new_oi->ip_lock);
+
+	ocfs2_journal_dirty(handle, args->new_bh);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out:
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+	return ret;
+}
+
+static int ocfs2_create_empty_xattr_block(struct inode *inode,
+					  struct buffer_head *fe_bh,
+					  struct buffer_head **ret_bh,
+					  int indexed)
+{
+	int ret;
+	handle_t *handle;
+	struct ocfs2_alloc_context *meta_ac;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	mlog(0, "create new xattr block for inode %llu, index = %d\n",
+	     (unsigned long long)fe_bh->b_blocknr, indexed);
+	ret = ocfs2_create_xattr_block(handle, inode, fe_bh,
+				       meta_ac, ret_bh, indexed);
+	if (ret)
+		mlog_errno(ret);
+
+	ocfs2_commit_trans(osb, handle);
+out:
+	ocfs2_free_alloc_context(meta_ac);
+	return ret;
+}
+
+static int ocfs2_reflink_xattr_block(struct ocfs2_xattr_reflink *args,
+				     struct buffer_head *blk_bh,
+				     struct buffer_head *new_blk_bh)
+{
+	int ret = 0, credits = 0;
+	handle_t *handle;
+	struct ocfs2_inode_info *new_oi = OCFS2_I(args->new_inode);
+	struct ocfs2_dinode *new_di;
+	struct ocfs2_super *osb = OCFS2_SB(args->new_inode->i_sb);
+	int header_off = offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header);
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)blk_bh->b_data;
+	struct ocfs2_xattr_header *xh = &xb->xb_attrs.xb_header;
+	struct ocfs2_xattr_block *new_xb =
+			(struct ocfs2_xattr_block *)new_blk_bh->b_data;
+	struct ocfs2_xattr_header *new_xh = &new_xb->xb_attrs.xb_header;
+	struct ocfs2_alloc_context *meta_ac;
+	struct ocfs2_xattr_value_buf vb = {
+		.vb_bh = new_blk_bh,
+		.vb_access = ocfs2_journal_access_xb,
+	};
+
+	ret = ocfs2_reflink_lock_xattr_allocators(osb, xh, args->ref_root_bh,
+						  &credits, &meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	/* One more credits in case we need to add xattr flags in new inode. */
+	handle = ocfs2_start_trans(osb, credits + 1);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (!(new_oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) {
+		ret = ocfs2_journal_access_di(handle,
+					      INODE_CACHE(args->new_inode),
+					      args->new_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+	}
+
+	ret = ocfs2_journal_access_xb(handle, INODE_CACHE(args->new_inode),
+				      new_blk_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	memcpy(new_blk_bh->b_data + header_off, blk_bh->b_data + header_off,
+	       osb->sb->s_blocksize - header_off);
+
+	ret = ocfs2_reflink_xattr_header(handle, args, blk_bh, xh,
+					 new_blk_bh, new_xh, &vb, meta_ac,
+					 ocfs2_get_xattr_value_root, NULL);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ocfs2_journal_dirty(handle, new_blk_bh);
+
+	if (!(new_oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) {
+		new_di = (struct ocfs2_dinode *)args->new_bh->b_data;
+		spin_lock(&new_oi->ip_lock);
+		new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL;
+		new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features);
+		spin_unlock(&new_oi->ip_lock);
+
+		ocfs2_journal_dirty(handle, args->new_bh);
+	}
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out:
+	ocfs2_free_alloc_context(meta_ac);
+	return ret;
+}
+
+struct ocfs2_reflink_xattr_tree_args {
+	struct ocfs2_xattr_reflink *reflink;
+	struct buffer_head *old_blk_bh;
+	struct buffer_head *new_blk_bh;
+	struct ocfs2_xattr_bucket *old_bucket;
+	struct ocfs2_xattr_bucket *new_bucket;
+};
+
+/*
+ * NOTE:
+ * We have to handle the case that both old bucket and new bucket
+ * will call this function to get the right ret_bh.
+ * So The caller must give us the right bh.
+ */
+static int ocfs2_get_reflink_xattr_value_root(struct super_block *sb,
+					struct buffer_head *bh,
+					struct ocfs2_xattr_header *xh,
+					int offset,
+					struct ocfs2_xattr_value_root **xv,
+					struct buffer_head **ret_bh,
+					void *para)
+{
+	struct ocfs2_reflink_xattr_tree_args *args =
+			(struct ocfs2_reflink_xattr_tree_args *)para;
+	struct ocfs2_xattr_bucket *bucket;
+
+	if (bh == args->old_bucket->bu_bhs[0])
+		bucket = args->old_bucket;
+	else
+		bucket = args->new_bucket;
+
+	return ocfs2_get_xattr_tree_value_root(sb, bucket, offset,
+					       xv, ret_bh);
+}
+
+struct ocfs2_value_tree_metas {
+	int num_metas;
+	int credits;
+	int num_recs;
+};
+
+static int ocfs2_value_tree_metas_in_bucket(struct super_block *sb,
+					struct buffer_head *bh,
+					struct ocfs2_xattr_header *xh,
+					int offset,
+					struct ocfs2_xattr_value_root **xv,
+					struct buffer_head **ret_bh,
+					void *para)
+{
+	struct ocfs2_xattr_bucket *bucket =
+				(struct ocfs2_xattr_bucket *)para;
+
+	return ocfs2_get_xattr_tree_value_root(sb, bucket, offset,
+					       xv, ret_bh);
+}
+
+static int ocfs2_calc_value_tree_metas(struct inode *inode,
+				      struct ocfs2_xattr_bucket *bucket,
+				      void *para)
+{
+	struct ocfs2_value_tree_metas *metas =
+			(struct ocfs2_value_tree_metas *)para;
+	struct ocfs2_xattr_header *xh =
+			(struct ocfs2_xattr_header *)bucket->bu_bhs[0]->b_data;
+
+	/* Add the credits for this bucket first. */
+	metas->credits += bucket->bu_blocks;
+	return ocfs2_value_metas_in_xattr_header(inode->i_sb, bucket->bu_bhs[0],
+					xh, &metas->num_metas,
+					&metas->credits, &metas->num_recs,
+					ocfs2_value_tree_metas_in_bucket,
+					bucket);
+}
+
+/*
+ * Given a xattr extent rec starting from blkno and having len clusters,
+ * iterate all the buckets calculate how much metadata we need for reflinking
+ * all the ocfs2_xattr_value_root and lock the allocators accordingly.
+ */
+static int ocfs2_lock_reflink_xattr_rec_allocators(
+				struct ocfs2_reflink_xattr_tree_args *args,
+				struct ocfs2_extent_tree *xt_et,
+				u64 blkno, u32 len, int *credits,
+				struct ocfs2_alloc_context **meta_ac,
+				struct ocfs2_alloc_context **data_ac)
+{
+	int ret, num_free_extents;
+	struct ocfs2_value_tree_metas metas;
+	struct ocfs2_super *osb = OCFS2_SB(args->reflink->old_inode->i_sb);
+	struct ocfs2_refcount_block *rb;
+
+	memset(&metas, 0, sizeof(metas));
+
+	ret = ocfs2_iterate_xattr_buckets(args->reflink->old_inode, blkno, len,
+					  ocfs2_calc_value_tree_metas, &metas);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	*credits = metas.credits;
+
+	/*
+	 * Calculate we need for refcount tree change.
+	 *
+	 * We need to add/modify num_recs in refcount tree, so just calculate
+	 * an approximate number we need for refcount tree change.
+	 * Sometimes we need to split the tree, and after split,  half recs
+	 * will be moved to the new block, and a new block can only provide
+	 * half number of recs. So we multiple new blocks by 2.
+	 * In the end, we have to add credits for modifying the already
+	 * existed refcount block.
+	 */
+	rb = (struct ocfs2_refcount_block *)args->reflink->ref_root_bh->b_data;
+	metas.num_recs =
+		(metas.num_recs + ocfs2_refcount_recs_per_rb(osb->sb) - 1) /
+		 ocfs2_refcount_recs_per_rb(osb->sb) * 2;
+	metas.num_metas += metas.num_recs;
+	*credits += metas.num_recs +
+		    metas.num_recs * OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
+	if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)
+		*credits += le16_to_cpu(rb->rf_list.l_tree_depth) *
+			    le16_to_cpu(rb->rf_list.l_next_free_rec) + 1;
+	else
+		*credits += 1;
+
+	/* count in the xattr tree change. */
+	num_free_extents = ocfs2_num_free_extents(osb, xt_et);
+	if (num_free_extents < 0) {
+		ret = num_free_extents;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (num_free_extents < len)
+		metas.num_metas += ocfs2_extend_meta_needed(xt_et->et_root_el);
+
+	*credits += ocfs2_calc_extend_credits(osb->sb,
+					      xt_et->et_root_el, len);
+
+	if (metas.num_metas) {
+		ret = ocfs2_reserve_new_metadata_blocks(osb, metas.num_metas,
+							meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (len) {
+		ret = ocfs2_reserve_clusters(osb, len, data_ac);
+		if (ret)
+			mlog_errno(ret);
+	}
+out:
+	if (ret) {
+		if (*meta_ac) {
+			ocfs2_free_alloc_context(*meta_ac);
+			meta_ac = NULL;
+		}
+	}
+
+	return ret;
+}
+
+static int ocfs2_reflink_xattr_buckets(handle_t *handle,
+				u64 blkno, u64 new_blkno, u32 clusters,
+				struct ocfs2_alloc_context *meta_ac,
+				struct ocfs2_alloc_context *data_ac,
+				struct ocfs2_reflink_xattr_tree_args *args)
+{
+	int i, j, ret = 0;
+	struct super_block *sb = args->reflink->old_inode->i_sb;
+	u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb));
+	u32 num_buckets = clusters * bpc;
+	int bpb = args->old_bucket->bu_blocks;
+	struct ocfs2_xattr_value_buf vb = {
+		.vb_access = ocfs2_journal_access,
+	};
+
+	for (i = 0; i < num_buckets; i++, blkno += bpb, new_blkno += bpb) {
+		ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		/*
+		 * The real bucket num in this series of blocks is stored
+		 * in the 1st bucket.
+		 */
+		if (i == 0)
+			num_buckets = le16_to_cpu(
+				bucket_xh(args->old_bucket)->xh_num_buckets);
+
+		ret = ocfs2_xattr_bucket_journal_access(handle,
+						args->new_bucket,
+						OCFS2_JOURNAL_ACCESS_CREATE);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		for (j = 0; j < bpb; j++)
+			memcpy(bucket_block(args->new_bucket, j),
+			       bucket_block(args->old_bucket, j),
+			       sb->s_blocksize);
+
+		ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
+
+		ret = ocfs2_reflink_xattr_header(handle, args->reflink,
+					args->old_bucket->bu_bhs[0],
+					bucket_xh(args->old_bucket),
+					args->new_bucket->bu_bhs[0],
+					bucket_xh(args->new_bucket),
+					&vb, meta_ac,
+					ocfs2_get_reflink_xattr_value_root,
+					args);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		/*
+		 * Re-access and dirty the bucket to calculate metaecc.
+		 * Because we may extend the transaction in reflink_xattr_header
+		 * which will let the already accessed block gone.
+		 */
+		ret = ocfs2_xattr_bucket_journal_access(handle,
+						args->new_bucket,
+						OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
+		ocfs2_xattr_bucket_relse(args->old_bucket);
+		ocfs2_xattr_bucket_relse(args->new_bucket);
+	}
+
+	ocfs2_xattr_bucket_relse(args->old_bucket);
+	ocfs2_xattr_bucket_relse(args->new_bucket);
+	return ret;
+}
+/*
+ * Create the same xattr extent record in the new inode's xattr tree.
+ */
+static int ocfs2_reflink_xattr_rec(struct inode *inode,
+				   struct buffer_head *root_bh,
+				   u64 blkno,
+				   u32 cpos,
+				   u32 len,
+				   void *para)
+{
+	int ret, credits = 0;
+	u32 p_cluster, num_clusters;
+	u64 new_blkno;
+	handle_t *handle;
+	struct ocfs2_reflink_xattr_tree_args *args =
+			(struct ocfs2_reflink_xattr_tree_args *)para;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_extent_tree et;
+
+	ocfs2_init_xattr_tree_extent_tree(&et,
+					  INODE_CACHE(args->reflink->new_inode),
+					  args->new_blk_bh);
+
+	ret = ocfs2_lock_reflink_xattr_rec_allocators(args, &et, blkno,
+						      len, &credits,
+						      &meta_ac, &data_ac);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_claim_clusters(osb, handle, data_ac,
+				   len, &p_cluster, &num_clusters);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	new_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cluster);
+
+	mlog(0, "reflink xattr buckets %llu to %llu, len %u\n",
+	     (unsigned long long)blkno, (unsigned long long)new_blkno, len);
+	ret = ocfs2_reflink_xattr_buckets(handle, blkno, new_blkno, len,
+					  meta_ac, data_ac, args);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	mlog(0, "insert new xattr extent rec start %llu len %u to %u\n",
+	     (unsigned long long)new_blkno, len, cpos);
+	ret = ocfs2_insert_extent(handle, &et, cpos, new_blkno,
+				  len, 0, meta_ac);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out:
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+	return ret;
+}
+
+/*
+ * Create reflinked xattr buckets.
+ * We will add bucket one by one, and refcount all the xattrs in the bucket
+ * if they are stored outside.
+ */
+static int ocfs2_reflink_xattr_tree(struct ocfs2_xattr_reflink *args,
+				    struct buffer_head *blk_bh,
+				    struct buffer_head *new_blk_bh)
+{
+	int ret;
+	struct ocfs2_reflink_xattr_tree_args para;
+
+	memset(&para, 0, sizeof(para));
+	para.reflink = args;
+	para.old_blk_bh = blk_bh;
+	para.new_blk_bh = new_blk_bh;
+
+	para.old_bucket = ocfs2_xattr_bucket_new(args->old_inode);
+	if (!para.old_bucket) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
+
+	para.new_bucket = ocfs2_xattr_bucket_new(args->new_inode);
+	if (!para.new_bucket) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_iterate_xattr_index_block(args->old_inode, blk_bh,
+					      ocfs2_reflink_xattr_rec,
+					      &para);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	ocfs2_xattr_bucket_free(para.old_bucket);
+	ocfs2_xattr_bucket_free(para.new_bucket);
+	return ret;
+}
+
+static int ocfs2_reflink_xattr_in_block(struct ocfs2_xattr_reflink *args,
+					struct buffer_head *blk_bh)
+{
+	int ret, indexed = 0;
+	struct buffer_head *new_blk_bh = NULL;
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)blk_bh->b_data;
+
+
+	if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)
+		indexed = 1;
+
+	ret = ocfs2_create_empty_xattr_block(args->new_inode, args->new_bh,
+					     &new_blk_bh, indexed);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED))
+		ret = ocfs2_reflink_xattr_block(args, blk_bh, new_blk_bh);
+	else
+		ret = ocfs2_reflink_xattr_tree(args, blk_bh, new_blk_bh);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	brelse(new_blk_bh);
+	return ret;
+}
+
+int ocfs2_reflink_xattrs(struct inode *old_inode,
+			 struct buffer_head *old_bh,
+			 struct inode *new_inode,
+			 struct buffer_head *new_bh)
+{
+	int ret;
+	struct ocfs2_xattr_reflink args;
+	struct ocfs2_inode_info *oi = OCFS2_I(old_inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)old_bh->b_data;
+	struct buffer_head *blk_bh = NULL;
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+	struct ocfs2_refcount_tree *ref_tree;
+	struct buffer_head *ref_root_bh = NULL;
+
+	ret = ocfs2_lock_refcount_tree(OCFS2_SB(old_inode->i_sb),
+				       le64_to_cpu(di->i_refcount_loc),
+				       1, &ref_tree, &ref_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
+	args.old_inode = old_inode;
+	args.new_inode = new_inode;
+	args.old_bh = old_bh;
+	args.new_bh = new_bh;
+	args.ref_ci = &ref_tree->rf_ci;
+	args.ref_root_bh = ref_root_bh;
+	args.dealloc = &dealloc;
+
+	if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
+		ret = ocfs2_reflink_xattr_inline(&args);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_unlock;
+		}
+	}
+
+	if (!di->i_xattr_loc)
+		goto out_unlock;
+
+	ret = ocfs2_read_xattr_block(old_inode, le64_to_cpu(di->i_xattr_loc),
+				     &blk_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	ret = ocfs2_reflink_xattr_in_block(&args, blk_bh);
+	if (ret)
+		mlog_errno(ret);
+
+	brelse(blk_bh);
+
+out_unlock:
+	ocfs2_unlock_refcount_tree(OCFS2_SB(old_inode->i_sb),
+				   ref_tree, 1);
+	brelse(ref_root_bh);
+
+	if (ocfs2_dealloc_has_cluster(&dealloc)) {
+		ocfs2_schedule_truncate_log_flush(OCFS2_SB(old_inode->i_sb), 1);
+		ocfs2_run_deallocs(OCFS2_SB(old_inode->i_sb), &dealloc);
+	}
+
+out:
+	return ret;
+}
+
+/*
  * 'security' attributes support
  */
 static size_t ocfs2_xattr_security_list(struct inode *inode, char *list,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index e74703f..4f91305 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -90,4 +90,8 @@ int ocfs2_xattr_attach_refcount_tree(struct inode *inode,
 				     struct ocfs2_caching_info *ref_ci,
 				     struct buffer_head *ref_root_bh,
 				     struct ocfs2_cached_dealloc_ctxt *dealloc);
+int ocfs2_reflink_xattrs(struct inode *old_inode,
+			 struct buffer_head *old_bh,
+			 struct inode *new_inode,
+			 struct buffer_head *new_bh);
 #endif /* OCFS2_XATTR_H */
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 33/41] ocfs2: Modify removing xattr process for refcount.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (31 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 32/41] ocfs2: Add reflink support for xattr Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 34/41] ocfs2: Don't merge in 1st refcount ops of reflink Tao Ma
                   ` (8 subsequent siblings)
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

The old xattr value remove is quite simple, it just erase the
tree and free the clusters. But as we have added refcount support,
The process is a little complicated.

We have to lock the refcount tree at the beginning, what's more,
we may split the refcount tree in some cases, so meta/credits are
needed.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/xattr.c |  190 +++++++++++++++++++++++++++++++++++++++++++----------
 1 files changed, 154 insertions(+), 36 deletions(-)

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index c32ef2c..4eef69b 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -199,6 +199,11 @@ static int ocfs2_prepare_refcount_xattr(struct inode *inode,
 					struct ocfs2_refcount_tree **ref_tree,
 					int *meta_need,
 					int *credits);
+static int ocfs2_get_xattr_tree_value_root(struct super_block *sb,
+					   struct ocfs2_xattr_bucket *bucket,
+					   int offset,
+					   struct ocfs2_xattr_value_root **xv,
+					   struct buffer_head **bh);
 
 static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
 {
@@ -1752,51 +1757,112 @@ out:
 	return ret;
 }
 
+/*
+ * In xattr remove, if it is stored outside and refcounted, we may have
+ * the chance to split the refcount tree. So need the allocators.
+ */
+static int ocfs2_lock_xattr_remove_allocators(struct inode *inode,
+					struct ocfs2_xattr_value_root *xv,
+					struct ocfs2_caching_info *ref_ci,
+					struct buffer_head *ref_root_bh,
+					struct ocfs2_alloc_context **meta_ac,
+					int *ref_credits)
+{
+	int ret, meta_add = 0;
+	u32 p_cluster, num_clusters;
+	unsigned int ext_flags;
+
+	*ref_credits = 0;
+	ret = ocfs2_xattr_get_clusters(inode, 0, &p_cluster,
+				       &num_clusters,
+				       &xv->xr_list,
+				       &ext_flags);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
+		goto out;
+
+	ret = ocfs2_refcounted_xattr_delete_need(inode, ref_ci,
+						 ref_root_bh, xv,
+						 &meta_add, ref_credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
+						meta_add, meta_ac);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	return ret;
+}
+
 static int ocfs2_remove_value_outside(struct inode*inode,
 				      struct ocfs2_xattr_value_buf *vb,
-				      struct ocfs2_xattr_header *header)
+				      struct ocfs2_xattr_header *header,
+				      struct ocfs2_caching_info *ref_ci,
+				      struct buffer_head *ref_root_bh)
 {
-	int ret = 0, i;
+	int ret = 0, i, ref_credits;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
+	void *val;
 
 	ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
 
-	ctxt.handle = ocfs2_start_trans(osb,
-					ocfs2_remove_extent_credits(osb->sb));
-	if (IS_ERR(ctxt.handle)) {
-		ret = PTR_ERR(ctxt.handle);
-		mlog_errno(ret);
-		goto out;
-	}
-
 	for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
 		struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
 
-		if (!ocfs2_xattr_is_local(entry)) {
-			void *val;
+		if (ocfs2_xattr_is_local(entry))
+			continue;
 
-			val = (void *)header +
-				le16_to_cpu(entry->xe_name_offset);
-			vb->vb_xv = (struct ocfs2_xattr_value_root *)
-				(val + OCFS2_XATTR_SIZE(entry->xe_name_len));
-			ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt);
-			if (ret < 0) {
-				mlog_errno(ret);
-				break;
-			}
+		val = (void *)header +
+			le16_to_cpu(entry->xe_name_offset);
+		vb->vb_xv = (struct ocfs2_xattr_value_root *)
+			(val + OCFS2_XATTR_SIZE(entry->xe_name_len));
+
+		ret = ocfs2_lock_xattr_remove_allocators(inode, vb->vb_xv,
+							 ref_ci, ref_root_bh,
+							 &ctxt.meta_ac,
+							 &ref_credits);
+
+		ctxt.handle = ocfs2_start_trans(osb, ref_credits +
+					ocfs2_remove_extent_credits(osb->sb));
+		if (IS_ERR(ctxt.handle)) {
+			ret = PTR_ERR(ctxt.handle);
+			mlog_errno(ret);
+			break;
+		}
+
+		ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt);
+		if (ret < 0) {
+			mlog_errno(ret);
+			break;
+		}
+
+		ocfs2_commit_trans(osb, ctxt.handle);
+		if (ctxt.meta_ac) {
+			ocfs2_free_alloc_context(ctxt.meta_ac);
+			ctxt.meta_ac = NULL;
 		}
 	}
 
-	ocfs2_commit_trans(osb, ctxt.handle);
+	if (ctxt.meta_ac)
+		ocfs2_free_alloc_context(ctxt.meta_ac);
 	ocfs2_schedule_truncate_log_flush(osb, 1);
 	ocfs2_run_deallocs(osb, &ctxt.dealloc);
-out:
 	return ret;
 }
 
 static int ocfs2_xattr_ibody_remove(struct inode *inode,
-				    struct buffer_head *di_bh)
+				    struct buffer_head *di_bh,
+				    struct ocfs2_caching_info *ref_ci,
+				    struct buffer_head *ref_root_bh)
 {
 
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
@@ -1811,13 +1877,21 @@ static int ocfs2_xattr_ibody_remove(struct inode *inode,
 		 ((void *)di + inode->i_sb->s_blocksize -
 		 le16_to_cpu(di->i_xattr_inline_size));
 
-	ret = ocfs2_remove_value_outside(inode, &vb, header);
+	ret = ocfs2_remove_value_outside(inode, &vb, header,
+					 ref_ci, ref_root_bh);
 
 	return ret;
 }
 
+struct ocfs2_rm_xattr_bucket_para {
+	struct ocfs2_caching_info *ref_ci;
+	struct buffer_head *ref_root_bh;
+};
+
 static int ocfs2_xattr_block_remove(struct inode *inode,
-				    struct buffer_head *blk_bh)
+				    struct buffer_head *blk_bh,
+				    struct ocfs2_caching_info *ref_ci,
+				    struct buffer_head *ref_root_bh)
 {
 	struct ocfs2_xattr_block *xb;
 	int ret = 0;
@@ -1825,22 +1899,29 @@ static int ocfs2_xattr_block_remove(struct inode *inode,
 		.vb_bh = blk_bh,
 		.vb_access = ocfs2_journal_access_xb,
 	};
+	struct ocfs2_rm_xattr_bucket_para args = {
+		.ref_ci = ref_ci,
+		.ref_root_bh = ref_root_bh,
+	};
 
 	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
 	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
 		struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header);
-		ret = ocfs2_remove_value_outside(inode, &vb, header);
+		ret = ocfs2_remove_value_outside(inode, &vb, header,
+						 ref_ci, ref_root_bh);
 	} else
 		ret = ocfs2_iterate_xattr_index_block(inode,
 						blk_bh,
 						ocfs2_rm_xattr_cluster,
-						NULL);
+						&args);
 
 	return ret;
 }
 
 static int ocfs2_xattr_free_block(struct inode *inode,
-				  u64 block)
+				  u64 block,
+				  struct ocfs2_caching_info *ref_ci,
+				  struct buffer_head *ref_root_bh)
 {
 	struct inode *xb_alloc_inode;
 	struct buffer_head *xb_alloc_bh = NULL;
@@ -1858,7 +1939,7 @@ static int ocfs2_xattr_free_block(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_xattr_block_remove(inode, blk_bh);
+	ret = ocfs2_xattr_block_remove(inode, blk_bh, ref_ci, ref_root_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -1918,6 +1999,9 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
 {
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_refcount_tree *ref_tree = NULL;
+	struct buffer_head *ref_root_bh = NULL;
+	struct ocfs2_caching_info *ref_ci = NULL;
 	handle_t *handle;
 	int ret;
 
@@ -1927,8 +2011,21 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
 	if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
 		return 0;
 
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
+		ret = ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb),
+					       le64_to_cpu(di->i_refcount_loc),
+					       1, &ref_tree, &ref_root_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+		ref_ci = &ref_tree->rf_ci;
+
+	}
+
 	if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
-		ret = ocfs2_xattr_ibody_remove(inode, di_bh);
+		ret = ocfs2_xattr_ibody_remove(inode, di_bh,
+					       ref_ci, ref_root_bh);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out;
@@ -1937,7 +2034,8 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
 
 	if (di->i_xattr_loc) {
 		ret = ocfs2_xattr_free_block(inode,
-					     le64_to_cpu(di->i_xattr_loc));
+					     le64_to_cpu(di->i_xattr_loc),
+					     ref_ci, ref_root_bh);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out;
@@ -1971,6 +2069,9 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
 out_commit:
 	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out:
+	if (ref_tree)
+		ocfs2_unlock_refcount_tree(OCFS2_SB(inode->i_sb), ref_tree, 1);
+	brelse(ref_root_bh);
 	return ret;
 }
 
@@ -4989,7 +5090,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
 	struct ocfs2_extent_tree et;
 
 	ret = ocfs2_iterate_xattr_buckets(inode, blkno, len,
-					  ocfs2_delete_xattr_in_bucket, NULL);
+					  ocfs2_delete_xattr_in_bucket, para);
 	if (ret) {
 		mlog_errno(ret);
 		return ret;
@@ -5378,7 +5479,7 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
 					struct ocfs2_xattr_bucket *bucket,
 					void *para)
 {
-	int ret = 0;
+	int ret = 0, ref_credits;
 	struct ocfs2_xattr_header *xh = bucket_xh(bucket);
 	u16 i;
 	struct ocfs2_xattr_entry *xe;
@@ -5386,7 +5487,9 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
 	struct ocfs2_xattr_set_ctxt ctxt = {NULL, NULL,};
 	int credits = ocfs2_remove_extent_credits(osb->sb) +
 		ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-
+	struct ocfs2_xattr_value_root *xv;
+	struct ocfs2_rm_xattr_bucket_para *args =
+			(struct ocfs2_rm_xattr_bucket_para *)para;
 
 	ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
 
@@ -5395,7 +5498,16 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
 		if (ocfs2_xattr_is_local(xe))
 			continue;
 
-		ctxt.handle = ocfs2_start_trans(osb, credits);
+		ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket,
+						      i, &xv, NULL);
+
+		ret = ocfs2_lock_xattr_remove_allocators(inode, xv,
+							 args->ref_ci,
+							 args->ref_root_bh,
+							 &ctxt.meta_ac,
+							 &ref_credits);
+
+		ctxt.handle = ocfs2_start_trans(osb, credits + ref_credits);
 		if (IS_ERR(ctxt.handle)) {
 			ret = PTR_ERR(ctxt.handle);
 			mlog_errno(ret);
@@ -5406,12 +5518,18 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
 							i, 0, &ctxt);
 
 		ocfs2_commit_trans(osb, ctxt.handle);
+		if (ctxt.meta_ac) {
+			ocfs2_free_alloc_context(ctxt.meta_ac);
+			ctxt.meta_ac = NULL;
+		}
 		if (ret) {
 			mlog_errno(ret);
 			break;
 		}
 	}
 
+	if (ctxt.meta_ac)
+		ocfs2_free_alloc_context(ctxt.meta_ac);
 	ocfs2_schedule_truncate_log_flush(osb, 1);
 	ocfs2_run_deallocs(osb, &ctxt.dealloc);
 	return ret;
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 34/41] ocfs2: Don't merge in 1st refcount ops of reflink.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (32 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 33/41] ocfs2: Modify removing xattr process for refcount Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 35/41] ocfs2: Make transaction extend more efficient Tao Ma
                   ` (7 subsequent siblings)
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

Actually the whole reflink will touch refcount tree 2 times:
1. It will add the clusters in the extent record to the tree if it
   isn't refcounted before.
2. It will add 1 refcount to these clusters when it add these
   extent records to the tree.

So actually we shouldn't do merge in the 1st operation since the 2nd
one will soon be called and we may have to split it again. Do a merge
first and split soon is a waste of time. So we only merge in the 2nd
round. This is done by adding a new internal __ocfs2_increase_refcount
and call it with "not-merge" for 1st refcount operation in reflink.

This also has a side-effect that we don't need to worry too much about
the metadata allocation in the 2nd round since it will only merge and
no split will happen for those records.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/refcounttree.c |   56 ++++++++++++++++++++++++++++++----------------
 1 files changed, 36 insertions(+), 20 deletions(-)

diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index b80fa18..4a16574 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -1145,7 +1145,7 @@ static void ocfs2_refcount_rec_merge(struct ocfs2_refcount_block *rb,
 static int ocfs2_change_refcount_rec(handle_t *handle,
 				     struct ocfs2_caching_info *ci,
 				     struct buffer_head *ref_leaf_bh,
-				     int index, int change)
+				     int index, int merge, int change)
 {
 	int ret;
 	struct ocfs2_refcount_block *rb =
@@ -1174,7 +1174,7 @@ static int ocfs2_change_refcount_rec(handle_t *handle,
 		}
 
 		le16_add_cpu(&rl->rl_used, -1);
-	} else
+	} else if (merge)
 		ocfs2_refcount_rec_merge(rb, index);
 
 	ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
@@ -1650,7 +1650,7 @@ static int ocfs2_insert_refcount_rec(handle_t *handle,
 				     struct buffer_head *ref_root_bh,
 				     struct buffer_head *ref_leaf_bh,
 				     struct ocfs2_refcount_rec *rec,
-				     int index,
+				     int index, int merge,
 				     struct ocfs2_alloc_context *meta_ac)
 {
 	int ret;
@@ -1708,7 +1708,8 @@ static int ocfs2_insert_refcount_rec(handle_t *handle,
 
 	le16_add_cpu(&rf_list->rl_used, 1);
 
-	ocfs2_refcount_rec_merge(rb, index);
+	if (merge)
+		ocfs2_refcount_rec_merge(rb, index);
 
 	ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
 	if (ret) {
@@ -1742,7 +1743,7 @@ static int ocfs2_split_refcount_rec(handle_t *handle,
 				    struct buffer_head *ref_root_bh,
 				    struct buffer_head *ref_leaf_bh,
 				    struct ocfs2_refcount_rec *split_rec,
-				    int index,
+				    int index, int merge,
 				    struct ocfs2_alloc_context *meta_ac,
 				    struct ocfs2_cached_dealloc_ctxt *dealloc)
 {
@@ -1880,7 +1881,8 @@ static int ocfs2_split_refcount_rec(handle_t *handle,
 		     le32_to_cpu(split_rec->r_refcount),
 		     (unsigned long long)ref_leaf_bh->b_blocknr, index);
 
-		ocfs2_refcount_rec_merge(rb, index);
+		if (merge)
+			ocfs2_refcount_rec_merge(rb, index);
 	}
 
 	ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
@@ -1892,12 +1894,12 @@ out:
 	return ret;
 }
 
-int ocfs2_increase_refcount(handle_t *handle,
-			    struct ocfs2_caching_info *ci,
-			    struct buffer_head *ref_root_bh,
-			    u64 cpos, u32 len,
-			    struct ocfs2_alloc_context *meta_ac,
-			    struct ocfs2_cached_dealloc_ctxt *dealloc)
+static int __ocfs2_increase_refcount(handle_t *handle,
+				     struct ocfs2_caching_info *ci,
+				     struct buffer_head *ref_root_bh,
+				     u64 cpos, u32 len, int merge,
+				     struct ocfs2_alloc_context *meta_ac,
+				     struct ocfs2_cached_dealloc_ctxt *dealloc)
 {
 	int ret = 0, index;
 	struct buffer_head *ref_leaf_bh = NULL;
@@ -1935,7 +1937,8 @@ int ocfs2_increase_refcount(handle_t *handle,
 			     "count %u\n", (unsigned long long)cpos, set_len,
 			     le32_to_cpu(rec.r_refcount));
 			ret = ocfs2_change_refcount_rec(handle, ci,
-							ref_leaf_bh, index, 1);
+							ref_leaf_bh, index,
+							merge, 1);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
@@ -1948,7 +1951,8 @@ int ocfs2_increase_refcount(handle_t *handle,
 			     set_len);
 			ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh,
 							ref_leaf_bh,
-							&rec, index, meta_ac);
+							&rec, index,
+							merge, meta_ac);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
@@ -1966,7 +1970,7 @@ int ocfs2_increase_refcount(handle_t *handle,
 			     set_len, le32_to_cpu(rec.r_refcount));
 			ret = ocfs2_split_refcount_rec(handle, ci,
 						       ref_root_bh, ref_leaf_bh,
-						       &rec, index,
+						       &rec, index, merge,
 						       meta_ac, dealloc);
 			if (ret) {
 				mlog_errno(ret);
@@ -2059,6 +2063,18 @@ out:
 	return ret;
 }
 
+int ocfs2_increase_refcount(handle_t *handle,
+			    struct ocfs2_caching_info *ci,
+			    struct buffer_head *ref_root_bh,
+			    u64 cpos, u32 len,
+			    struct ocfs2_alloc_context *meta_ac,
+			    struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	return __ocfs2_increase_refcount(handle, ci, ref_root_bh,
+					 cpos, len, 1,
+					 meta_ac, dealloc);
+}
+
 static int ocfs2_decrease_refcount_rec(handle_t *handle,
 				struct ocfs2_caching_info *ci,
 				struct buffer_head *ref_root_bh,
@@ -2079,7 +2095,7 @@ static int ocfs2_decrease_refcount_rec(handle_t *handle,
 	if (cpos == le64_to_cpu(rec->r_cpos) &&
 	    len == le32_to_cpu(rec->r_clusters))
 		ret = ocfs2_change_refcount_rec(handle, ci,
-						ref_leaf_bh, index, -1);
+						ref_leaf_bh, index, 1, -1);
 	else {
 		struct ocfs2_refcount_rec split = *rec;
 		split.r_cpos = cpu_to_le64(cpos);
@@ -2095,7 +2111,7 @@ static int ocfs2_decrease_refcount_rec(handle_t *handle,
 		     le32_to_cpu(rec->r_clusters));
 		ret = ocfs2_split_refcount_rec(handle, ci,
 					       ref_root_bh, ref_leaf_bh,
-					       &split, index,
+					       &split, index, 1,
 					       meta_ac, dealloc);
 	}
 
@@ -3488,9 +3504,9 @@ int ocfs2_add_refcount_flag(struct inode *inode,
 		goto out_commit;
 	}
 
-	ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
-				      p_cluster, num_clusters,
-				      meta_ac, dealloc);
+	ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
+					p_cluster, num_clusters, 0,
+					meta_ac, dealloc);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 35/41] ocfs2: Make transaction extend more efficient.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (33 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 34/41] ocfs2: Don't merge in 1st refcount ops of reflink Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 36/41] ocfs2: Use proper parameter for some inode operation Tao Ma
                   ` (6 subsequent siblings)
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

In ocfs2_extend_rotate_transaction, op_credits is the orignal
credits in the handle and we only want to extend the credits
for the rotation, but the old solution always double it. It
is harmless for some minor operations, but for actions like
reflink we may rotate tree many times and cause the credits
increase dramatically. So this patch try to only increase
the desired credits.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/alloc.c |   12 ++++++++++--
 1 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 7c879fc..38a42f5 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2326,10 +2326,18 @@ static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
 					   int op_credits,
 					   struct ocfs2_path *path)
 {
+	int ret;
 	int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
 
-	if (handle->h_buffer_credits < credits)
-		return ocfs2_extend_trans(handle, credits);
+	if (handle->h_buffer_credits < credits) {
+		ret = ocfs2_extend_trans(handle,
+					 credits - handle->h_buffer_credits);
+		if (ret)
+			return ret;
+
+		if (unlikely(handle->h_buffer_credits < credits))
+			return ocfs2_extend_trans(handle, credits);
+	}
 
 	return 0;
 }
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 36/41] ocfs2: Use proper parameter for some inode operation.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (34 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 35/41] ocfs2: Make transaction extend more efficient Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 37/41] ocfs2: Create reflinked file in orphan dir Tao Ma
                   ` (5 subsequent siblings)
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

In order to make the original function more suitable for reflink,
we modify the following inode operations. Both are tiny.

1. ocfs2_mknod_locked only use dentry for mlog, so move it to
   the caller so that reflink can use it without dentry.
2. ocfs2_prepare_orphan_dir only want inode to get its ip_blkno.
   So use ip_blkno instead.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/namei.c |   32 +++++++++++++++++---------------
 1 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index c07217a..818df58 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -69,7 +69,6 @@
 static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 			      struct inode *dir,
 			      struct inode *inode,
-			      struct dentry *dentry,
 			      dev_t dev,
 			      struct buffer_head **new_fe_bh,
 			      struct buffer_head *parent_fe_bh,
@@ -78,7 +77,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 
 static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
 				    struct inode **ret_orphan_dir,
-				    struct inode *inode,
+				    u64 blkno,
 				    char *name,
 				    struct ocfs2_dir_lookup_result *lookup);
 
@@ -358,8 +357,12 @@ static int ocfs2_mknod(struct inode *dir,
 	}
 	did_quota_inode = 1;
 
+	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
+		   inode->i_mode, (unsigned long)dev, dentry->d_name.len,
+		   dentry->d_name.name);
+
 	/* do the real work now. */
-	status = ocfs2_mknod_locked(osb, dir, inode, dentry, dev,
+	status = ocfs2_mknod_locked(osb, dir, inode, dev,
 				    &new_fe_bh, parent_fe_bh, handle,
 				    inode_ac);
 	if (status < 0) {
@@ -466,7 +469,6 @@ leave:
 static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 			      struct inode *dir,
 			      struct inode *inode,
-			      struct dentry *dentry,
 			      dev_t dev,
 			      struct buffer_head **new_fe_bh,
 			      struct buffer_head *parent_fe_bh,
@@ -480,10 +482,6 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	u16 suballoc_bit;
 	u16 feat;
 
-	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
-		   inode->i_mode, (unsigned long)dev, dentry->d_name.len,
-		   dentry->d_name.name);
-
 	*new_fe_bh = NULL;
 
 	status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh,
@@ -852,7 +850,8 @@ static int ocfs2_unlink(struct inode *dir,
 	}
 
 	if (inode_is_unlinkable(inode)) {
-		status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, inode,
+		status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
+						  OCFS2_I(inode)->ip_blkno,
 						  orphan_name, &orphan_insert);
 		if (status < 0) {
 			mlog_errno(status);
@@ -1243,9 +1242,8 @@ static int ocfs2_rename(struct inode *old_dir,
 
 		if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) {
 			status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
-							  new_inode,
-							  orphan_name,
-							  &orphan_insert);
+						OCFS2_I(new_inode)->ip_blkno,
+						orphan_name, &orphan_insert);
 			if (status < 0) {
 				mlog_errno(status);
 				goto bail;
@@ -1699,7 +1697,11 @@ static int ocfs2_symlink(struct inode *dir,
 	}
 	did_quota_inode = 1;
 
-	status = ocfs2_mknod_locked(osb, dir, inode, dentry,
+	mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry,
+		   inode->i_mode, dentry->d_name.len,
+		   dentry->d_name.name);
+
+	status = ocfs2_mknod_locked(osb, dir, inode,
 				    0, &new_fe_bh, parent_fe_bh, handle,
 				    inode_ac);
 	if (status < 0) {
@@ -1849,7 +1851,7 @@ bail:
 
 static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
 				    struct inode **ret_orphan_dir,
-				    struct inode *inode,
+				    u64 blkno,
 				    char *name,
 				    struct ocfs2_dir_lookup_result *lookup)
 {
@@ -1857,7 +1859,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
 	struct buffer_head *orphan_dir_bh = NULL;
 	int status = 0;
 
-	status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
+	status = ocfs2_blkno_stringify(blkno, name);
 	if (status < 0) {
 		mlog_errno(status);
 		return status;
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 37/41] ocfs2: Create reflinked file in orphan dir.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (35 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 36/41] ocfs2: Use proper parameter for some inode operation Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 38/41] ocfs2: Add preserve to reflink Tao Ma
                   ` (4 subsequent siblings)
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

reflink is a very complicated process, so it can't be integrated
into one transaction. So if the system panic in the operation, we
may leave a unfinished inode in the destication directory.

So we will try to create an inode in orphan_dir first, reflink it
to the src file and then move it to the destication file in the end.
In that way we won't be afraid of any corruption during the reflink.

This patch adds 2 functions for orphan_dir operation:
1. Create a new inode in orphand dir.
2. Move an inode to a target dir.

Note:
fsck.ocfs2 should work for us to remove the unfinished file in the
orphan_dir.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/namei.c |  268 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/namei.h |    6 +
 2 files changed, 274 insertions(+), 0 deletions(-)

diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 818df58..f010b22 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -2041,6 +2041,274 @@ leave:
 	return status;
 }
 
+int ocfs2_create_inode_in_orphan(struct inode *dir,
+				 int mode,
+				 struct inode **new_inode)
+{
+	int status, did_quota_inode = 0;
+	struct inode *inode = NULL;
+	struct inode *orphan_dir = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	struct ocfs2_dinode *di = NULL;
+	handle_t *handle = NULL;
+	char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
+	struct buffer_head *parent_di_bh = NULL;
+	struct buffer_head *new_di_bh = NULL;
+	struct ocfs2_alloc_context *inode_ac = NULL;
+	struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
+
+	status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		return status;
+	}
+
+	/*
+	 * We give the orphan dir the root blkno to fake an orphan name,
+	 * and allocate enough space for our insertion.
+	 */
+	status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
+					  osb->root_blkno,
+					  orphan_name, &orphan_insert);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* reserve an inode spot */
+	status = ocfs2_reserve_new_inode(osb, &inode_ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	inode = ocfs2_get_init_inode(dir, mode);
+	if (!inode) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb, 0, 0));
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* We don't use standard VFS wrapper because we don't want vfs_dq_init
+	 * to be called. */
+	if (sb_any_quota_active(osb->sb) &&
+	    osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
+		status = -EDQUOT;
+		goto leave;
+	}
+	did_quota_inode = 1;
+
+	/* do the real work now. */
+	status = ocfs2_mknod_locked(osb, dir, inode,
+				    0, &new_di_bh, parent_di_bh, handle,
+				    inode_ac);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, orphan_name);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	di = (struct ocfs2_dinode *)new_di_bh->b_data;
+	status = ocfs2_orphan_add(osb, handle, inode, di, orphan_name,
+				  &orphan_insert, orphan_dir);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* get open lock so that only nodes can't remove it from orphan dir. */
+	status = ocfs2_open_lock(inode);
+	if (status < 0)
+		mlog_errno(status);
+
+leave:
+	if (status < 0 && did_quota_inode)
+		vfs_dq_free_inode(inode);
+	if (handle)
+		ocfs2_commit_trans(osb, handle);
+
+	if (orphan_dir) {
+		/* This was locked for us in ocfs2_prepare_orphan_dir() */
+		ocfs2_inode_unlock(orphan_dir, 1);
+		mutex_unlock(&orphan_dir->i_mutex);
+		iput(orphan_dir);
+	}
+
+	if (status == -ENOSPC)
+		mlog(0, "Disk is full\n");
+
+	if ((status < 0) && inode) {
+		clear_nlink(inode);
+		iput(inode);
+	}
+
+	if (inode_ac)
+		ocfs2_free_alloc_context(inode_ac);
+
+	brelse(new_di_bh);
+
+	if (!status)
+		*new_inode = inode;
+
+	ocfs2_free_dir_lookup_result(&orphan_insert);
+
+	ocfs2_inode_unlock(dir, 1);
+	brelse(parent_di_bh);
+	return status;
+}
+
+int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
+				   struct inode *inode,
+				   struct dentry *dentry)
+{
+	int status = 0;
+	struct buffer_head *parent_di_bh = NULL;
+	handle_t *handle = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	struct ocfs2_dinode *dir_di, *di;
+	struct inode *orphan_dir_inode = NULL;
+	struct buffer_head *orphan_dir_bh = NULL;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
+
+	mlog_entry("(0x%p, 0x%p, %.*s')\n", dir, dentry,
+		   dentry->d_name.len, dentry->d_name.name);
+
+	status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		return status;
+	}
+
+	dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data;
+	if (!dir_di->i_links_count) {
+		/* can't make a file in a deleted directory. */
+		status = -ENOENT;
+		goto leave;
+	}
+
+	status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
+					   dentry->d_name.len);
+	if (status)
+		goto leave;
+
+	/* get a spot inside the dir. */
+	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh,
+					      dentry->d_name.name,
+					      dentry->d_name.len, &lookup);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+						       ORPHAN_DIR_SYSTEM_INODE,
+						       osb->slot_num);
+	if (!orphan_dir_inode) {
+		status = -EEXIST;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	mutex_lock(&orphan_dir_inode->i_mutex);
+
+	status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		mutex_unlock(&orphan_dir_inode->i_mutex);
+		iput(orphan_dir_inode);
+		goto leave;
+	}
+
+	status = ocfs2_read_inode_block(inode, &di_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto orphan_unlock;
+	}
+
+	handle = ocfs2_start_trans(osb, ocfs2_rename_credits(osb->sb));
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto orphan_unlock;
+	}
+
+	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
+					 di_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_commit;
+	}
+
+	status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
+				  orphan_dir_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_commit;
+	}
+
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+	le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL);
+	di->i_orphaned_slot = 0;
+	ocfs2_journal_dirty(handle, di_bh);
+
+	status = ocfs2_add_entry(handle, dentry, inode,
+				 OCFS2_I(inode)->ip_blkno, parent_di_bh,
+				 &lookup);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_commit;
+	}
+
+	status = ocfs2_dentry_attach_lock(dentry, inode,
+					  OCFS2_I(dir)->ip_blkno);
+	if (status) {
+		mlog_errno(status);
+		goto out_commit;
+	}
+
+	insert_inode_hash(inode);
+	dentry->d_op = &ocfs2_dentry_ops;
+	d_instantiate(dentry, inode);
+	status = 0;
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+orphan_unlock:
+	ocfs2_inode_unlock(orphan_dir_inode, 1);
+	mutex_unlock(&orphan_dir_inode->i_mutex);
+	iput(orphan_dir_inode);
+leave:
+
+	ocfs2_inode_unlock(dir, 1);
+
+	brelse(di_bh);
+	brelse(parent_di_bh);
+	brelse(orphan_dir_bh);
+
+	ocfs2_free_dir_lookup_result(&lookup);
+
+	mlog_exit(status);
+
+	return status;
+}
+
 const struct inode_operations ocfs2_dir_iops = {
 	.create		= ocfs2_create,
 	.lookup		= ocfs2_lookup,
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index 688aef6..e5d059d 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -35,5 +35,11 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
 		     struct inode *orphan_dir_inode,
 		     struct inode *inode,
 		     struct buffer_head *orphan_dir_bh);
+int ocfs2_create_inode_in_orphan(struct inode *dir,
+				 int mode,
+				 struct inode **new_inode);
+int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
+				   struct inode *new_inode,
+				   struct dentry *new_dentry);
 
 #endif /* OCFS2_NAMEI_H */
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 38/41] ocfs2: Add preserve to reflink.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (36 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 37/41] ocfs2: Create reflinked file in orphan dir Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 39/41] ocfs2: Implement ocfs2_reflink Tao Ma
                   ` (3 subsequent siblings)
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

reflink has 2 options for the destination file:
1. snapshot: reflink will attempt to preserve ownership, permissions,
   and all other security state in order to create a full snapshot.
2. new file: it will acquire the data extent sharing but will see the
   file's security state and attributes initialized as a new file.

So add the option to ocfs2.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/refcounttree.c |   38 ++++++++++--------
 fs/ocfs2/xattr.c        |   98 ++++++++++++++++++++++++++++++++++++++++++++--
 fs/ocfs2/xattr.h        |    5 ++-
 3 files changed, 119 insertions(+), 22 deletions(-)

diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 4a16574..05e7f18 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -3761,7 +3761,8 @@ out:
 static int ocfs2_complete_reflink(struct inode *s_inode,
 				  struct buffer_head *s_bh,
 				  struct inode *t_inode,
-				  struct buffer_head *t_bh)
+				  struct buffer_head *t_bh,
+				  bool preserve)
 {
 	int ret;
 	handle_t *handle;
@@ -3796,22 +3797,26 @@ static int ocfs2_complete_reflink(struct inode *s_inode,
 	di->i_size = s_di->i_size;
 	di->i_dyn_features = s_di->i_dyn_features;
 	di->i_attr = s_di->i_attr;
-	di->i_uid = s_di->i_uid;
-	di->i_gid = s_di->i_gid;
-	di->i_mode = s_di->i_mode;
 
-	/*
-	 * update time.
-	 * we want mtime to appear identical to the source and update ctime.
-	 */
-	t_inode->i_ctime = CURRENT_TIME;
+	if (preserve) {
+		di->i_uid = s_di->i_uid;
+		di->i_gid = s_di->i_gid;
+		di->i_mode = s_di->i_mode;
+
+		/*
+		 * update time.
+		 * we want mtime to appear identical to the source and
+		 * update ctime.
+		 */
+		t_inode->i_ctime = CURRENT_TIME;
 
-	di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec);
-	di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec);
+		di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec);
+		di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec);
 
-	t_inode->i_mtime = s_inode->i_mtime;
-	di->i_mtime = s_di->i_mtime;
-	di->i_mtime_nsec = s_di->i_mtime_nsec;
+		t_inode->i_mtime = s_inode->i_mtime;
+		di->i_mtime = s_di->i_mtime;
+		di->i_mtime_nsec = s_di->i_mtime_nsec;
+	}
 
 	ocfs2_journal_dirty(handle, t_bh);
 
@@ -3823,7 +3828,8 @@ out_commit:
 static int ocfs2_create_reflink_node(struct inode *s_inode,
 				     struct buffer_head *s_bh,
 				     struct inode *t_inode,
-				     struct buffer_head *t_bh)
+				     struct buffer_head *t_bh,
+				     bool preserve)
 {
 	int ret;
 	struct buffer_head *ref_root_bh = NULL;
@@ -3858,7 +3864,7 @@ static int ocfs2_create_reflink_node(struct inode *s_inode,
 		goto out_unlock_refcount;
 	}
 
-	ret = ocfs2_complete_reflink(s_inode, s_bh, t_inode, t_bh);
+	ret = ocfs2_complete_reflink(s_inode, s_bh, t_inode, t_bh, preserve);
 	if (ret)
 		mlog_errno(ret);
 
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 4eef69b..a3eed49 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -56,6 +56,7 @@
 #include "super.h"
 #include "xattr.h"
 #include "refcounttree.h"
+#include "acl.h"
 
 struct ocfs2_xattr_def_value_root {
 	struct ocfs2_xattr_value_root	xv;
@@ -204,6 +205,8 @@ static int ocfs2_get_xattr_tree_value_root(struct super_block *sb,
 					   int offset,
 					   struct ocfs2_xattr_value_root **xv,
 					   struct buffer_head **bh);
+static int ocfs2_xattr_security_set(struct inode *inode, const char *name,
+				    const void *value, size_t size, int flags);
 
 static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
 {
@@ -5994,6 +5997,7 @@ out:
 	return ret;
 }
 
+typedef int (should_xattr_reflinked)(struct ocfs2_xattr_entry *xe);
 /*
  * Store the information we need in xattr reflink.
  * old_bh and new_bh are inode bh for the old and new inode.
@@ -6006,6 +6010,7 @@ struct ocfs2_xattr_reflink {
 	struct ocfs2_caching_info *ref_ci;
 	struct buffer_head *ref_root_bh;
 	struct ocfs2_cached_dealloc_ctxt *dealloc;
+	should_xattr_reflinked *xattr_reflinked;
 };
 
 /*
@@ -6147,6 +6152,9 @@ out:
  * NOTE:
  * Before we call this function, the caller has memcpy the xattr in
  * old_xh to the new_xh.
+ *
+ * If args.xattr_reflinked is set, call it to decide whether the xe should
+ * be reflinked or not. If not, remove it from the new xattr header.
  */
 static int ocfs2_reflink_xattr_header(handle_t *handle,
 				      struct ocfs2_xattr_reflink *args,
@@ -6159,10 +6167,10 @@ static int ocfs2_reflink_xattr_header(handle_t *handle,
 				      get_xattr_value_root *func,
 				      void *para)
 {
-	int ret = 0, i;
+	int ret = 0, i, j;
 	struct super_block *sb = args->old_inode->i_sb;
 	struct buffer_head *value_bh;
-	struct ocfs2_xattr_entry *xe;
+	struct ocfs2_xattr_entry *xe, *last;
 	struct ocfs2_xattr_value_root *xv, *new_xv;
 	struct ocfs2_extent_tree data_et;
 	u32 clusters, cpos, p_cluster, num_clusters;
@@ -6170,9 +6178,30 @@ static int ocfs2_reflink_xattr_header(handle_t *handle,
 
 	mlog(0, "reflink xattr in container %llu, count = %u\n",
 	     (unsigned long long)old_bh->b_blocknr, le16_to_cpu(xh->xh_count));
-	for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+
+	last = &new_xh->xh_entries[le16_to_cpu(new_xh->xh_count)];
+	for (i = 0, j = 0; i < le16_to_cpu(xh->xh_count); i++, j++) {
 		xe = &xh->xh_entries[i];
 
+		if (args->xattr_reflinked && !args->xattr_reflinked(xe)) {
+			xe = &new_xh->xh_entries[j];
+
+			le16_add_cpu(&new_xh->xh_count, -1);
+			if (new_xh->xh_count) {
+				memmove(xe, xe + 1,
+					(void *)last - (void *)xe);
+				memset(last, 0,
+				       sizeof(struct ocfs2_xattr_entry));
+			}
+
+			/*
+			 * We don't want j to increase in the next round since
+			 * it is already moved ahead.
+			 */
+			j--;
+			continue;
+		}
+
 		if (ocfs2_xattr_is_local(xe))
 			continue;
 
@@ -6182,7 +6211,7 @@ static int ocfs2_reflink_xattr_header(handle_t *handle,
 			break;
 		}
 
-		ret = func(sb, new_bh, new_xh, i, &new_xv, &value_bh, para);
+		ret = func(sb, new_bh, new_xh, j, &new_xv, &value_bh, para);
 		if (ret) {
 			mlog_errno(ret);
 			break;
@@ -6847,10 +6876,20 @@ out:
 	return ret;
 }
 
+static int ocfs2_reflink_xattr_no_security(struct ocfs2_xattr_entry *xe)
+{
+	int type = ocfs2_xattr_get_type(xe);
+
+	return type != OCFS2_XATTR_INDEX_SECURITY &&
+	       type != OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS &&
+	       type != OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
+}
+
 int ocfs2_reflink_xattrs(struct inode *old_inode,
 			 struct buffer_head *old_bh,
 			 struct inode *new_inode,
-			 struct buffer_head *new_bh)
+			 struct buffer_head *new_bh,
+			 bool preserve_security)
 {
 	int ret;
 	struct ocfs2_xattr_reflink args;
@@ -6878,6 +6917,10 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
 	args.ref_ci = &ref_tree->rf_ci;
 	args.ref_root_bh = ref_root_bh;
 	args.dealloc = &dealloc;
+	if (preserve_security)
+		args.xattr_reflinked = NULL;
+	else
+		args.xattr_reflinked = ocfs2_reflink_xattr_no_security;
 
 	if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
 		ret = ocfs2_reflink_xattr_inline(&args);
@@ -6918,6 +6961,51 @@ out:
 }
 
 /*
+ * Initialize security and acl for a already created inode.
+ * Used for reflink a non-preserve-security file.
+ *
+ * It uses common api like ocfs2_xattr_set, so the caller
+ * must not hold any lock expect i_mutex.
+ */
+int ocfs2_init_security_and_acl(struct inode *dir,
+				struct inode *inode)
+{
+	int ret = 0;
+	struct buffer_head *dir_bh = NULL;
+	struct ocfs2_security_xattr_info si = {
+		.enable = 1,
+	};
+
+	ret = ocfs2_init_security_get(inode, dir, &si);
+	if (!ret) {
+		ret = ocfs2_xattr_security_set(inode, si.name,
+					       si.value, si.value_len,
+					       XATTR_CREATE);
+		if (ret) {
+			mlog_errno(ret);
+			goto leave;
+		}
+	} else if (ret != -EOPNOTSUPP) {
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	ret = ocfs2_inode_lock(dir, &dir_bh, 0);
+	if (ret) {
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL);
+	if (ret)
+		mlog_errno(ret);
+
+	ocfs2_inode_unlock(dir, 0);
+	brelse(dir_bh);
+leave:
+	return ret;
+}
+/*
  * 'security' attributes support
  */
 static size_t ocfs2_xattr_security_list(struct inode *inode, char *list,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 4f91305..08e3638 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -93,5 +93,8 @@ int ocfs2_xattr_attach_refcount_tree(struct inode *inode,
 int ocfs2_reflink_xattrs(struct inode *old_inode,
 			 struct buffer_head *old_bh,
 			 struct inode *new_inode,
-			 struct buffer_head *new_bh);
+			 struct buffer_head *new_bh,
+			 bool preserve_security);
+int ocfs2_init_security_and_acl(struct inode *dir,
+				struct inode *inode);
 #endif /* OCFS2_XATTR_H */
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 39/41] ocfs2: Implement ocfs2_reflink.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (37 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 38/41] ocfs2: Add preserve to reflink Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 40/41] ocfs2: Enable refcount tree support Tao Ma
                   ` (2 subsequent siblings)
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

Implement ocfs2_reflink.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/namei.c        |    2 +
 fs/ocfs2/refcounttree.c |  123 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/refcounttree.h |    2 +
 3 files changed, 127 insertions(+), 0 deletions(-)

diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index f010b22..589ecde 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -63,6 +63,7 @@
 #include "uptodate.h"
 #include "xattr.h"
 #include "acl.h"
+#include "refcounttree.h"
 
 #include "buffer_head_io.h"
 
@@ -2326,4 +2327,5 @@ const struct inode_operations ocfs2_dir_iops = {
 	.getxattr	= generic_getxattr,
 	.listxattr	= ocfs2_listxattr,
 	.removexattr	= generic_removexattr,
+	.reflink	= ocfs2_reflink,
 };
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 05e7f18..711d5b0 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -33,6 +33,7 @@
 #include "extent_map.h"
 #include "aops.h"
 #include "xattr.h"
+#include "namei.h"
 
 #include <linux/bio.h>
 #include <linux/blkdev.h>
@@ -3879,3 +3880,125 @@ out:
 
 	return ret;
 }
+
+static int __ocfs2_reflink(struct dentry *old_dentry,
+			   struct buffer_head *old_bh,
+			   struct inode *new_inode,
+			   bool preserve)
+{
+	int ret;
+	struct inode *inode = old_dentry->d_inode;
+	struct buffer_head *new_bh = NULL;
+
+	ret = filemap_fdatawrite(inode->i_mapping);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_attach_refcount_tree(inode, old_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	mutex_lock(&new_inode->i_mutex);
+	ret = ocfs2_inode_lock(new_inode, &new_bh, 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	ret = ocfs2_create_reflink_node(inode, old_bh,
+					new_inode, new_bh, preserve);
+	if (ret) {
+		mlog_errno(ret);
+		goto inode_unlock;
+	}
+
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
+		ret = ocfs2_reflink_xattrs(inode, old_bh,
+					   new_inode, new_bh,
+					   preserve);
+		if (ret)
+			mlog_errno(ret);
+	}
+inode_unlock:
+	ocfs2_inode_unlock(new_inode, 1);
+	brelse(new_bh);
+out_unlock:
+	mutex_unlock(&new_inode->i_mutex);
+out:
+	if (!ret) {
+		ret = filemap_fdatawait(inode->i_mapping);
+		if (ret)
+			mlog_errno(ret);
+	}
+	return ret;
+}
+
+int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
+		  struct dentry *new_dentry, bool preserve)
+{
+	int error;
+	struct inode *inode = old_dentry->d_inode;
+	struct buffer_head *old_bh = NULL;
+	struct inode *new_orphan_inode = NULL;
+
+	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
+		return -EOPNOTSUPP;
+
+	error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
+					     &new_orphan_inode);
+	if (error) {
+		mlog_errno(error);
+		goto out;
+	}
+
+	error = ocfs2_inode_lock(inode, &old_bh, 1);
+	if (error) {
+		mlog_errno(error);
+		goto out;
+	}
+
+	down_write(&OCFS2_I(inode)->ip_xattr_sem);
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+	error = __ocfs2_reflink(old_dentry, old_bh,
+				new_orphan_inode, preserve);
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+	up_write(&OCFS2_I(inode)->ip_xattr_sem);
+
+	ocfs2_inode_unlock(inode, 1);
+	brelse(old_bh);
+
+	if (error) {
+		mlog_errno(error);
+		goto out;
+	}
+
+	/* If the security isn't preserved, we need to re-initialize them. */
+	if (!preserve) {
+		error = ocfs2_init_security_and_acl(dir, new_orphan_inode);
+		if (error)
+			mlog_errno(error);
+	}
+out:
+	if (!error) {
+		error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
+						       new_dentry);
+		if (error)
+			mlog_errno(error);
+	}
+
+	if (new_orphan_inode) {
+		/*
+		 * We need to open_unlock the inode no matter whether we
+		 * succeed or not, so that other nodes can delete it later.
+		 */
+		ocfs2_open_unlock(new_orphan_inode);
+		if (error)
+			iput(new_orphan_inode);
+	}
+
+	return error;
+}
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 7b769b0..89d55ef 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -99,4 +99,6 @@ int ocfs2_increase_refcount(handle_t *handle,
 			    u64 cpos, u32 len,
 			    struct ocfs2_alloc_context *meta_ac,
 			    struct ocfs2_cached_dealloc_ctxt *dealloc);
+int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
+		  struct dentry *new_dentry, bool preserve);
 #endif /* OCFS2_REFCOUNTTREE_H */
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 40/41] ocfs2: Enable refcount tree support.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (38 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 39/41] ocfs2: Implement ocfs2_reflink Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 41/41] ocfs2: Add ioctl for reflink Tao Ma
  2009-08-21  1:24 ` [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Joel Becker
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/ocfs2_fs.h |    3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 40072cd..4a4565b 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -99,7 +99,8 @@
 					 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
 					 | OCFS2_FEATURE_INCOMPAT_XATTR \
 					 | OCFS2_FEATURE_INCOMPAT_META_ECC \
-					 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS)
+					 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
+					 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE)
 #define OCFS2_FEATURE_RO_COMPAT_SUPP	(OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
 					 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
 					 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 41/41] ocfs2: Add ioctl for reflink.
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (39 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 40/41] ocfs2: Enable refcount tree support Tao Ma
@ 2009-08-18  6:19 ` Tao Ma
  2009-08-21  1:24 ` [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Joel Becker
  41 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-18  6:19 UTC (permalink / raw)
  To: ocfs2-devel

The ioctl will take 3 parameters: old_path, new_path and
preserve and call vfs_reflink. It is useful when we backport
reflink features to old kernels.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/ioctl.c        |   14 ++++++
 fs/ocfs2/ocfs2_fs.h     |    9 ++++
 fs/ocfs2/refcounttree.c |  102 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/refcounttree.h |    4 ++
 4 files changed, 129 insertions(+), 0 deletions(-)

diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 9fcd36d..a68d0e4 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -22,6 +22,7 @@
 #include "ocfs2_fs.h"
 #include "ioctl.h"
 #include "resize.h"
+#include "refcounttree.h"
 
 #include <linux/ext2_fs.h>
 
@@ -116,6 +117,9 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	int status;
 	struct ocfs2_space_resv sr;
 	struct ocfs2_new_group_input input;
+	struct reflink_arguments args;
+	const char *old_path, *new_path;
+	bool preserve;
 
 	switch (cmd) {
 	case OCFS2_IOC_GETFLAGS:
@@ -161,6 +165,15 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			return -EFAULT;
 
 		return ocfs2_group_add(inode, &input);
+	case OCFS2_IOC_REFLINK:
+		if (copy_from_user(&args, (struct reflink_arguments *)arg,
+				   sizeof(args)))
+			return -EFAULT;
+		old_path = (const char *)(unsigned long)args.old_path;
+		new_path = (const char *)(unsigned long)args.new_path;
+		preserve = (args.preserve != 0);
+
+		return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve);
 	default:
 		return -ENOTTY;
 	}
@@ -183,6 +196,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	case OCFS2_IOC_GROUP_EXTEND:
 	case OCFS2_IOC_GROUP_ADD:
 	case OCFS2_IOC_GROUP_ADD64:
+	case OCFS2_IOC_REFLINK:
 		break;
 	default:
 		return -ENOIOCTLCMD;
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 4a4565b..e9431e4 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -301,6 +301,15 @@ struct ocfs2_new_group_input {
 #define OCFS2_IOC_GROUP_ADD	_IOW('o', 2,struct ocfs2_new_group_input)
 #define OCFS2_IOC_GROUP_ADD64	_IOW('o', 3,struct ocfs2_new_group_input)
 
+/* Used to pass 2 file names to reflink. */
+struct reflink_arguments {
+	__u64 old_path;
+	__u64 new_path;
+	__u64 preserve;
+};
+#define OCFS2_IOC_REFLINK	_IOW('o', 4, struct reflink_arguments)
+
+
 /*
  * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
  */
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 711d5b0..d59860d 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -42,6 +42,11 @@
 #include <linux/writeback.h>
 #include <linux/pagevec.h>
 #include <linux/swap.h>
+#include <linux/security.h>
+#include <linux/fsnotify.h>
+#include <linux/quotaops.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
 
 struct ocfs2_cow_context {
 	struct inode *inode;
@@ -4002,3 +4007,100 @@ out:
 
 	return error;
 }
+
+/*
+ * Below here are the bits used by OCFS2_IOC_REFLINK() to fake
+ * sys_reflink().  This will go away when vfs_reflink() exists in
+ * fs/namei.c.
+ */
+
+/* copied from may_create in VFS. */
+static inline int ocfs2_may_create(struct inode *dir, struct dentry *child)
+{
+	if (child->d_inode)
+		return -EEXIST;
+	if (IS_DEADDIR(dir))
+		return -ENOENT;
+	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+}
+
+/* copied from user_path_parent. */
+static int ocfs2_user_path_parent(const char __user *path,
+				  struct nameidata *nd, char **name)
+{
+	char *s = getname(path);
+	int error;
+
+	if (IS_ERR(s))
+		return PTR_ERR(s);
+
+	error = path_lookup(s, LOOKUP_PARENT, nd);
+	if (error)
+		putname(s);
+	else
+		*name = s;
+
+	return error;
+}
+
+/*
+ * Most codes are copied from sys_linkat.
+ */
+int ocfs2_reflink_ioctl(struct inode *inode,
+			const char __user *oldname,
+			const char __user *newname,
+			bool preserve)
+{
+	struct dentry *new_dentry;
+	struct nameidata nd;
+	struct path old_path;
+	int error;
+	char *to = NULL;
+
+	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
+		return -EOPNOTSUPP;
+
+	error = user_path_at(AT_FDCWD, oldname, 0, &old_path);
+	if (error) {
+		mlog_errno(error);
+		return error;
+	}
+
+	error = ocfs2_user_path_parent(newname, &nd, &to);
+	if (error) {
+		mlog_errno(error);
+		goto out;
+	}
+
+	error = -EXDEV;
+	if (old_path.mnt != nd.path.mnt)
+		goto out_release;
+	new_dentry = lookup_create(&nd, 0);
+	error = PTR_ERR(new_dentry);
+	if (IS_ERR(new_dentry)) {
+		mlog_errno(error);
+		goto out_unlock;
+	}
+
+	error = mnt_want_write(nd.path.mnt);
+	if (error) {
+		mlog_errno(error);
+		goto out_dput;
+	}
+
+	error = vfs_reflink(old_path.dentry,
+			    nd.path.dentry->d_inode,
+			    new_dentry, preserve);
+	mnt_drop_write(nd.path.mnt);
+out_dput:
+	dput(new_dentry);
+out_unlock:
+	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+out_release:
+	path_put(&nd.path);
+	putname(to);
+out:
+	path_put(&old_path);
+
+	return error;
+}
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 89d55ef..4bdf45a 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -101,4 +101,8 @@ int ocfs2_increase_refcount(handle_t *handle,
 			    struct ocfs2_cached_dealloc_ctxt *dealloc);
 int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
 		  struct dentry *new_dentry, bool preserve);
+int ocfs2_reflink_ioctl(struct inode *inode,
+			const char __user *oldname,
+			const char __user *newname,
+			bool preserve);
 #endif /* OCFS2_REFCOUNTTREE_H */
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 07/41] ocfs2: Add refcount tree lock mechanism.
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 07/41] ocfs2: Add refcount tree lock mechanism Tao Ma
@ 2009-08-19 23:25   ` Joel Becker
  0 siblings, 0 replies; 75+ messages in thread
From: Joel Becker @ 2009-08-19 23:25 UTC (permalink / raw)
  To: ocfs2-devel

On Tue, Aug 18, 2009 at 02:19:08PM +0800, Tao Ma wrote:
> Create a tree lock named ocfs2_refcount_tree. It protects
> all the read/write operation to the refcount tree. Also
> it has its only caching_info so that we can protect our
> own buffer_head among multiple nodes.

How about:
Implement locking around struct ocfs2_refcount_tree.  This protects all
read/write operations on refcount trees.  ocfs2_refcount_tree has its
own lock and its own caching_info, protecting buffers among multiple
nodes.

> This tree is only indicated by the root refcount block number.
> So create a rb-tree for it and store the root in ocfs2_super.

How about:
ocfs2_refcount_trees are referenced by the block number of the refcount
tree root block, So we create an rb-tree on the ocfs2_super to look them
up.

> +void ocfs2_purge_refcount_tree(struct ocfs2_super *osb)

	Let's call it ocfs2_purge_refcount_trees() in the plural.

Joel

-- 

"I am working for the time when unqualified blacks, browns, and
 women join the unqualified men in running our overnment."
	- Sissy Farenthold

Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127

^ permalink raw reply	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 08/41] ocfs2: Basic tree root operation.
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 08/41] ocfs2: Basic tree root operation Tao Ma
@ 2009-08-19 23:30   ` Joel Becker
  0 siblings, 0 replies; 75+ messages in thread
From: Joel Becker @ 2009-08-19 23:30 UTC (permalink / raw)
  To: ocfs2-devel

On Tue, Aug 18, 2009 at 02:19:09PM +0800, Tao Ma wrote:
> +	spin_lock(&osb->osb_lock);
> +	tree = ocfs2_find_refcount_tree(osb, first_blkno);
> +
> +	/*
> +	 * The tree has been deleted and recreated. So remove the old
> +	 * tree in this node.
> +	 */

How about this:
	/*
	 * We've just created a new refcount tree in this block.  If
	 * we found a refcount tree on the ocfs2_super, it must be
	 * one we just deleted.  We free the old tree before
	 * inserting the new tree.
	 */
> +	BUG_ON(tree && tree->rf_generation == new_tree->rf_generation);
> +	if (tree)
> +		ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
> +	ocfs2_insert_refcount_tree(osb, new_tree);
> +	spin_unlock(&osb->osb_lock);

-- 

"Time is an illusion, lunchtime doubly so."
        -Douglas Adams

Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127

^ permalink raw reply	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 17/41] ocfs2: Add CoW support.
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 17/41] ocfs2: Add CoW support Tao Ma
@ 2009-08-21  0:59   ` Joel Becker
  2009-08-21  2:04     ` Tao Ma
  0 siblings, 1 reply; 75+ messages in thread
From: Joel Becker @ 2009-08-21  0:59 UTC (permalink / raw)
  To: ocfs2-devel

On Tue, Aug 18, 2009 at 02:19:18PM +0800, Tao Ma wrote:
> +		if (*cow_len + leaf_clusters >= max_clusters) {
> +			if (*cow_len == 0) {
> +				/*
> +				 * cpos is in a very large extent record.
> +				 * So just split max_clusters from the
> +				 * extent record.
> +				 */
> +				if ((rec_end - cpos) <= max_clusters) {
> +					/*
> +					 * We can take max_clusters off
> +					 * the end and cover all of our
> +					 * write.
> +					 */
> +					*cow_start = rec_end - max_clusters;

	I just had a thought.  What if leaf_clusters is more than
max_clusters, but our cpos is near the end of the extent?  This only
works for the first pass (*cow_len == 0).  Imagine we have an extent of
1.5MB, and our cpos is at 1MB, and we want to write 1MB.  Clustersize of
256K.

  |<-- rec->e_cpo ==0                   leaf_clusters == 6 -->|
  |--<256>--|--<256>--|--<256>--|--<256>--|--<256>--|--<256>--|
                                          `cpos

When we get here, *cow_len is 0, *cow_start is 0 (rec->e_cpos), rec_end
is 6 (leaf_clusters), cpos is 4 because the user wants to write at 1MB
into the file, max_clusters is 4 (1MB) and write_len is 4 (1MB write).

1. *cow_len + leaf_clusters >= max_clusters
   0        + 6             >= 4  -> TRUE

2. (rec_end - cpos) <= max_clusters
   6        - 4     <= 4  -> TRUE

3. *cow_start = rec_end - max_clusters
              = 6       - 4  -> 2

4. *cow_len = max_clusters
            = 4

	This leaves us with a *cow_start of 2 and a *cow_len of 4.  This
*correctly gets us the last MB of the extent.  That's good.
However, our write was 1MB from cpos 4.  We're returning to the caller
1MB from cpos 2.  We're going to do a short write.

> +				} else if ((*cow_start + max_clusters) >
> +					   (cpos + write_len)) {

	Should this be >=?  I think it should be, and I think it's my
fault.  But check to make sure.

> +					/*
> +					 * We can take max_clusters off
> +					 * the front and cover all of
> +					 * our write.
> +					 */
> +					/* NOOP, *cow_start is already set */
> +				} else {
> +					/*
> +					 * We're CoWing more data than
> +					 * write_len for contiguousness,
> +					 * but it doesn't fit at the
> +					 * front or end of this extent.
> +					 * Let's try to slice the extent
> +					 * up nicely.  Optimally, our
> +					 * CoW region starts at a
> +					 * multiple of max_clusters.  If
> +					 * that doesn't fit, we give up
> +					 * and just CoW at cpos.
> +					 */
> +					*cow_start +=
> +						(cpos - *cow_start) &
> +							~(max_clusters - 1);
> +					if ((*cow_start + max_clusters) <
> +					    (cpos + write_len))
> +						*cow_start = cpos;
> +				}
> +			}
> +			*cow_len = max_clusters;
> +			break;
> +		} else
> +			*cow_len += leaf_clusters;
> +
> +		/*
> +		 * If we reach the end of the extent block and don't get enough
> +		 * clusters, continue with the next extent block if possible.
> +		 */
> +		if (i + 1 == le16_to_cpu(el->l_next_free_rec) &&
> +		    eb && eb->h_next_leaf_blk) {
> +			brelse(eb_bh);
> +			eb_bh = NULL;
> +
> +			ret = ocfs2_read_extent_block(INODE_CACHE(inode),
> +					       le64_to_cpu(eb->h_next_leaf_blk),
> +					       &eb_bh);
> +			if (ret) {
> +				mlog_errno(ret);
> +				goto out;
> +			}
> +
> +			eb = (struct ocfs2_extent_block *) eb_bh->b_data;
> +			el = &eb->h_list;
> +			i = -1;
> +		}
> +	}
> +
> +out:
> +	brelse(eb_bh);
> +	return ret;
> +}

Joel

-- 

Life's Little Instruction Book #335

	"Every so often, push your luck."

Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127

^ permalink raw reply	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 19/41] ocfs2: Integrate CoW in file write.
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 19/41] ocfs2: Integrate CoW in file write Tao Ma
@ 2009-08-21  1:04   ` Joel Becker
  2009-08-21  2:12     ` Tao Ma
  2009-08-21 21:12   ` Joel Becker
  1 sibling, 1 reply; 75+ messages in thread
From: Joel Becker @ 2009-08-21  1:04 UTC (permalink / raw)
  To: ocfs2-devel

On Tue, Aug 18, 2009 at 02:19:20PM +0800, Tao Ma wrote:
> @@ -1674,6 +1692,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
>  	struct ocfs2_alloc_context *meta_ac = NULL;
>  	handle_t *handle;
>  	struct ocfs2_extent_tree et;
> +	unsigned int refcounted_cpos, cow_len;
>  
>  	ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
>  	if (ret) {
> @@ -1701,12 +1720,36 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
>  	}
>  
>  	ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
> -					&extents_to_split);
> -	if (ret) {
> +					&extents_to_split, &refcounted_cpos);
> +	if (ret && ret != -ETXTBSY) {
>  		mlog_errno(ret);
>  		goto out;
>  	}
>  
> +	if (ret == -ETXTBSY) {
> +		BUG_ON(refcounted_cpos == UINT_MAX);
> +		cow_len = wc->w_clen - (refcounted_cpos - wc->w_cpos);
> +
> +		ret = ocfs2_refcount_cow(inode, di_bh,
> +					 refcounted_cpos, cow_len);
> +		if (ret) {
> +			mlog_errno(ret);
> +			goto out;
> +		}
> +
> +		/* reinitialize write_desc and populate it again. */
> +		ocfs2_clear_write_desc(wc);
> +		ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
> +						&extents_to_split,
> +						&refcounted_cpos);
> +		if (ret) {
> +			mlog_errno(ret);
> +			goto out;
> +		}
> +
> +		BUG_ON(refcounted_cpos != UINT_MAX);
> +	}

	First see my comment on the ocfs2_refcount_cal_clusters() and a
possible short CoW.
	Basically, ocfs2_refcount_cow() can do a short CoW in two
situations.  First, the one I described about
ocfs2_refcount_cal_clusters().  The second is if the write wants to
cover three extents, the middle of which is not refcounted:

  |--refcounted--|--not refcounted--]--refcounted--|

ocfs2_refcount_cal_cow_clusters() will be called on the first extent.
It will set up *cow_len to cover that first extent, but then it will hit
the second extent and break out.  ocfs2_refcount_cow() will then replace
the first extent.  We end up with:

  |--not refcounted--|--not refcounted--]--refcounted--|


	Now we call to ocfs2_prepare_write_desc() again.  It will see
the first and second extents as not refcounted, but then it will hit
the third extent and return -ETXTBSY.  refcounted_cpos will be set to
the third extent.  Then in ocfs2_write_begin_nolock(), we'll hit:

		BUG_ON(refcounted_cpos != UINT_MAX);

Am I missing something?

Joel

-- 

"Where are my angels?
 Where's my golden one?
 And where is my hope
 Now that my heroes are gone?"

Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127

^ permalink raw reply	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 27/41] ocfs2: Abstract the creation of xattr block.
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 27/41] ocfs2: Abstract the creation of xattr block Tao Ma
@ 2009-08-21  1:22   ` Joel Becker
  0 siblings, 0 replies; 75+ messages in thread
From: Joel Becker @ 2009-08-21  1:22 UTC (permalink / raw)
  To: ocfs2-devel

On Tue, Aug 18, 2009 at 02:19:28PM +0800, Tao Ma wrote:
> In xattr reflink, we also need to create xattr block, so
> abstract the process out.

	Haha, I did this in the xa_loc changes too :-)  But yours will
go in first and I'll just drop mine.

Joel

-- 

"Vote early and vote often." 
        - Al Capone

Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127

^ permalink raw reply	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4
  2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
                   ` (40 preceding siblings ...)
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 41/41] ocfs2: Add ioctl for reflink Tao Ma
@ 2009-08-21  1:24 ` Joel Becker
  2009-08-21  1:39   ` Tao Ma
  2009-08-24 23:11   ` TaoMa
  41 siblings, 2 replies; 75+ messages in thread
From: Joel Becker @ 2009-08-21  1:24 UTC (permalink / raw)
  To: ocfs2-devel

On Tue, Aug 18, 2009 at 02:19:11PM +0800, Tao Ma wrote:
> Hi all,
>     So I have finally finished the v4 of reflink for ocfs2. Great
> thanks for Joel's review.

	Ok, I'm done with the review.  I basically skimmed or skipped
all the patches I already liked.  You didn't change anything outside of
responding to reviews of v3, right?
	We're in great shape.  Hopefully you'll be able to answer my
questions and comments tomorrow and we can get this sucker into
merge-window Monday or Tuesday!

Joel

-- 

"Only a life lived for others is a life worth while."
							-Albert Einstein  

Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127

^ permalink raw reply	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4
  2009-08-21  1:24 ` [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Joel Becker
@ 2009-08-21  1:39   ` Tao Ma
  2009-08-24 23:11   ` TaoMa
  1 sibling, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-21  1:39 UTC (permalink / raw)
  To: ocfs2-devel



Joel Becker wrote:
> On Tue, Aug 18, 2009 at 02:19:11PM +0800, Tao Ma wrote:
>> Hi all,
>>     So I have finally finished the v4 of reflink for ocfs2. Great
>> thanks for Joel's review.
> 
> 	Ok, I'm done with the review.  I basically skimmed or skipped
> all the patches I already liked.  You didn't change anything outside of
> responding to reviews of v3, right?
sure, I don't change other things except your review advice. again, 
thanks for the review.
> 	We're in great shape.  Hopefully you'll be able to answer my
> questions and comments tomorrow and we can get this sucker into
> merge-window Monday or Tuesday!
I am now reading your comments. Hopefully we can get it in ASAP.

Regards,
Tao

^ permalink raw reply	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 17/41] ocfs2: Add CoW support.
  2009-08-21  0:59   ` Joel Becker
@ 2009-08-21  2:04     ` Tao Ma
  2009-08-21  2:51       ` Joel Becker
  0 siblings, 1 reply; 75+ messages in thread
From: Tao Ma @ 2009-08-21  2:04 UTC (permalink / raw)
  To: ocfs2-devel



Joel Becker wrote:
> On Tue, Aug 18, 2009 at 02:19:18PM +0800, Tao Ma wrote:
>> +		if (*cow_len + leaf_clusters >= max_clusters) {
>> +			if (*cow_len == 0) {
>> +				/*
>> +				 * cpos is in a very large extent record.
>> +				 * So just split max_clusters from the
>> +				 * extent record.
>> +				 */
>> +				if ((rec_end - cpos) <= max_clusters) {
>> +					/*
>> +					 * We can take max_clusters off
>> +					 * the end and cover all of our
>> +					 * write.
>> +					 */
>> +					*cow_start = rec_end - max_clusters;
> 
> 	I just had a thought.  What if leaf_clusters is more than
> max_clusters, but our cpos is near the end of the extent?  This only
> works for the first pass (*cow_len == 0).  Imagine we have an extent of
> 1.5MB, and our cpos is at 1MB, and we want to write 1MB.  Clustersize of
> 256K.
> 
>   |<-- rec->e_cpo ==0                   leaf_clusters == 6 -->|
>   |--<256>--|--<256>--|--<256>--|--<256>--|--<256>--|--<256>--|
>                                           `cpos
> 
> When we get here, *cow_len is 0, *cow_start is 0 (rec->e_cpos), rec_end
> is 6 (leaf_clusters), cpos is 4 because the user wants to write at 1MB
> into the file, max_clusters is 4 (1MB) and write_len is 4 (1MB write).
> 
> 1. *cow_len + leaf_clusters >= max_clusters
>    0        + 6             >= 4  -> TRUE
> 
> 2. (rec_end - cpos) <= max_clusters
>    6        - 4     <= 4  -> TRUE
> 
> 3. *cow_start = rec_end - max_clusters
>               = 6       - 4  -> 2
> 
> 4. *cow_len = max_clusters
>             = 4
> 
> 	This leaves us with a *cow_start of 2 and a *cow_len of 4.  This
> *correctly gets us the last MB of the extent.  That's good.
> However, our write was 1MB from cpos 4.  We're returning to the caller
> 1MB from cpos 2.  We're going to do a short write.
oh, yes, this is really a bug. I guess the reason why tristan's test 
case can't find it is that we are now called from ocfs2_write_begin. 
Normally the write_len will at most be a PAGE_SIZE and the start is 
aligned to PAGE_SIZE also. With our x86_64 boxes, PAGE_SIZE=4096, we 
always CoW the right pos. But with PAGE_SIZE=64k, it will be exposed.
So how about change the first check to:
			if (((rec_end - cpos) <= max_clusters) &&
			    (cpos + write_len <= rec_end)) {

> 
>> +				} else if ((*cow_start + max_clusters) >
>> +					   (cpos + write_len)) {
> 
> 	Should this be >=?  I think it should be, and I think it's my
> fault.  But check to make sure.
yeah, ">=" will be more accurate. Actually, with ">", we can survive in 
a less efficient way since the "else" will cover this case and CoW a 
different part(start from another pos).

Regards,
Tao
> 
>> +					/*
>> +					 * We can take max_clusters off
>> +					 * the front and cover all of
>> +					 * our write.
>> +					 */
>> +					/* NOOP, *cow_start is already set */
>> +				} else {
>> +					/*
>> +					 * We're CoWing more data than
>> +					 * write_len for contiguousness,
>> +					 * but it doesn't fit at the
>> +					 * front or end of this extent.
>> +					 * Let's try to slice the extent
>> +					 * up nicely.  Optimally, our
>> +					 * CoW region starts at a
>> +					 * multiple of max_clusters.  If
>> +					 * that doesn't fit, we give up
>> +					 * and just CoW at cpos.
>> +					 */
>> +					*cow_start +=
>> +						(cpos - *cow_start) &
>> +							~(max_clusters - 1);
>> +					if ((*cow_start + max_clusters) <
>> +					    (cpos + write_len))
>> +						*cow_start = cpos;
>> +				}
>> +			}
>> +			*cow_len = max_clusters;
>> +			break;
>> +		} else
>> +			*cow_len += leaf_clusters;
>> +
>> +		/*
>> +		 * If we reach the end of the extent block and don't get enough
>> +		 * clusters, continue with the next extent block if possible.
>> +		 */
>> +		if (i + 1 == le16_to_cpu(el->l_next_free_rec) &&
>> +		    eb && eb->h_next_leaf_blk) {
>> +			brelse(eb_bh);
>> +			eb_bh = NULL;
>> +
>> +			ret = ocfs2_read_extent_block(INODE_CACHE(inode),
>> +					       le64_to_cpu(eb->h_next_leaf_blk),
>> +					       &eb_bh);
>> +			if (ret) {
>> +				mlog_errno(ret);
>> +				goto out;
>> +			}
>> +
>> +			eb = (struct ocfs2_extent_block *) eb_bh->b_data;
>> +			el = &eb->h_list;
>> +			i = -1;
>> +		}
>> +	}
>> +
>> +out:
>> +	brelse(eb_bh);
>> +	return ret;
>> +}
> 
> Joel
> 

^ permalink raw reply	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 19/41] ocfs2: Integrate CoW in file write.
  2009-08-21  1:04   ` Joel Becker
@ 2009-08-21  2:12     ` Tao Ma
  2009-08-21 14:55       ` Tao Ma
  0 siblings, 1 reply; 75+ messages in thread
From: Tao Ma @ 2009-08-21  2:12 UTC (permalink / raw)
  To: ocfs2-devel

Hi Joel,

Joel Becker wrote:
> On Tue, Aug 18, 2009 at 02:19:20PM +0800, Tao Ma wrote:
>> @@ -1674,6 +1692,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
>>  	struct ocfs2_alloc_context *meta_ac = NULL;
>>  	handle_t *handle;
>>  	struct ocfs2_extent_tree et;
>> +	unsigned int refcounted_cpos, cow_len;
>>  
>>  	ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
>>  	if (ret) {
>> @@ -1701,12 +1720,36 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
>>  	}
>>  
>>  	ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
>> -					&extents_to_split);
>> -	if (ret) {
>> +					&extents_to_split, &refcounted_cpos);
>> +	if (ret && ret != -ETXTBSY) {
>>  		mlog_errno(ret);
>>  		goto out;
>>  	}
>>  
>> +	if (ret == -ETXTBSY) {
>> +		BUG_ON(refcounted_cpos == UINT_MAX);
>> +		cow_len = wc->w_clen - (refcounted_cpos - wc->w_cpos);
>> +
>> +		ret = ocfs2_refcount_cow(inode, di_bh,
>> +					 refcounted_cpos, cow_len);
>> +		if (ret) {
>> +			mlog_errno(ret);
>> +			goto out;
>> +		}
>> +
>> +		/* reinitialize write_desc and populate it again. */
>> +		ocfs2_clear_write_desc(wc);
>> +		ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
>> +						&extents_to_split,
>> +						&refcounted_cpos);
>> +		if (ret) {
>> +			mlog_errno(ret);
>> +			goto out;
>> +		}
>> +
>> +		BUG_ON(refcounted_cpos != UINT_MAX);
>> +	}
> 
> 	First see my comment on the ocfs2_refcount_cal_clusters() and a
> possible short CoW.
> 	Basically, ocfs2_refcount_cow() can do a short CoW in two
> situations.  First, the one I described about
> ocfs2_refcount_cal_clusters().  The second is if the write wants to
> cover three extents, the middle of which is not refcounted:
> 
>   |--refcounted--|--not refcounted--]--refcounted--|
> 
> ocfs2_refcount_cal_cow_clusters() will be called on the first extent.
> It will set up *cow_len to cover that first extent, but then it will hit
> the second extent and break out.  ocfs2_refcount_cow() will then replace
> the first extent.  We end up with:
> 
>   |--not refcounted--|--not refcounted--]--refcounted--|
> 
> 
> 	Now we call to ocfs2_prepare_write_desc() again.  It will see
> the first and second extents as not refcounted, but then it will hit
> the third extent and return -ETXTBSY.  refcounted_cpos will be set to
> the third extent.  Then in ocfs2_write_begin_nolock(), we'll hit:
> 
> 		BUG_ON(refcounted_cpos != UINT_MAX);
> 
> Am I missing something?
you are right.

ocfs2_write_begin normally only write one page and with my x86_64 box, 
we have PAGE_SIZE <= CLUSTER_SIZE, so we never hit this issue before.
So maybe I have to add a new function which will iterate from pos to 
pos+len to make sure all the refcounted extents are CoWed. Thanks.

Regards,
Tao

^ permalink raw reply	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 17/41] ocfs2: Add CoW support.
  2009-08-21  2:04     ` Tao Ma
@ 2009-08-21  2:51       ` Joel Becker
  2009-08-21  3:04         ` Tao Ma
  2009-08-21  3:55         ` Joel Becker
  0 siblings, 2 replies; 75+ messages in thread
From: Joel Becker @ 2009-08-21  2:51 UTC (permalink / raw)
  To: ocfs2-devel

On Fri, Aug 21, 2009 at 10:04:18AM +0800, Tao Ma wrote:
> Joel Becker wrote:
> > On Tue, Aug 18, 2009 at 02:19:18PM +0800, Tao Ma wrote:
> >> +		if (*cow_len + leaf_clusters >= max_clusters) {
> >> +			if (*cow_len == 0) {
> >> +				/*
> >> +				 * cpos is in a very large extent record.
> >> +				 * So just split max_clusters from the
> >> +				 * extent record.
> >> +				 */
> >> +				if ((rec_end - cpos) <= max_clusters) {
> >> +					/*
> >> +					 * We can take max_clusters off
> >> +					 * the end and cover all of our
> >> +					 * write.
> >> +					 */
> >> +					*cow_start = rec_end - max_clusters;
> oh, yes, this is really a bug. I guess the reason why tristan's test 
> case can't find it is that we are now called from ocfs2_write_begin. 
> Normally the write_len will at most be a PAGE_SIZE and the start is 
> aligned to PAGE_SIZE also. With our x86_64 boxes, PAGE_SIZE=4096, we 
> always CoW the right pos. But with PAGE_SIZE=64k, it will be exposed.
> So how about change the first check to:
> 			if (((rec_end - cpos) <= max_clusters) &&
> 			    (cpos + write_len <= rec_end)) {

	This doesn't quite work either, because we'll then fall to the
else{, which is for ranges in the middle of a big cluster.
	Essentially, the problem is one of max_clusters not seeing the
extra stuff we've done on the front of the I/O.  And I think that comes
down to the confusion of max_clusters vs write_len.  We want to CoW at
least cpos+write_len, but we want large extents broken on the 1MB
boundary. 
	We have another problem.  What if we have two refcounted
extents each of 1MB in size.  We're doing a 1.5MB starting@the
beginning of the first extent.  First we get 1MB from the first extent.
We go back around the loop to the second extent.  We see that *cow_len +
leaf_clusters (1MB + 1MB) is greater than the 1.5MB we're trying to
write.  So we set *cow_len to our 1.5MB.  But this is going to break the
second extent up into two .5MB extents.  What we really want is to cow
that entire second 1MB extent.

> >> +				} else if ((*cow_start + max_clusters) >
> >> +					   (cpos + write_len)) {
> > 
> > 	Should this be >=?  I think it should be, and I think it's my
> > fault.  But check to make sure.
> yeah, ">=" will be more accurate. Actually, with ">", we can survive in 
> a less efficient way since the "else" will cover this case and CoW a 
> different part(start from another pos).

	Let's do the >=, which is best.

	I'm halfway through a modification of this code that splits out
MAX_COW_BYTES from write_len.  Let me finish it tomorrow.

Joel

-- 

"Conservative, n.  A statesman who is enamoured of existing evils,
 as distinguished from the Liberal, who wishes to replace them
 with others."
	- Ambrose Bierce, The Devil's Dictionary

Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127

^ permalink raw reply	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 17/41] ocfs2: Add CoW support.
  2009-08-21  2:51       ` Joel Becker
@ 2009-08-21  3:04         ` Tao Ma
  2009-08-21  7:10           ` Joel Becker
  2009-08-21  3:55         ` Joel Becker
  1 sibling, 1 reply; 75+ messages in thread
From: Tao Ma @ 2009-08-21  3:04 UTC (permalink / raw)
  To: ocfs2-devel



Joel Becker wrote:
> On Fri, Aug 21, 2009 at 10:04:18AM +0800, Tao Ma wrote:
>> Joel Becker wrote:
>>> On Tue, Aug 18, 2009 at 02:19:18PM +0800, Tao Ma wrote:
>>>> +		if (*cow_len + leaf_clusters >= max_clusters) {
>>>> +			if (*cow_len == 0) {
>>>> +				/*
>>>> +				 * cpos is in a very large extent record.
>>>> +				 * So just split max_clusters from the
>>>> +				 * extent record.
>>>> +				 */
>>>> +				if ((rec_end - cpos) <= max_clusters) {
>>>> +					/*
>>>> +					 * We can take max_clusters off
>>>> +					 * the end and cover all of our
>>>> +					 * write.
>>>> +					 */
>>>> +					*cow_start = rec_end - max_clusters;
>> oh, yes, this is really a bug. I guess the reason why tristan's test 
>> case can't find it is that we are now called from ocfs2_write_begin. 
>> Normally the write_len will at most be a PAGE_SIZE and the start is 
>> aligned to PAGE_SIZE also. With our x86_64 boxes, PAGE_SIZE=4096, we 
>> always CoW the right pos. But with PAGE_SIZE=64k, it will be exposed.
>> So how about change the first check to:
>> 			if (((rec_end - cpos) <= max_clusters) &&
>> 			    (cpos + write_len <= rec_end)) {
> 
> 	This doesn't quite work either, because we'll then fall to the
> else{, which is for ranges in the middle of a big cluster.
> 	Essentially, the problem is one of max_clusters not seeing the
> extra stuff we've done on the front of the I/O.  And I think that comes
> down to the confusion of max_clusters vs write_len.  We want to CoW at
> least cpos+write_len, but we want large extents broken on the 1MB
> boundary. 
> 	We have another problem.  What if we have two refcounted
> extents each of 1MB in size.  We're doing a 1.5MB starting at the
> beginning of the first extent.  First we get 1MB from the first extent.
> We go back around the loop to the second extent.  We see that *cow_len +
> leaf_clusters (1MB + 1MB) is greater than the 1.5MB we're trying to
> write.  So we set *cow_len to our 1.5MB.  But this is going to break the
> second extent up into two .5MB extents.  What we really want is to cow
> that entire second 1MB extent.
oh, actually I drafted up this code in the hypothesis that write_len <= 
max_clusters, so if we want to write 1.5MB, I am afraid we need to 
rethink it carefully.
> 
>>>> +				} else if ((*cow_start + max_clusters) >
>>>> +					   (cpos + write_len)) {
>>> 	Should this be >=?  I think it should be, and I think it's my
>>> fault.  But check to make sure.
>> yeah, ">=" will be more accurate. Actually, with ">", we can survive in 
>> a less efficient way since the "else" will cover this case and CoW a 
>> different part(start from another pos).
> 
> 	Let's do the >=, which is best.
ok.
> 
> 	I'm halfway through a modification of this code that splits out
> MAX_COW_BYTES from write_len.  Let me finish it tomorrow.
really? that would be cool. and then my hypothesis mentioned above will 
not be a problem.

Regards,
Tao

^ permalink raw reply	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 17/41] ocfs2: Add CoW support.
  2009-08-21  2:51       ` Joel Becker
  2009-08-21  3:04         ` Tao Ma
@ 2009-08-21  3:55         ` Joel Becker
  2009-08-21  6:25           ` Tao Ma
  1 sibling, 1 reply; 75+ messages in thread
From: Joel Becker @ 2009-08-21  3:55 UTC (permalink / raw)
  To: ocfs2-devel

On Thu, Aug 20, 2009 at 07:51:36PM -0700, Joel Becker wrote:
> 	I'm halfway through a modification of this code that splits out
> MAX_COW_BYTES from write_len.  Let me finish it tomorrow.

	I just did it.  What do you think?

Joel

diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index d59860d..7790e1d 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2499,7 +2499,7 @@ out:
 	return ret;
 }
 
-#define	MAX_COW_BYTES	1048576
+#define	MAX_CONTIG_BYTES	1048576
 /*
  * Calculate out the start and number of virtual clusters we need to to CoW.
  *
@@ -2508,9 +2508,8 @@ out:
  * max_cpos is the place where we want to stop CoW intentionally.
  *
  * Normal we will start CoW from the beginning of extent record cotaining cpos.
- * And We will try to Cow as much clusters as we can until we reach
- * MAX_COW_BYTES. If the write_len is larger than MAX_COW_BYTES, we will
- * use that value as the maximum clusters.
+ * We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we
+ * get good I/O from the resulting extent tree.
  */
 static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
 					   struct ocfs2_extent_list *el,
@@ -2525,10 +2524,11 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
 	struct buffer_head *eb_bh = NULL;
 	struct ocfs2_extent_block *eb = NULL;
 	struct ocfs2_extent_rec *rec;
-	int max_clusters = ocfs2_clusters_for_bytes(inode->i_sb, MAX_COW_BYTES);
+	int want_clusters;
+	int contig_clusters =
+		ocfs2_clusters_for_bytes(inode->i_sb, MAX_CONTIG_BYTES);
 	int leaf_clusters, rec_end = 0;
 
-	max_clusters = max_clusters < write_len ? write_len : max_clusters;
 	if (tree_height > 0) {
 		ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh);
 		if (ret) {
@@ -2587,53 +2587,84 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
 			leaf_clusters = rec_end - le32_to_cpu(rec->e_cpos);
 		}
 
-		if (*cow_len + leaf_clusters >= max_clusters) {
-			if (*cow_len == 0) {
-				/*
-				 * cpos is in a very large extent record.
-				 * So just split max_clusters from the
-				 * extent record.
-				 */
-				if ((rec_end - cpos) <= max_clusters) {
-					/*
-					 * We can take max_clusters off
-					 * the end and cover all of our
-					 * write.
-					 */
-					*cow_start = rec_end - max_clusters;
-				} else if ((*cow_start + max_clusters) >
-					   (cpos + write_len)) {
-					/*
-					 * We can take max_clusters off
-					 * the front and cover all of
-					 * our write.
-					 */
-					/* NOOP, *cow_start is already set */
-				} else {
-					/*
-					 * We're CoWing more data than
-					 * write_len for contiguousness,
-					 * but it doesn't fit at the
-					 * front or end of this extent.
-					 * Let's try to slice the extent
-					 * up nicely.  Optimally, our
-					 * CoW region starts at a
-					 * multiple of max_clusters.  If
-					 * that doesn't fit, we give up
-					 * and just CoW at cpos.
-					 */
-					*cow_start +=
-						(cpos - *cow_start) &
-							~(max_clusters - 1);
-					if ((*cow_start + max_clusters) <
-					    (cpos + write_len))
-						*cow_start = cpos;
-				}
-			}
-			*cow_len = max_clusters;
-			break;
-		} else
+		/*
+		 * How many clusters do we actually need from
+		 * this extent?  First we see how many we actually
+		 * need to complete the write.  If that's smaller
+		 * than contig_clusters, we try for
+		 * contig_clustes.
+		 */
+		if (!*cow_len)
+			want_clusters = write_len;
+		else
+			want_clusters = (cpos + write_len) -
+				(*cow_start + *cow_len);
+		if (want_clusters < contig_clusters)
+			want_clusters = contig_clusters;
+
+		/*
+		 * If the write does not cover the whole extent, we
+		 * need to calculate how we're going to split the extent.
+		 * We try to do it on contig_clusters boundaries.
+		 *
+		 * Any extent smaller than contig_clusters will be
+		 * CoWed in its entirety.
+		 */
+		if (leaf_clusters < contig_clusters)
 			*cow_len += leaf_clusters;
+		else if (*cow_len || (*cow_start == cpos)) {
+			/*
+			 * This extent needs to be CoW'd from its
+			 * beginning, so all we have to do is compute
+			 * how many clusters to grab.
+			 */
+			if (leaf_clusters < want_clusters)
+				*cow_len += leaf_clusters;
+			else
+				*cow_len += want_clusters;
+		} else if ((*cow_start + contig_clusters) >
+			   (cpos + write_len)) {
+			/*
+			 * Breaking off contig_clusters at the front
+			 * of the extent will cover our write.  That's
+			 * easy.
+			 */
+			*cow_len = contig_clusters;
+		} else if ((rec_end - cpos) <= contig_clusters) {
+			/*
+			 * Breaking off contig_clusters at the tail of
+			 * this extent will cover cpos.
+			 */
+			*cow_start = rec_end - cpos;
+			*cow_len = contig_clusters;
+		} else if ((rec_end - cpos) <= want_clusters) {
+			/*
+			 * While we can't fit the entire write in this
+			 * extent, we know that the write goes from cpos
+			 * to the end of the extent.  Break that off.
+			 */
+			*cow_start = cpos;
+			*cow_len = rec_end - cpos;
+		} else {
+			/*
+			 * Ok, the entire write lives in the middle of
+			 * this extent.
+			 * Let's try to slice the extentup nicely.
+			 * Optimally, our CoW region starts at a
+			 * multiple of contig_clusters.  If that doesn't
+			 * fit, we give up and just CoW@cpos.
+			 */
+			*cow_start += (cpos - *cow_start) &
+				~(contig_clusters - 1);
+			if ((*cow_start + want_clusters) <
+			    (cpos + write_len))
+				*cow_start = cpos;
+			*cow_len = want_clusters;
+		}
+
+		/* Have we covered our entire write yet? */
+		if ((*cow_start + *cow_len) >= (cpos + write_len))
+			break;
 
 		/*
 		 * If we reach the end of the extent block and don't get enough


-- 

Life's Little Instruction Book #450

	"Don't be afraid to say, 'I need help.'"

Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 17/41] ocfs2: Add CoW support.
  2009-08-21  3:55         ` Joel Becker
@ 2009-08-21  6:25           ` Tao Ma
  2009-08-21  7:07             ` Joel Becker
  0 siblings, 1 reply; 75+ messages in thread
From: Tao Ma @ 2009-08-21  6:25 UTC (permalink / raw)
  To: ocfs2-devel



Joel Becker wrote:
> On Thu, Aug 20, 2009 at 07:51:36PM -0700, Joel Becker wrote:
>> 	I'm halfway through a modification of this code that splits out
>> MAX_COW_BYTES from write_len.  Let me finish it tomorrow.
> 
> 	I just did it.  What do you think?
> 
> Joel
> 
> diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
> index d59860d..7790e1d 100644
> --- a/fs/ocfs2/refcounttree.c
> +++ b/fs/ocfs2/refcounttree.c
> @@ -2499,7 +2499,7 @@ out:
>  	return ret;
>  }
>  
> -#define	MAX_COW_BYTES	1048576
> +#define	MAX_CONTIG_BYTES	1048576
>  /*
>   * Calculate out the start and number of virtual clusters we need to to CoW.
>   *
> @@ -2508,9 +2508,8 @@ out:
>   * max_cpos is the place where we want to stop CoW intentionally.
>   *
>   * Normal we will start CoW from the beginning of extent record cotaining cpos.
> - * And We will try to Cow as much clusters as we can until we reach
> - * MAX_COW_BYTES. If the write_len is larger than MAX_COW_BYTES, we will
> - * use that value as the maximum clusters.
> + * We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we
> + * get good I/O from the resulting extent tree.
>   */
>  static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
>  					   struct ocfs2_extent_list *el,
> @@ -2525,10 +2524,11 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
>  	struct buffer_head *eb_bh = NULL;
>  	struct ocfs2_extent_block *eb = NULL;
>  	struct ocfs2_extent_rec *rec;
> -	int max_clusters = ocfs2_clusters_for_bytes(inode->i_sb, MAX_COW_BYTES);
> +	int want_clusters;
> +	int contig_clusters =
> +		ocfs2_clusters_for_bytes(inode->i_sb, MAX_CONTIG_BYTES);
>  	int leaf_clusters, rec_end = 0;
>  
> -	max_clusters = max_clusters < write_len ? write_len : max_clusters;
>  	if (tree_height > 0) {
>  		ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh);
>  		if (ret) {
> @@ -2587,53 +2587,84 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
>  			leaf_clusters = rec_end - le32_to_cpu(rec->e_cpos);
>  		}
>  
> -		if (*cow_len + leaf_clusters >= max_clusters) {
> -			if (*cow_len == 0) {
> -				/*
> -				 * cpos is in a very large extent record.
> -				 * So just split max_clusters from the
> -				 * extent record.
> -				 */
> -				if ((rec_end - cpos) <= max_clusters) {
> -					/*
> -					 * We can take max_clusters off
> -					 * the end and cover all of our
> -					 * write.
> -					 */
> -					*cow_start = rec_end - max_clusters;
> -				} else if ((*cow_start + max_clusters) >
> -					   (cpos + write_len)) {
> -					/*
> -					 * We can take max_clusters off
> -					 * the front and cover all of
> -					 * our write.
> -					 */
> -					/* NOOP, *cow_start is already set */
> -				} else {
> -					/*
> -					 * We're CoWing more data than
> -					 * write_len for contiguousness,
> -					 * but it doesn't fit at the
> -					 * front or end of this extent.
> -					 * Let's try to slice the extent
> -					 * up nicely.  Optimally, our
> -					 * CoW region starts at a
> -					 * multiple of max_clusters.  If
> -					 * that doesn't fit, we give up
> -					 * and just CoW at cpos.
> -					 */
> -					*cow_start +=
> -						(cpos - *cow_start) &
> -							~(max_clusters - 1);
> -					if ((*cow_start + max_clusters) <
> -					    (cpos + write_len))
> -						*cow_start = cpos;
> -				}
> -			}
> -			*cow_len = max_clusters;
> -			break;
> -		} else
> +		/*
> +		 * How many clusters do we actually need from
> +		 * this extent?  First we see how many we actually
> +		 * need to complete the write.  If that's smaller
> +		 * than contig_clusters, we try for
> +		 * contig_clustes.
> +		 */
> +		if (!*cow_len)
> +			want_clusters = write_len;
> +		else
> +			want_clusters = (cpos + write_len) -
> +				(*cow_start + *cow_len);
> +		if (want_clusters < contig_clusters)
> +			want_clusters = contig_clusters;
> +
> +		/*
> +		 * If the write does not cover the whole extent, we
> +		 * need to calculate how we're going to split the extent.
> +		 * We try to do it on contig_clusters boundaries.
> +		 *
> +		 * Any extent smaller than contig_clusters will be
> +		 * CoWed in its entirety.
> +		 */
> +		if (leaf_clusters < contig_clusters)
"<=" ?
>  			*cow_len += leaf_clusters;
> +		else if (*cow_len || (*cow_start == cpos)) {
> +			/*
> +			 * This extent needs to be CoW'd from its
> +			 * beginning, so all we have to do is compute
> +			 * how many clusters to grab.
> +			 */
> +			if (leaf_clusters < want_clusters)
> +				*cow_len += leaf_clusters;
> +			else
> +				*cow_len += want_clusters;
> +		} else if ((*cow_start + contig_clusters) >
> +			   (cpos + write_len)) {
">="?

otherwise, it looks good to me.
btw, you will create a separate patch for this or I should integrate it 
into my original one?

Regards,
Tao

> +			/*
> +			 * Breaking off contig_clusters at the front
> +			 * of the extent will cover our write.  That's
> +			 * easy.
> +			 */
> +			*cow_len = contig_clusters;
> +		} else if ((rec_end - cpos) <= contig_clusters) {
> +			/*
> +			 * Breaking off contig_clusters at the tail of
> +			 * this extent will cover cpos.
> +			 */
> +			*cow_start = rec_end - cpos;
> +			*cow_len = contig_clusters;
> +		} else if ((rec_end - cpos) <= want_clusters) {
> +			/*
> +			 * While we can't fit the entire write in this
> +			 * extent, we know that the write goes from cpos
> +			 * to the end of the extent.  Break that off.
> +			 */
> +			*cow_start = cpos;
> +			*cow_len = rec_end - cpos;
> +		} else {
> +			/*
> +			 * Ok, the entire write lives in the middle of
> +			 * this extent.
> +			 * Let's try to slice the extentup nicely.
> +			 * Optimally, our CoW region starts at a
> +			 * multiple of contig_clusters.  If that doesn't
> +			 * fit, we give up and just CoW at cpos.
> +			 */
> +			*cow_start += (cpos - *cow_start) &
> +				~(contig_clusters - 1);
> +			if ((*cow_start + want_clusters) <
> +			    (cpos + write_len))
> +				*cow_start = cpos;
> +			*cow_len = want_clusters;
> +		}
> +
> +		/* Have we covered our entire write yet? */
> +		if ((*cow_start + *cow_len) >= (cpos + write_len))
> +			break;
>  
>  		/*
>  		 * If we reach the end of the extent block and don't get enough
> 
> 

^ permalink raw reply	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 17/41] ocfs2: Add CoW support.
  2009-08-21  6:25           ` Tao Ma
@ 2009-08-21  7:07             ` Joel Becker
  2009-08-21  8:24               ` Tao Ma
  0 siblings, 1 reply; 75+ messages in thread
From: Joel Becker @ 2009-08-21  7:07 UTC (permalink / raw)
  To: ocfs2-devel

On Fri, Aug 21, 2009 at 02:25:35PM +0800, Tao Ma wrote:
> Joel Becker wrote:
> >+		if (leaf_clusters < contig_clusters)
> "<=" ?

	Of course.

> >+		} else if ((*cow_start + contig_clusters) >
> >+			   (cpos + write_len)) {
> ">="?

	Haha!  I started from my email where I pointed out the same
thing, and I screwed it up!

> otherwise, it looks good to me.
> btw, you will create a separate patch for this or I should integrate
> it into my original one?

	I'd say integrate it.  Here's the patch with the >= fixups:

diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index d59860d..89f2fc8 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2499,7 +2499,7 @@ out:
 	return ret;
 }
 
-#define	MAX_COW_BYTES	1048576
+#define	MAX_CONTIG_BYTES	1048576
 /*
  * Calculate out the start and number of virtual clusters we need to to CoW.
  *
@@ -2508,9 +2508,8 @@ out:
  * max_cpos is the place where we want to stop CoW intentionally.
  *
  * Normal we will start CoW from the beginning of extent record cotaining cpos.
- * And We will try to Cow as much clusters as we can until we reach
- * MAX_COW_BYTES. If the write_len is larger than MAX_COW_BYTES, we will
- * use that value as the maximum clusters.
+ * We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we
+ * get good I/O from the resulting extent tree.
  */
 static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
 					   struct ocfs2_extent_list *el,
@@ -2525,10 +2524,11 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
 	struct buffer_head *eb_bh = NULL;
 	struct ocfs2_extent_block *eb = NULL;
 	struct ocfs2_extent_rec *rec;
-	int max_clusters = ocfs2_clusters_for_bytes(inode->i_sb, MAX_COW_BYTES);
+	int want_clusters;
+	int contig_clusters =
+		ocfs2_clusters_for_bytes(inode->i_sb, MAX_CONTIG_BYTES);
 	int leaf_clusters, rec_end = 0;
 
-	max_clusters = max_clusters < write_len ? write_len : max_clusters;
 	if (tree_height > 0) {
 		ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh);
 		if (ret) {
@@ -2587,53 +2587,84 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
 			leaf_clusters = rec_end - le32_to_cpu(rec->e_cpos);
 		}
 
-		if (*cow_len + leaf_clusters >= max_clusters) {
-			if (*cow_len == 0) {
-				/*
-				 * cpos is in a very large extent record.
-				 * So just split max_clusters from the
-				 * extent record.
-				 */
-				if ((rec_end - cpos) <= max_clusters) {
-					/*
-					 * We can take max_clusters off
-					 * the end and cover all of our
-					 * write.
-					 */
-					*cow_start = rec_end - max_clusters;
-				} else if ((*cow_start + max_clusters) >
-					   (cpos + write_len)) {
-					/*
-					 * We can take max_clusters off
-					 * the front and cover all of
-					 * our write.
-					 */
-					/* NOOP, *cow_start is already set */
-				} else {
-					/*
-					 * We're CoWing more data than
-					 * write_len for contiguousness,
-					 * but it doesn't fit at the
-					 * front or end of this extent.
-					 * Let's try to slice the extent
-					 * up nicely.  Optimally, our
-					 * CoW region starts at a
-					 * multiple of max_clusters.  If
-					 * that doesn't fit, we give up
-					 * and just CoW at cpos.
-					 */
-					*cow_start +=
-						(cpos - *cow_start) &
-							~(max_clusters - 1);
-					if ((*cow_start + max_clusters) <
-					    (cpos + write_len))
-						*cow_start = cpos;
-				}
-			}
-			*cow_len = max_clusters;
-			break;
-		} else
+		/*
+		 * How many clusters do we actually need from
+		 * this extent?  First we see how many we actually
+		 * need to complete the write.  If that's smaller
+		 * than contig_clusters, we try for
+		 * contig_clustes.
+		 */
+		if (!*cow_len)
+			want_clusters = write_len;
+		else
+			want_clusters = (cpos + write_len) -
+				(*cow_start + *cow_len);
+		if (want_clusters < contig_clusters)
+			want_clusters = contig_clusters;
+
+		/*
+		 * If the write does not cover the whole extent, we
+		 * need to calculate how we're going to split the extent.
+		 * We try to do it on contig_clusters boundaries.
+		 *
+		 * Any extent smaller than contig_clusters will be
+		 * CoWed in its entirety.
+		 */
+		if (leaf_clusters <= contig_clusters)
 			*cow_len += leaf_clusters;
+		else if (*cow_len || (*cow_start == cpos)) {
+			/*
+			 * This extent needs to be CoW'd from its
+			 * beginning, so all we have to do is compute
+			 * how many clusters to grab.
+			 */
+			if (leaf_clusters < want_clusters)
+				*cow_len += leaf_clusters;
+			else
+				*cow_len += want_clusters;
+		} else if ((*cow_start + contig_clusters) >=
+			   (cpos + write_len)) {
+			/*
+			 * Breaking off contig_clusters at the front
+			 * of the extent will cover our write.  That's
+			 * easy.
+			 */
+			*cow_len = contig_clusters;
+		} else if ((rec_end - cpos) <= contig_clusters) {
+			/*
+			 * Breaking off contig_clusters at the tail of
+			 * this extent will cover cpos.
+			 */
+			*cow_start = rec_end - cpos;
+			*cow_len = contig_clusters;
+		} else if ((rec_end - cpos) <= want_clusters) {
+			/*
+			 * While we can't fit the entire write in this
+			 * extent, we know that the write goes from cpos
+			 * to the end of the extent.  Break that off.
+			 */
+			*cow_start = cpos;
+			*cow_len = rec_end - cpos;
+		} else {
+			/*
+			 * Ok, the entire write lives in the middle of
+			 * this extent.
+			 * Let's try to slice the extentup nicely.
+			 * Optimally, our CoW region starts at a
+			 * multiple of contig_clusters.  If that doesn't
+			 * fit, we give up and just CoW@cpos.
+			 */
+			*cow_start += (cpos - *cow_start) &
+				~(contig_clusters - 1);
+			if ((*cow_start + want_clusters) <
+			    (cpos + write_len))
+				*cow_start = cpos;
+			*cow_len = want_clusters;
+		}
+
+		/* Have we covered our entire write yet? */
+		if ((*cow_start + *cow_len) >= (cpos + write_len))
+			break;
 
 		/*
 		 * If we reach the end of the extent block and don't get enough
-- 

Life's Little Instruction Book #226

	"When someone hugs you, let them be the first to let go."

Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 17/41] ocfs2: Add CoW support.
  2009-08-21  3:04         ` Tao Ma
@ 2009-08-21  7:10           ` Joel Becker
  0 siblings, 0 replies; 75+ messages in thread
From: Joel Becker @ 2009-08-21  7:10 UTC (permalink / raw)
  To: ocfs2-devel

On Fri, Aug 21, 2009 at 11:04:11AM +0800, Tao Ma wrote:
> oh, actually I drafted up this code in the hypothesis that write_len
> <= max_clusters, so if we want to write 1.5MB, I am afraid we need
> to rethink it carefully.

	While the write_begin code generally will honor your idea, I
think it's better for ocfs2_refcount_cow() to be able to handle
anything.  Then we can use it wherever.  For example, really large
extended attributes.

Joel

-- 

"To announce that there must be no criticism of them president, or
 that we are to stand by the president, right or wrong, is not only
 unpatriotic and servile, but is morally treasonable to the American
 public."
	- Theodore Roosevelt

Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127

^ permalink raw reply	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 17/41] ocfs2: Add CoW support.
  2009-08-21  7:07             ` Joel Becker
@ 2009-08-21  8:24               ` Tao Ma
  2009-08-21 18:39                 ` Joel Becker
  0 siblings, 1 reply; 75+ messages in thread
From: Tao Ma @ 2009-08-21  8:24 UTC (permalink / raw)
  To: ocfs2-devel

Hi Joel,
	As our talk in irc, here is the updated one. Please review.

Regards,
Tao

diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index cfaf59e..e659dc5 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2499,7 +2499,7 @@ out:
 	return ret;
 }
 
-#define	MAX_COW_BYTES	1048576
+#define	MAX_CONTIG_BYTES	1048576
 /*
  * Calculate out the start and number of virtual clusters we need to to CoW.
  *
@@ -2508,9 +2508,8 @@ out:
  * max_cpos is the place where we want to stop CoW intentionally.
  *
  * Normal we will start CoW from the beginning of extent record cotaining cpos.
- * And We will try to Cow as much clusters as we can until we reach
- * MAX_COW_BYTES. If the write_len is larger than MAX_COW_BYTES, we will
- * use that value as the maximum clusters.
+ * We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we
+ * get good I/O from the resulting extent tree.
  */
 static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
 					   struct ocfs2_extent_list *el,
@@ -2525,10 +2524,11 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
 	struct buffer_head *eb_bh = NULL;
 	struct ocfs2_extent_block *eb = NULL;
 	struct ocfs2_extent_rec *rec;
-	int max_clusters = ocfs2_clusters_for_bytes(inode->i_sb, MAX_COW_BYTES);
+	int want_clusters;
+	int contig_clusters =
+		ocfs2_clusters_for_bytes(inode->i_sb, MAX_CONTIG_BYTES);
 	int leaf_clusters, rec_end = 0;
 
-	max_clusters = max_clusters < write_len ? write_len : max_clusters;
 	if (tree_height > 0) {
 		ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh);
 		if (ret) {
@@ -2588,53 +2588,95 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
 			leaf_clusters = rec_end - le32_to_cpu(rec->e_cpos);
 		}
 
-		if (*cow_len + leaf_clusters >= max_clusters) {
-			if (*cow_len == 0) {
-				/*
-				 * cpos is in a very large extent record.
-				 * So just split max_clusters from the
-				 * extent record.
-				 */
-				if ((rec_end - cpos) <= max_clusters) {
-					/*
-					 * We can take max_clusters off
-					 * the end and cover all of our
-					 * write.
-					 */
-					*cow_start = rec_end - max_clusters;
-				} else if ((*cow_start + max_clusters) >
-					   (cpos + write_len)) {
-					/*
-					 * We can take max_clusters off
-					 * the front and cover all of
-					 * our write.
-					 */
-					/* NOOP, *cow_start is already set */
-				} else {
-					/*
-					 * We're CoWing more data than
-					 * write_len for contiguousness,
-					 * but it doesn't fit at the
-					 * front or end of this extent.
-					 * Let's try to slice the extent
-					 * up nicely.  Optimally, our
-					 * CoW region starts at a
-					 * multiple of max_clusters.  If
-					 * that doesn't fit, we give up
-					 * and just CoW at cpos.
-					 */
-					*cow_start +=
-						(cpos - *cow_start) &
-							~(max_clusters - 1);
-					if ((*cow_start + max_clusters) <
-					    (cpos + write_len))
-						*cow_start = cpos;
-				}
-			}
-			*cow_len = max_clusters;
-			break;
-		} else
+		/*
+		 * How many clusters do we actually need from
+		 * this extent?  First we see how many we actually
+		 * need to complete the write.  If that's smaller
+		 * than contig_clusters, we try for
+		 * contig_clustes.
+		 */
+		if (!*cow_len)
+			want_clusters = write_len;
+		else
+			want_clusters = (cpos + write_len) -
+				(*cow_start + *cow_len);
+		if (want_clusters < contig_clusters)
+			want_clusters = contig_clusters;
+
+		/*
+		 * If the write does not cover the whole extent, we
+		 * need to calculate how we're going to split the extent.
+		 * We try to do it on contig_clusters boundaries.
+		 *
+		 * Any extent smaller than contig_clusters will be
+		 * CoWed in its entirety.
+		 */
+		if (leaf_clusters <= contig_clusters)
 			*cow_len += leaf_clusters;
+		else if (*cow_len || (*cow_start == cpos)) {
+			/*
+			 * This extent needs to be CoW'd from its
+			 * beginning, so all we have to do is compute
+			 * how many clusters to grab.
+			 * We align the want_clusters to the edge of
+			 * contig_clusters to get better I/O.
+			 */
+			want_clusters = (want_clusters + contig_clusters - 1) /
+					contig_clusters * contig_clusters;
+
+			if (leaf_clusters < want_clusters)
+				*cow_len += leaf_clusters;
+			else
+				*cow_len += want_clusters;
+		} else if ((*cow_start + contig_clusters) >=
+			   (cpos + write_len)) {
+			/*
+			 * Breaking off contig_clusters at the front
+			 * of the extent will cover our write.  That's
+			 * easy.
+			 */
+			*cow_len = contig_clusters;
+		} else if ((rec_end - cpos) <= contig_clusters) {
+			/*
+			 * Breaking off contig_clusters at the tail of
+			 * this extent will cover cpos.
+			 */
+			*cow_start = rec_end - cpos;
+			*cow_len = contig_clusters;
+		} else if ((rec_end - cpos) <= want_clusters) {
+			/*
+			 * While we can't fit the entire write in this
+			 * extent, we know that the write goes from cpos
+			 * to the end of the extent.  Break that off.
+			 * We try to break it in the aligned range, if not
+			 * CoW the whole extent.
+			 */
+			*cow_start += (cpos - *cow_start) &
+					~(contig_clusters - 1);
+			*cow_len = rec_end - *cow_start;
+		} else {
+			/*
+			 * Ok, the entire write lives in the middle of
+			 * this extent.
+			 * Let's try to slice the extentup nicely.
+			 * Optimally, our CoW region starts@a
+			 * multiple of contig_clusters.
+			 */
+			*cow_start += (cpos - *cow_start) &
+				~(contig_clusters - 1);
+
+			want_clusters = (cpos + write_len) - *cow_start;
+			want_clusters = (want_clusters + contig_clusters - 1) /
+					contig_clusters * contig_clusters;
+			if (*cow_start + want_clusters <= rec_end)
+				*cow_len = want_clusters;
+			else
+				*cow_len = rec_end - *cow_start;
+		}
+
+		/* Have we covered our entire write yet? */
+		if ((*cow_start + *cow_len) >= (cpos + write_len))
+			break;
 
 		/*
 		 * If we reach the end of the extent block and don't get enough

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 19/41] ocfs2: Integrate CoW in file write.
  2009-08-21  2:12     ` Tao Ma
@ 2009-08-21 14:55       ` Tao Ma
  2009-08-21 20:43         ` Joel Becker
  0 siblings, 1 reply; 75+ messages in thread
From: Tao Ma @ 2009-08-21 14:55 UTC (permalink / raw)
  To: ocfs2-devel

Hi Joel,
	Here is the solution for refcounted+non-refcounted+refcounted issues.
	Please review.

Regards,
Tao

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 02244bb..0978abe 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1677,6 +1677,47 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
 	return ret;
 }
 
+/*
+ * CoW all refcounted clusters within [cow_start, cow_start + cow_len).
+ *
+ * Note: We may meet with non-refcounted clusters or even holes, just skip
+ * them and iterate until we reach cow_start + cow_len.
+ */
+static int ocfs2_refcount_clusters(struct inode *inode,
+				   struct buffer_head *di_bh,
+				   u32 cow_start, u32 cow_len)
+{
+	int ret = 0;
+	u32 p_cluster, num_clusters;
+	unsigned int ext_flags;
+
+	while (cow_len) {
+		ret = ocfs2_get_clusters(inode, cow_start, &p_cluster,
+					 &num_clusters, &ext_flags);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		if (cow_len < num_clusters)
+			num_clusters = cow_len;
+
+		if (ext_flags & OCFS2_EXT_REFCOUNTED) {
+			ret = ocfs2_refcount_cow(inode, di_bh, cow_start,
+						 num_clusters);
+			if (ret) {
+				mlog_errno(ret);
+				break;
+			}
+		}
+
+		cow_len -= num_clusters;
+		cow_start += num_clusters;
+	}
+
+	return ret;
+}
+
 int ocfs2_write_begin_nolock(struct address_space *mapping,
 			     loff_t pos, unsigned len, unsigned flags,
 			     struct page **pagep, void **fsdata,
@@ -1730,8 +1771,8 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 		BUG_ON(refcounted_cpos == UINT_MAX);
 		cow_len = wc->w_clen - (refcounted_cpos - wc->w_cpos);
 
-		ret = ocfs2_refcount_cow(inode, di_bh,
-					 refcounted_cpos, cow_len);
+		ret = ocfs2_refcount_clusters(inode, di_bh,
+					      refcounted_cpos, cow_len);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 17/41] ocfs2: Add CoW support.
  2009-08-21  8:24               ` Tao Ma
@ 2009-08-21 18:39                 ` Joel Becker
  2009-08-21 20:58                   ` Joel Becker
  2009-08-21 23:07                   ` Tao Ma
  0 siblings, 2 replies; 75+ messages in thread
From: Joel Becker @ 2009-08-21 18:39 UTC (permalink / raw)
  To: ocfs2-devel

On Fri, Aug 21, 2009 at 04:24:25PM +0800, Tao Ma wrote:
> 	As our talk in irc, here is the updated one. Please review.

	Perfect.
	Here's a version with the math wrapped in shiny readable
inlines.  Some comments were updated too.  I didn't actually change the
logic, so please verify I got it right.

Joel

diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index d59860d..8f0d210 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2499,7 +2499,52 @@ out:
 	return ret;
 }
 
-#define	MAX_COW_BYTES	1048576
+#define	MAX_CONTIG_BYTES	1048576
+
+static inline unsigned int ocfs2_cow_contig_clusters(struct super_block *sb)
+{
+	return ocfs2_clusters_for_bytes(sb, MAX_CONTIG_BYTES);
+}
+
+static inline unsigned int ocfs2_cow_contig_mask(struct super_block *sb)
+{
+	return ~(ocfs2_cow_contig_clusters(sb) - 1);
+}
+
+/*
+ * Given an extent that starts at 'start and an I/O that starts at
+ * 'cpos, find an offset (start * (n * contig_clusters)) that is closest
+ * to cpos while still being less than or equal to it.
+ *
+ * The goal is to break the extent at a multiple of contig_clusters.
+ */
+static inline unsigned int ocfs2_cow_align_start(struct super_block *sb,
+						 unsigned int start,
+						 unsigned int cpos)
+{
+	BUG_ON(start > cpos);
+
+	return start + ((cpos - start) & ocfs2_cow_contig_mask(sb));
+}
+
+/*
+ * Given a cluster count of len, pad it out so that it is a multiple
+ * of contig_clusters.
+ */
+static inline unsigned int ocfs2_cow_align_length(struct super_block *sb,
+						  unsigned int len)
+{
+	unsigned int padded =
+		(len + (ocfs2_cow_contig_clusters(sb) - 1)) &
+		ocfs2_cow_contig_mask(sb);
+
+	/* Did we wrap? */
+	if (padded < len)
+		padded = UINT_MAX;
+
+	return padded;
+}
+
 /*
  * Calculate out the start and number of virtual clusters we need to to CoW.
  *
@@ -2508,9 +2553,8 @@ out:
  * max_cpos is the place where we want to stop CoW intentionally.
  *
  * Normal we will start CoW from the beginning of extent record cotaining cpos.
- * And We will try to Cow as much clusters as we can until we reach
- * MAX_COW_BYTES. If the write_len is larger than MAX_COW_BYTES, we will
- * use that value as the maximum clusters.
+ * We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we
+ * get good I/O from the resulting extent tree.
  */
 static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
 					   struct ocfs2_extent_list *el,
@@ -2525,10 +2569,10 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
 	struct buffer_head *eb_bh = NULL;
 	struct ocfs2_extent_block *eb = NULL;
 	struct ocfs2_extent_rec *rec;
-	int max_clusters = ocfs2_clusters_for_bytes(inode->i_sb, MAX_COW_BYTES);
-	int leaf_clusters, rec_end = 0;
+	unsigned int want_clusters, rec_end = 0;
+	int contig_clusters = ocfs2_cow_contig_clusters(inode->i_sb);
+	int leaf_clusters;
 
-	max_clusters = max_clusters < write_len ? write_len : max_clusters;
 	if (tree_height > 0) {
 		ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh);
 		if (ret) {
@@ -2587,53 +2631,98 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
 			leaf_clusters = rec_end - le32_to_cpu(rec->e_cpos);
 		}
 
-		if (*cow_len + leaf_clusters >= max_clusters) {
-			if (*cow_len == 0) {
-				/*
-				 * cpos is in a very large extent record.
-				 * So just split max_clusters from the
-				 * extent record.
-				 */
-				if ((rec_end - cpos) <= max_clusters) {
-					/*
-					 * We can take max_clusters off
-					 * the end and cover all of our
-					 * write.
-					 */
-					*cow_start = rec_end - max_clusters;
-				} else if ((*cow_start + max_clusters) >
-					   (cpos + write_len)) {
-					/*
-					 * We can take max_clusters off
-					 * the front and cover all of
-					 * our write.
-					 */
-					/* NOOP, *cow_start is already set */
-				} else {
-					/*
-					 * We're CoWing more data than
-					 * write_len for contiguousness,
-					 * but it doesn't fit at the
-					 * front or end of this extent.
-					 * Let's try to slice the extent
-					 * up nicely.  Optimally, our
-					 * CoW region starts at a
-					 * multiple of max_clusters.  If
-					 * that doesn't fit, we give up
-					 * and just CoW at cpos.
-					 */
-					*cow_start +=
-						(cpos - *cow_start) &
-							~(max_clusters - 1);
-					if ((*cow_start + max_clusters) <
-					    (cpos + write_len))
-						*cow_start = cpos;
-				}
-			}
-			*cow_len = max_clusters;
-			break;
-		} else
+		/*
+		 * How many clusters do we actually need from
+		 * this extent?  First we see how many we actually
+		 * need to complete the write.  If that's smaller
+		 * than contig_clusters, we try for contig_clusters.
+		 */
+		if (!*cow_len)
+			want_clusters = write_len;
+		else
+			want_clusters = (cpos + write_len) -
+				(*cow_start + *cow_len);
+		if (want_clusters < contig_clusters)
+			want_clusters = contig_clusters;
+
+		/*
+		 * If the write does not cover the whole extent, we
+		 * need to calculate how we're going to split the extent.
+		 * We try to do it on contig_clusters boundaries.
+		 *
+		 * Any extent smaller than contig_clusters will be
+		 * CoWed in its entirety.
+		 */
+		if (leaf_clusters <= contig_clusters)
 			*cow_len += leaf_clusters;
+		else if (*cow_len || (*cow_start == cpos)) {
+			/*
+			 * This extent needs to be CoW'd from its
+			 * beginning, so all we have to do is compute
+			 * how many clusters to grab.  We align
+			 * want_clusters to the edge of contig_clusters
+			 * to get better I/O.
+			 */
+			want_clusters = ocfs2_cow_align_length(inode->i_sb,
+							       want_clusters);
+
+			if (leaf_clusters < want_clusters)
+				*cow_len += leaf_clusters;
+			else
+				*cow_len += want_clusters;
+		} else if ((*cow_start + contig_clusters) >=
+			   (cpos + write_len)) {
+			/*
+			 * Breaking off contig_clusters at the front
+			 * of the extent will cover our write.  That's
+			 * easy.
+			 */
+			*cow_len = contig_clusters;
+		} else if ((rec_end - cpos) <= contig_clusters) {
+			/*
+			 * Breaking off contig_clusters at the tail of
+			 * this extent will cover cpos.
+			 */
+			*cow_start = rec_end - cpos;
+			*cow_len = contig_clusters;
+		} else if ((rec_end - cpos) <= want_clusters) {
+			/*
+			 * While we can't fit the entire write in this
+			 * extent, we know that the write goes from cpos
+			 * to the end of the extent.  Break that off.
+			 * We try to break it@some multiple of
+			 * contig_clusters from the front of the extent.
+			 * Failing that (ie, cpos is within
+			 * contig_clusters of the front), we'll CoW the
+			 * entire extent.
+			 */
+			*cow_start = ocfs2_cow_align_start(inode->i_sb,
+							   *cow_start, cpos);
+			*cow_len = rec_end - *cow_start;
+		} else {
+			/*
+			 * Ok, the entire write lives in the middle of
+			 * this extent.  Let's try to slice the extent up
+			 * nicely.  Optimally, our CoW region starts at
+			 * m*contig_clusters from the beginning of the
+			 * extent and goes for n*contig_clusters,
+			 * covering the entire write.
+			 */
+			*cow_start = ocfs2_cow_align_start(inode->i_sb,
+							   *cow_start, cpos);
+
+			want_clusters = (cpos + write_len) - *cow_start;
+			want_clusters = ocfs2_cow_align_length(inode->i_sb,
+							       want_clusters);
+			if (*cow_start + want_clusters <= rec_end)
+				*cow_len = want_clusters;
+			else
+				*cow_len = rec_end - *cow_start;
+		}
+
+		/* Have we covered our entire write yet? */
+		if ((*cow_start + *cow_len) >= (cpos + write_len))
+			break;
 
 		/*
 		 * If we reach the end of the extent block and don't get enough
-- 

"Any man who is under 30, and is not a liberal, has not heart;
 and any man who is over 30, and is not a conservative, has no brains."
         - Sir Winston Churchill 

Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 19/41] ocfs2: Integrate CoW in file write.
  2009-08-21 14:55       ` Tao Ma
@ 2009-08-21 20:43         ` Joel Becker
  0 siblings, 0 replies; 75+ messages in thread
From: Joel Becker @ 2009-08-21 20:43 UTC (permalink / raw)
  To: ocfs2-devel

On Fri, Aug 21, 2009 at 10:55:52PM +0800, Tao Ma wrote:
> diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
> index 02244bb..0978abe 100644
> --- a/fs/ocfs2/aops.c
> +++ b/fs/ocfs2/aops.c
> @@ -1677,6 +1677,47 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
>  	return ret;
>  }
>  
> +/*
> + * CoW all refcounted clusters within [cow_start, cow_start + cow_len).
> + *
> + * Note: We may meet with non-refcounted clusters or even holes, just skip
> + * them and iterate until we reach cow_start + cow_len.
> + */
> +static int ocfs2_refcount_clusters(struct inode *inode,
> +				   struct buffer_head *di_bh,
> +				   u32 cow_start, u32 cow_len)
> +{
> +	int ret = 0;
> +	u32 p_cluster, num_clusters;
> +	unsigned int ext_flags;
> +
> +	while (cow_len) {
> +		ret = ocfs2_get_clusters(inode, cow_start, &p_cluster,
> +					 &num_clusters, &ext_flags);
> +		if (ret) {
> +			mlog_errno(ret);
> +			break;
> +		}
> +
> +		if (cow_len < num_clusters)
> +			num_clusters = cow_len;
> +
> +		if (ext_flags & OCFS2_EXT_REFCOUNTED) {
> +			ret = ocfs2_refcount_cow(inode, di_bh, cow_start,
> +						 num_clusters);
> +			if (ret) {
> +				mlog_errno(ret);
> +				break;
> +			}
> +		}
> +
> +		cow_len -= num_clusters;
> +		cow_start += num_clusters;
> +	}
> +
> +	return ret;
> +}
> +

	This belongs in refcounttree.c and should be called
ocfs2_refcount_cow().  It should have the exact same prototype as
ocfs2_refcount_cow() does now.  Inside this function the length should
be called write_len, not cow_len, because it's the length of the write
we're trying, not the length of the actual CoW.   Rename
ocfs2_refcount_cow() to ocfs2_refcount_cow_hunk():

	/*
	 * Starting at cpos, try to CoW write_len clusters.  Don't CoW
	 * past max_cpos.  This will stop when it runs into a hole or an
	 * unrefcounted extent.
	 */
	int ocfs2_refcount_cow_hunk(struct inode *inode,
				    struct buffer_head *di_bh,
				    u32 cpos, u32 write_len, u32 max_cpos);

	/*
	 * CoW any and all clusters between cpos and cpos+write_len.
	 * Don't CoW past max_cpos.  If this returns successfully, all
	 * clusters between cpos and cpos+write_len are safe to modify.
	 */
	int ocfs2_refcount_cow(struct inode *inode,
			       struct buffer_head *di_bh,
			       u32 cpos, u32 write_len, u32 max_cpos);

	The idea is that any part of the ocfs2 code should be able to
call ocfs2_refcount_cow() for any cpos+write_len and be assured that the
clusters are writable when the function returns.

Joel

-- 

"The cynics are right nine times out of ten."  
        - H. L. Mencken

Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127

^ permalink raw reply	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 17/41] ocfs2: Add CoW support.
  2009-08-21 18:39                 ` Joel Becker
@ 2009-08-21 20:58                   ` Joel Becker
  2009-08-24 15:04                     ` Tao Ma
  2009-08-21 23:07                   ` Tao Ma
  1 sibling, 1 reply; 75+ messages in thread
From: Joel Becker @ 2009-08-21 20:58 UTC (permalink / raw)
  To: ocfs2-devel

On Fri, Aug 21, 2009 at 11:39:44AM -0700, Joel Becker wrote:
> On Fri, Aug 21, 2009 at 04:24:25PM +0800, Tao Ma wrote:
> > 	As our talk in irc, here is the updated one. Please review.
> 
> 	Perfect.
> 	Here's a version with the math wrapped in shiny readable
> inlines.  Some comments were updated too.  I didn't actually change the
> logic, so please verify I got it right.
> 
> Joel
> 
> diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
> index d59860d..8f0d210 100644
> --- a/fs/ocfs2/refcounttree.c
> +++ b/fs/ocfs2/refcounttree.c
> @@ -2499,7 +2499,52 @@ out:
>  	return ret;
>  }
>  
> -#define	MAX_COW_BYTES	1048576
> +#define	MAX_CONTIG_BYTES	1048576
> +
> +static inline unsigned int ocfs2_cow_contig_clusters(struct super_block *sb)
> +{
> +	return ocfs2_clusters_for_bytes(sb, MAX_CONTIG_BYTES);
> +}
> +
> +static inline unsigned int ocfs2_cow_contig_mask(struct super_block *sb)
> +{
> +	return ~(ocfs2_cow_contig_clusters(sb) - 1);
> +}
> +
> +/*
> + * Given an extent that starts at 'start and an I/O that starts at
> + * 'cpos, find an offset (start * (n * contig_clusters)) that is closest

	Erg, "(start + (n * contig_clusters))".

Joel

-- 

"The only way to get rid of a temptation is to yield to it."
         - Oscar Wilde 

Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127

^ permalink raw reply	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 19/41] ocfs2: Integrate CoW in file write.
  2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 19/41] ocfs2: Integrate CoW in file write Tao Ma
  2009-08-21  1:04   ` Joel Becker
@ 2009-08-21 21:12   ` Joel Becker
  2009-08-21 23:17     ` Tao Ma
  1 sibling, 1 reply; 75+ messages in thread
From: Joel Becker @ 2009-08-21 21:12 UTC (permalink / raw)
  To: ocfs2-devel

On Tue, Aug 18, 2009 at 02:19:20PM +0800, Tao Ma wrote:
> +	if (ret == -ETXTBSY) {
> +		BUG_ON(refcounted_cpos == UINT_MAX);
> +		cow_len = wc->w_clen - (refcounted_cpos - wc->w_cpos);
> +
> +		ret = ocfs2_refcount_cow(inode, di_bh,
> +					 refcounted_cpos, cow_len);
> +		if (ret) {
> +			mlog_errno(ret);
> +			goto out;
> +		}

	I've just realized two more problems.  Well, one is a bug;
the other is merely inefficient.
	First, the inefficiency.  We've cooked up an
ocfs2_refcount_cow() that can handle any cpos+write_len.  But we call it
from ocfs2_write_begin_nolock(), which only goes a page at a time.  So
even for a 1GB write, we're going to CoW 1MB at a time.  For the first
page of the I/O, we'll call ocfs2_refcount_cow().  This will try to CoW
just the page.  We'll pad that out to 1MB in cal_cow_clusters().  For
the next few pages up to 1MB of I/O it will see the now-CoWed clusters.
But then it gets to the first page of the second MB.  It will CoW the
second MB, and so on.  We've just split the 1GB range into 1MB hunks on
disk.
	Now, we have to check REFCOUNTED in write_begin() (well,
populate_write_desc()) because that's how we trap mmap().  So we leave
it here.  But for a regular write, we know the entire length up in
ocfs2_file_aio_write().  So in ocfs2_prepare_inode_for_write(), right
before the direct_io checks, why don't we just CoW the entire write
there?  Create a check_for_refcount just like check_for_holes, except
instead of filling holes you CoW.  The function can easily skip out if
there's no refcount tree on the inode.  This gives us large CoW regions.
We're going to have to do the CoW anyway.  When a regular write gets
into populate_write_desc(), it won't find any refcounted records, so
there's no more work at that level.
	Even better, this fixes the bug.  What's the bug?  The current
code doesn't CoW O_DIRECT writes!  We only check in prepare_write_desc,
which we don't use for O_DIRECT!  And ocfs2_direct_IO_get_blocks()
doesn't trigger buffered fallback either!  Well, we don't want buffered
fallback.  We want CoW followed by real O_DIRECT.  ANd if we do the CoW
up in prepare_inode_for_write(), we get it.  Plus, we can put a
BUG_ON(ext_flags & REFCOUNTED) in direct_IO_get_blocks().

Joel

-- 

"There is no more evil thing on earth than race prejudice, none at 
 all.  I write deliberately -- it is the worst single thing in life 
 now.  It justifies and holds together more baseness, cruelty and
 abomination than any other sort of error in the world." 
        - H. G. Wells

Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127

^ permalink raw reply	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 17/41] ocfs2: Add CoW support.
  2009-08-21 18:39                 ` Joel Becker
  2009-08-21 20:58                   ` Joel Becker
@ 2009-08-21 23:07                   ` Tao Ma
  1 sibling, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-21 23:07 UTC (permalink / raw)
  To: ocfs2-devel

yeah, it is more readable with more comments.
I will integrate it into my original patch. thanks.

Regards,
Tao

Joel Becker wrote:
> On Fri, Aug 21, 2009 at 04:24:25PM +0800, Tao Ma wrote:
>> 	As our talk in irc, here is the updated one. Please review.
> 
> 	Perfect.
> 	Here's a version with the math wrapped in shiny readable
> inlines.  Some comments were updated too.  I didn't actually change the
> logic, so please verify I got it right.
> 
> Joel
> 
> diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
> index d59860d..8f0d210 100644
> --- a/fs/ocfs2/refcounttree.c
> +++ b/fs/ocfs2/refcounttree.c
> @@ -2499,7 +2499,52 @@ out:
>  	return ret;
>  }
>  
> -#define	MAX_COW_BYTES	1048576
> +#define	MAX_CONTIG_BYTES	1048576
> +
> +static inline unsigned int ocfs2_cow_contig_clusters(struct super_block *sb)
> +{
> +	return ocfs2_clusters_for_bytes(sb, MAX_CONTIG_BYTES);
> +}
> +
> +static inline unsigned int ocfs2_cow_contig_mask(struct super_block *sb)
> +{
> +	return ~(ocfs2_cow_contig_clusters(sb) - 1);
> +}
> +
> +/*
> + * Given an extent that starts at 'start and an I/O that starts at
> + * 'cpos, find an offset (start * (n * contig_clusters)) that is closest
> + * to cpos while still being less than or equal to it.
> + *
> + * The goal is to break the extent at a multiple of contig_clusters.
> + */
> +static inline unsigned int ocfs2_cow_align_start(struct super_block *sb,
> +						 unsigned int start,
> +						 unsigned int cpos)
> +{
> +	BUG_ON(start > cpos);
> +
> +	return start + ((cpos - start) & ocfs2_cow_contig_mask(sb));
> +}
> +
> +/*
> + * Given a cluster count of len, pad it out so that it is a multiple
> + * of contig_clusters.
> + */
> +static inline unsigned int ocfs2_cow_align_length(struct super_block *sb,
> +						  unsigned int len)
> +{
> +	unsigned int padded =
> +		(len + (ocfs2_cow_contig_clusters(sb) - 1)) &
> +		ocfs2_cow_contig_mask(sb);
> +
> +	/* Did we wrap? */
> +	if (padded < len)
> +		padded = UINT_MAX;
> +
> +	return padded;
> +}
> +
>  /*
>   * Calculate out the start and number of virtual clusters we need to to CoW.
>   *
> @@ -2508,9 +2553,8 @@ out:
>   * max_cpos is the place where we want to stop CoW intentionally.
>   *
>   * Normal we will start CoW from the beginning of extent record cotaining cpos.
> - * And We will try to Cow as much clusters as we can until we reach
> - * MAX_COW_BYTES. If the write_len is larger than MAX_COW_BYTES, we will
> - * use that value as the maximum clusters.
> + * We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we
> + * get good I/O from the resulting extent tree.
>   */
>  static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
>  					   struct ocfs2_extent_list *el,
> @@ -2525,10 +2569,10 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
>  	struct buffer_head *eb_bh = NULL;
>  	struct ocfs2_extent_block *eb = NULL;
>  	struct ocfs2_extent_rec *rec;
> -	int max_clusters = ocfs2_clusters_for_bytes(inode->i_sb, MAX_COW_BYTES);
> -	int leaf_clusters, rec_end = 0;
> +	unsigned int want_clusters, rec_end = 0;
> +	int contig_clusters = ocfs2_cow_contig_clusters(inode->i_sb);
> +	int leaf_clusters;
>  
> -	max_clusters = max_clusters < write_len ? write_len : max_clusters;
>  	if (tree_height > 0) {
>  		ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh);
>  		if (ret) {
> @@ -2587,53 +2631,98 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
>  			leaf_clusters = rec_end - le32_to_cpu(rec->e_cpos);
>  		}
>  
> -		if (*cow_len + leaf_clusters >= max_clusters) {
> -			if (*cow_len == 0) {
> -				/*
> -				 * cpos is in a very large extent record.
> -				 * So just split max_clusters from the
> -				 * extent record.
> -				 */
> -				if ((rec_end - cpos) <= max_clusters) {
> -					/*
> -					 * We can take max_clusters off
> -					 * the end and cover all of our
> -					 * write.
> -					 */
> -					*cow_start = rec_end - max_clusters;
> -				} else if ((*cow_start + max_clusters) >
> -					   (cpos + write_len)) {
> -					/*
> -					 * We can take max_clusters off
> -					 * the front and cover all of
> -					 * our write.
> -					 */
> -					/* NOOP, *cow_start is already set */
> -				} else {
> -					/*
> -					 * We're CoWing more data than
> -					 * write_len for contiguousness,
> -					 * but it doesn't fit at the
> -					 * front or end of this extent.
> -					 * Let's try to slice the extent
> -					 * up nicely.  Optimally, our
> -					 * CoW region starts at a
> -					 * multiple of max_clusters.  If
> -					 * that doesn't fit, we give up
> -					 * and just CoW at cpos.
> -					 */
> -					*cow_start +=
> -						(cpos - *cow_start) &
> -							~(max_clusters - 1);
> -					if ((*cow_start + max_clusters) <
> -					    (cpos + write_len))
> -						*cow_start = cpos;
> -				}
> -			}
> -			*cow_len = max_clusters;
> -			break;
> -		} else
> +		/*
> +		 * How many clusters do we actually need from
> +		 * this extent?  First we see how many we actually
> +		 * need to complete the write.  If that's smaller
> +		 * than contig_clusters, we try for contig_clusters.
> +		 */
> +		if (!*cow_len)
> +			want_clusters = write_len;
> +		else
> +			want_clusters = (cpos + write_len) -
> +				(*cow_start + *cow_len);
> +		if (want_clusters < contig_clusters)
> +			want_clusters = contig_clusters;
> +
> +		/*
> +		 * If the write does not cover the whole extent, we
> +		 * need to calculate how we're going to split the extent.
> +		 * We try to do it on contig_clusters boundaries.
> +		 *
> +		 * Any extent smaller than contig_clusters will be
> +		 * CoWed in its entirety.
> +		 */
> +		if (leaf_clusters <= contig_clusters)
>  			*cow_len += leaf_clusters;
> +		else if (*cow_len || (*cow_start == cpos)) {
> +			/*
> +			 * This extent needs to be CoW'd from its
> +			 * beginning, so all we have to do is compute
> +			 * how many clusters to grab.  We align
> +			 * want_clusters to the edge of contig_clusters
> +			 * to get better I/O.
> +			 */
> +			want_clusters = ocfs2_cow_align_length(inode->i_sb,
> +							       want_clusters);
> +
> +			if (leaf_clusters < want_clusters)
> +				*cow_len += leaf_clusters;
> +			else
> +				*cow_len += want_clusters;
> +		} else if ((*cow_start + contig_clusters) >=
> +			   (cpos + write_len)) {
> +			/*
> +			 * Breaking off contig_clusters at the front
> +			 * of the extent will cover our write.  That's
> +			 * easy.
> +			 */
> +			*cow_len = contig_clusters;
> +		} else if ((rec_end - cpos) <= contig_clusters) {
> +			/*
> +			 * Breaking off contig_clusters at the tail of
> +			 * this extent will cover cpos.
> +			 */
> +			*cow_start = rec_end - cpos;
> +			*cow_len = contig_clusters;
> +		} else if ((rec_end - cpos) <= want_clusters) {
> +			/*
> +			 * While we can't fit the entire write in this
> +			 * extent, we know that the write goes from cpos
> +			 * to the end of the extent.  Break that off.
> +			 * We try to break it at some multiple of
> +			 * contig_clusters from the front of the extent.
> +			 * Failing that (ie, cpos is within
> +			 * contig_clusters of the front), we'll CoW the
> +			 * entire extent.
> +			 */
> +			*cow_start = ocfs2_cow_align_start(inode->i_sb,
> +							   *cow_start, cpos);
> +			*cow_len = rec_end - *cow_start;
> +		} else {
> +			/*
> +			 * Ok, the entire write lives in the middle of
> +			 * this extent.  Let's try to slice the extent up
> +			 * nicely.  Optimally, our CoW region starts at
> +			 * m*contig_clusters from the beginning of the
> +			 * extent and goes for n*contig_clusters,
> +			 * covering the entire write.
> +			 */
> +			*cow_start = ocfs2_cow_align_start(inode->i_sb,
> +							   *cow_start, cpos);
> +
> +			want_clusters = (cpos + write_len) - *cow_start;
> +			want_clusters = ocfs2_cow_align_length(inode->i_sb,
> +							       want_clusters);
> +			if (*cow_start + want_clusters <= rec_end)
> +				*cow_len = want_clusters;
> +			else
> +				*cow_len = rec_end - *cow_start;
> +		}
> +
> +		/* Have we covered our entire write yet? */
> +		if ((*cow_start + *cow_len) >= (cpos + write_len))
> +			break;
>  
>  		/*
>  		 * If we reach the end of the extent block and don't get enough

^ permalink raw reply	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 19/41] ocfs2: Integrate CoW in file write.
  2009-08-21 21:12   ` Joel Becker
@ 2009-08-21 23:17     ` Tao Ma
  2009-08-21 23:42       ` Joel Becker
  0 siblings, 1 reply; 75+ messages in thread
From: Tao Ma @ 2009-08-21 23:17 UTC (permalink / raw)
  To: ocfs2-devel



Joel Becker wrote:
> On Tue, Aug 18, 2009 at 02:19:20PM +0800, Tao Ma wrote:
>> +	if (ret == -ETXTBSY) {
>> +		BUG_ON(refcounted_cpos == UINT_MAX);
>> +		cow_len = wc->w_clen - (refcounted_cpos - wc->w_cpos);
>> +
>> +		ret = ocfs2_refcount_cow(inode, di_bh,
>> +					 refcounted_cpos, cow_len);
>> +		if (ret) {
>> +			mlog_errno(ret);
>> +			goto out;
>> +		}
> 
> 	I've just realized two more problems.  Well, one is a bug;
> the other is merely inefficient.
> 	First, the inefficiency.  We've cooked up an
> ocfs2_refcount_cow() that can handle any cpos+write_len.  But we call it
> from ocfs2_write_begin_nolock(), which only goes a page at a time.  So
> even for a 1GB write, we're going to CoW 1MB at a time.  For the first
> page of the I/O, we'll call ocfs2_refcount_cow().  This will try to CoW
> just the page.  We'll pad that out to 1MB in cal_cow_clusters().  For
> the next few pages up to 1MB of I/O it will see the now-CoWed clusters.
> But then it gets to the first page of the second MB.  It will CoW the
> second MB, and so on.  We've just split the 1GB range into 1MB hunks on
> disk.
yes, that is anticipated. We CoW 1MB at most at a time.
> 	Now, we have to check REFCOUNTED in write_begin() (well,
> populate_write_desc()) because that's how we trap mmap().  So we leave
> it here.  But for a regular write, we know the entire length up in
> ocfs2_file_aio_write().  So in ocfs2_prepare_inode_for_write(), right
> before the direct_io checks, why don't we just CoW the entire write
> there?  Create a check_for_refcount just like check_for_holes, except
> instead of filling holes you CoW.  The function can easily skip out if
> there's no refcount tree on the inode.  This gives us large CoW regions.
> We're going to have to do the CoW anyway.  When a regular write gets
> into populate_write_desc(), it won't find any refcounted records, so
> there's no more work at that level.
yes, we can put a check there, but we can't resolve the 1MB issue you 
mentioned above either. Maybe we can make ocfs2_refcount_cow more 
intelligent? But I would say let us leave it as-is and this can be a 
future improvement.
> 	Even better, this fixes the bug.  What's the bug?  The current
> code doesn't CoW O_DIRECT writes!  We only check in prepare_write_desc,
> which we don't use for O_DIRECT!  And ocfs2_direct_IO_get_blocks()
> doesn't trigger buffered fallback either!  Well, we don't want buffered
> fallback.  We want CoW followed by real O_DIRECT.  ANd if we do the CoW
> up in prepare_inode_for_write(), we get it.  Plus, we can put a
> BUG_ON(ext_flags & REFCOUNTED) in direct_IO_get_blocks().
oh, yes, this is really a bug. I don't think of O_DIRECT when I created 
this patch set. So I may really need to add a check in 
ocfs2_prepare_inode_for_write(I guess just need to call 
ocfs2_refcount_cow and make write_len<=1MB).

This also make me think that we can cal ocfs2_refcount_cow right before 
we populate_write_desc, so that we don't need to call it twice and we 
can directly BUG_ON(ext_flags & REFCOUNTED) in it.

Regards,
Tao

^ permalink raw reply	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 19/41] ocfs2: Integrate CoW in file write.
  2009-08-21 23:17     ` Tao Ma
@ 2009-08-21 23:42       ` Joel Becker
  2009-08-22  0:31         ` Tao Ma
  2009-08-24 15:06         ` Tao Ma
  0 siblings, 2 replies; 75+ messages in thread
From: Joel Becker @ 2009-08-21 23:42 UTC (permalink / raw)
  To: ocfs2-devel

On Sat, Aug 22, 2009 at 07:17:55AM +0800, Tao Ma wrote:
> Joel Becker wrote:
> yes, that is anticipated. We CoW 1MB at most at a time.

	No!  We want to CoW the entire write_len at once!  That's why I
wrote cal_cow_cluters() the way I did!

> yes, we can put a check there, but we can't resolve the 1MB issue
> you mentioned above either. Maybe we can make ocfs2_refcount_cow
> more intelligent? But I would say let us leave it as-is and this can
> be a future improvement.

	If you make ocfs2_refcount_cow() into ocfs2_refcount_cow_hunk()
like I described in my email, make the your ocfs2_refcount_cluters()
into a new ocfs2_refcount_cow() like I described, then we can CoW the
entire write at once.

> oh, yes, this is really a bug. I don't think of O_DIRECT when I
> created this patch set. So I may really need to add a check in
> ocfs2_prepare_inode_for_write(I guess just need to call
> ocfs2_refcount_cow and make write_len<=1MB).

	No, you pass a write_len of the full I/O!  We don't want to
fragment the thing, and there's no reason to :-)

> This also make me think that we can cal ocfs2_refcount_cow right
> before we populate_write_desc, so that we don't need to call it
> twice and we can directly BUG_ON(ext_flags & REFCOUNTED) in it.

	With the new ocfs2_refcount_cow() that loops over the entire
write, absolutely.

Joel

-- 

"Every day I get up and look through the Forbes list of the richest
 people in America. If I'm not there, I go to work."
        - Robert Orben

Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127

^ permalink raw reply	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 19/41] ocfs2: Integrate CoW in file write.
  2009-08-21 23:42       ` Joel Becker
@ 2009-08-22  0:31         ` Tao Ma
  2009-08-24 15:06         ` Tao Ma
  1 sibling, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-22  0:31 UTC (permalink / raw)
  To: ocfs2-devel



Joel Becker wrote:
> On Sat, Aug 22, 2009 at 07:17:55AM +0800, Tao Ma wrote:
>   
>> Joel Becker wrote:
>> yes, that is anticipated. We CoW 1MB at most at a time.
>>     
>
> 	No!  We want to CoW the entire write_len at once!  That's why I
> wrote cal_cow_cluters() the way I did!
>   
oh, yes. My brain is stuck in the old "MAX_COW_BYTES".  now we have 
"MAX_CONTIG_BYTES".
>   
>> yes, we can put a check there, but we can't resolve the 1MB issue
>> you mentioned above either. Maybe we can make ocfs2_refcount_cow
>> more intelligent? But I would say let us leave it as-is and this can
>> be a future improvement.
>>     
>
> 	If you make ocfs2_refcount_cow() into ocfs2_refcount_cow_hunk()
> like I described in my email, make the your ocfs2_refcount_cluters()
> into a new ocfs2_refcount_cow() like I described, then we can CoW the
> entire write at once.
>
>   
>> oh, yes, this is really a bug. I don't think of O_DIRECT when I
>> created this patch set. So I may really need to add a check in
>> ocfs2_prepare_inode_for_write(I guess just need to call
>> ocfs2_refcount_cow and make write_len<=1MB).
>>     
>
> 	No, you pass a write_len of the full I/O!  We don't want to
> fragment the thing, and there's no reason to :-)
>   
yes.
>   
>> This also make me think that we can cal ocfs2_refcount_cow right
>> before we populate_write_desc, so that we don't need to call it
>> twice and we can directly BUG_ON(ext_flags & REFCOUNTED) in it.
>>     
>
> 	With the new ocfs2_refcount_cow() that loops over the entire
> write, absolutely.
>   
sure.

Regards,
Tao

^ permalink raw reply	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 17/41] ocfs2: Add CoW support.
  2009-08-21 20:58                   ` Joel Becker
@ 2009-08-24 15:04                     ` Tao Ma
  2009-08-24 18:20                       ` Joel Becker
  2009-08-25 19:30                       ` Joel Becker
  0 siblings, 2 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-24 15:04 UTC (permalink / raw)
  To: ocfs2-devel

Hi Joel,
	This is the updated patch. Please review.

Regards,
Tao

This patch try CoW support for a refcounted record.

the whole process will be:
1. Calculate how much clusters we need to CoW and where we start.
   Currently we will CoW 1MB at most.
2. Do CoW for the clusters with the help of page cache.
3. Change the b-tree structure with the new allocated clusters.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/alloc.c        |   25 +-
 fs/ocfs2/alloc.h        |    5 +
 fs/ocfs2/aops.c         |    4 +-
 fs/ocfs2/aops.h         |    2 +
 fs/ocfs2/refcounttree.c |  814 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/refcounttree.h |    2 +
 6 files changed, 841 insertions(+), 11 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 03438a6..b8fc95d 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6998,9 +6998,9 @@ static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
 	return 0;
 }
 
-static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
-				     unsigned int from, unsigned int to,
-				     struct page *page, int zero, u64 *phys)
+void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
+			      unsigned int from, unsigned int to,
+			      struct page *page, int zero, u64 *phys)
 {
 	int ret, partial = 0;
 
@@ -7068,20 +7068,16 @@ out:
 		ocfs2_unlock_and_free_pages(pages, numpages);
 }
 
-static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
-				struct page **pages, int *num)
+int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
+		     struct page **pages, int *num)
 {
 	int numpages, ret = 0;
-	struct super_block *sb = inode->i_sb;
 	struct address_space *mapping = inode->i_mapping;
 	unsigned long index;
 	loff_t last_page_bytes;
 
 	BUG_ON(start > end);
 
-	BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
-	       (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
-
 	numpages = 0;
 	last_page_bytes = PAGE_ALIGN(end);
 	index = start >> PAGE_CACHE_SHIFT;
@@ -7109,6 +7105,17 @@ out:
 	return ret;
 }
 
+static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
+				struct page **pages, int *num)
+{
+	struct super_block *sb = inode->i_sb;
+
+	BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
+	       (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
+
+	return ocfs2_grab_pages(inode, start, end, pages, num);
+}
+
 /*
  * Zero the area past i_size but still within an allocated
  * cluster. This avoids exposing nonzero data on subsequent file
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 19d5b88..9c122d5 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -271,6 +271,11 @@ static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
 	return !rec->e_leaf_clusters;
 }
 
+int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
+		     struct page **pages, int *num);
+void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
+			      unsigned int from, unsigned int to,
+			      struct page *page, int zero, u64 *phys);
 /*
  * Structures which describe a path through a btree, and functions to
  * manipulate them.
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 7208ea9..65e17b3 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -126,8 +126,8 @@ bail:
 	return err;
 }
 
-static int ocfs2_get_block(struct inode *inode, sector_t iblock,
-			   struct buffer_head *bh_result, int create)
+int ocfs2_get_block(struct inode *inode, sector_t iblock,
+		    struct buffer_head *bh_result, int create)
 {
 	int err = 0;
 	unsigned int ext_flags;
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 503e492..c48e93f 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -57,6 +57,8 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
 			   struct buffer_head *di_bh);
 int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size);
 
+int ocfs2_get_block(struct inode *inode, sector_t iblock,
+		    struct buffer_head *bh_result, int create);
 /* all ocfs2_dio_end_io()'s fault */
 #define ocfs2_iocb_is_rw_locked(iocb) \
 	test_bit(0, (unsigned long *)&iocb->private)
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index e72dbdd..4e7df8b 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -31,6 +31,27 @@
 #include "sysfile.h"
 #include "dlmglue.h"
 #include "extent_map.h"
+#include "aops.h"
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include <linux/swap.h>
+
+struct ocfs2_cow_context {
+	struct inode *inode;
+	u32 cow_start;
+	u32 cow_len;
+	struct ocfs2_extent_tree di_et;
+	struct ocfs2_caching_info *ref_ci;
+	struct buffer_head *ref_root_bh;
+	struct ocfs2_alloc_context *meta_ac;
+	struct ocfs2_alloc_context *data_ac;
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+};
 
 static inline struct ocfs2_refcount_tree *
 cache_info_to_refcount(struct ocfs2_caching_info *ci)
@@ -2404,3 +2425,796 @@ out:
 	brelse(ref_root_bh);
 	return ret;
 }
+
+#define	MAX_CONTIG_BYTES	1048576
+
+static inline unsigned int ocfs2_cow_contig_clusters(struct super_block *sb)
+{
+	return ocfs2_clusters_for_bytes(sb, MAX_CONTIG_BYTES);
+}
+
+static inline unsigned int ocfs2_cow_contig_mask(struct super_block *sb)
+{
+	return ~(ocfs2_cow_contig_clusters(sb) - 1);
+}
+
+/*
+ * Given an extent that starts at 'start' and an I/O that starts at 'cpos',
+ * find an offset (start + (n * contig_clusters)) that is closest to cpos
+ * while still being less than or equal to it.
+ *
+ * The goal is to break the extent at a multiple of contig_clusters.
+ */
+static inline unsigned int ocfs2_cow_align_start(struct super_block *sb,
+						 unsigned int start,
+						 unsigned int cpos)
+{
+	BUG_ON(start > cpos);
+
+	return start + ((cpos - start) & ocfs2_cow_contig_mask(sb));
+}
+
+/*
+ * Given a cluster count of len, pad it out so that it is a multiple
+ * of contig_clusters.
+ */
+static inline unsigned int ocfs2_cow_align_length(struct super_block *sb,
+						  unsigned int len)
+{
+	unsigned int padded =
+		(len + (ocfs2_cow_contig_clusters(sb) - 1)) &
+		ocfs2_cow_contig_mask(sb);
+
+	/* Did we wrap? */
+	if (padded < len)
+		padded = UINT_MAX;
+
+	return padded;
+}
+
+/*
+ * Calculate out the start and number of virtual clusters we need to to CoW.
+ *
+ * cpos is vitual start cluster position we want to do CoW in a
+ * file and write_len is the cluster length.
+ *
+ * Normal we will start CoW from the beginning of extent record cotaining cpos.
+ * We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we
+ * get good I/O from the resulting extent tree.
+ */
+static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
+					   struct buffer_head *di_bh,
+					   u32 cpos,
+					   u32 write_len,
+					   u32 *cow_start,
+					   u32 *cow_len)
+{
+	int ret = 0;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
+	struct ocfs2_extent_list *el = &di->id2.i_list;
+	int tree_height = le16_to_cpu(el->l_tree_depth), i;
+	struct buffer_head *eb_bh = NULL;
+	struct ocfs2_extent_block *eb = NULL;
+	struct ocfs2_extent_rec *rec;
+	unsigned int want_clusters, rec_end = 0;
+	int contig_clusters = ocfs2_cow_contig_clusters(inode->i_sb);
+	int leaf_clusters;
+
+	if (tree_height > 0) {
+		ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+
+		if (el->l_tree_depth) {
+			ocfs2_error(inode->i_sb,
+				    "Inode %lu has non zero tree depth in "
+				    "leaf block %llu\n", inode->i_ino,
+				    (unsigned long long)eb_bh->b_blocknr);
+			ret = -EROFS;
+			goto out;
+		}
+	}
+
+	*cow_len = 0;
+	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+		rec = &el->l_recs[i];
+
+		if (ocfs2_is_empty_extent(rec)) {
+			mlog_bug_on_msg(i != 0, "Inode %lu has empty record in "
+					"index %d\n", inode->i_ino, i);
+			continue;
+		}
+
+		if (le32_to_cpu(rec->e_cpos) +
+		    le16_to_cpu(rec->e_leaf_clusters) <= cpos)
+			continue;
+
+		if (*cow_len == 0) {
+			/*
+			 * We should find a refcounted record in the
+			 * first pass.
+			 */
+			BUG_ON(!(rec->e_flags & OCFS2_EXT_REFCOUNTED));
+			*cow_start = le32_to_cpu(rec->e_cpos);
+		}
+
+		/*
+		 * If we encounter a hole or a non-refcounted record,
+		 * stop the search.
+		 */
+		if ((!(rec->e_flags & OCFS2_EXT_REFCOUNTED)) ||
+		    (*cow_len && rec_end != le32_to_cpu(rec->e_cpos)))
+			break;
+
+		leaf_clusters = le16_to_cpu(rec->e_leaf_clusters);
+		rec_end = le32_to_cpu(rec->e_cpos) + leaf_clusters;
+
+		/*
+		 * How many clusters do we actually need from
+		 * this extent?  First we see how many we actually
+		 * need to complete the write.  If that's smaller
+		 * than contig_clusters, we try for contig_clusters.
+		 */
+		if (!*cow_len)
+			want_clusters = write_len;
+		else
+			want_clusters = (cpos + write_len) -
+				(*cow_start + *cow_len);
+		if (want_clusters < contig_clusters)
+			want_clusters = contig_clusters;
+
+		/*
+		 * If the write does not cover the whole extent, we
+		 * need to calculate how we're going to split the extent.
+		 * We try to do it on contig_clusters boundaries.
+		 *
+		 * Any extent smaller than contig_clusters will be
+		 * CoWed in its entirety.
+		 */
+		if (leaf_clusters <= contig_clusters)
+			*cow_len += leaf_clusters;
+		else if (*cow_len || (*cow_start == cpos)) {
+			/*
+			 * This extent needs to be CoW'd from its
+			 * beginning, so all we have to do is compute
+			 * how many clusters to grab.  We align
+			 * want_clusters to the edge of contig_clusters
+			 * to get better I/O.
+			 */
+			want_clusters = ocfs2_cow_align_length(inode->i_sb,
+							       want_clusters);
+
+			if (leaf_clusters < want_clusters)
+				*cow_len += leaf_clusters;
+			else
+				*cow_len += want_clusters;
+		} else if ((*cow_start + contig_clusters) >=
+			   (cpos + write_len)) {
+			/*
+			 * Breaking off contig_clusters at the front
+			 * of the extent will cover our write.  That's
+			 * easy.
+			 */
+			*cow_len = contig_clusters;
+		} else if ((rec_end - cpos) <= contig_clusters) {
+			/*
+			 * Breaking off contig_clusters at the tail of
+			 * this extent will cover cpos.
+			 */
+			*cow_start = rec_end - contig_clusters;
+			*cow_len = contig_clusters;
+		} else if ((rec_end - cpos) <= want_clusters) {
+			/*
+			 * While we can't fit the entire write in this
+			 * extent, we know that the write goes from cpos
+			 * to the end of the extent.  Break that off.
+			 * We try to break it@some multiple of
+			 * contig_clusters from the front of the extent.
+			 * Failing that (ie, cpos is within
+			 * contig_clusters of the front), we'll CoW the
+			 * entire extent.
+			 */
+			*cow_start = ocfs2_cow_align_start(inode->i_sb,
+							   *cow_start, cpos);
+			*cow_len = rec_end - *cow_start;
+		} else {
+			/*
+			 * Ok, the entire write lives in the middle of
+			 * this extent.  Let's try to slice the extent up
+			 * nicely.  Optimally, our CoW region starts at
+			 * m*contig_clusters from the beginning of the
+			 * extent and goes for n*contig_clusters,
+			 * covering the entire write.
+			 */
+			*cow_start = ocfs2_cow_align_start(inode->i_sb,
+							   *cow_start, cpos);
+
+			want_clusters = (cpos + write_len) - *cow_start;
+			want_clusters = ocfs2_cow_align_length(inode->i_sb,
+							       want_clusters);
+			if (*cow_start + want_clusters <= rec_end)
+				*cow_len = want_clusters;
+			else
+				*cow_len = rec_end - *cow_start;
+		}
+
+		/* Have we covered our entire write yet? */
+		if ((*cow_start + *cow_len) >= (cpos + write_len))
+			break;
+
+		/*
+		 * If we reach the end of the extent block and don't get enough
+		 * clusters, continue with the next extent block if possible.
+		 */
+		if (i + 1 == le16_to_cpu(el->l_next_free_rec) &&
+		    eb && eb->h_next_leaf_blk) {
+			brelse(eb_bh);
+			eb_bh = NULL;
+
+			ret = ocfs2_read_extent_block(INODE_CACHE(inode),
+					       le64_to_cpu(eb->h_next_leaf_blk),
+					       &eb_bh);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+			el = &eb->h_list;
+			i = -1;
+		}
+	}
+
+out:
+	brelse(eb_bh);
+	return ret;
+}
+
+/*
+ * Prepare meta_ac, data_ac and calculate credits when we want to add some
+ * num_clusters in data_tree "et" and change the refcount for the old
+ * clusters(starting form p_cluster) in the refcount tree.
+ *
+ * Note:
+ * 1. since we may split the old tree, so we at most will need num_clusters + 2
+ *    more new leaf records.
+ * 2. In some case, we may not need to reserve new clusters(e.g, reflink), so
+ *    just give data_ac = NULL.
+ */
+static int ocfs2_lock_refcount_allocators(struct super_block *sb,
+					u32 p_cluster, u32 num_clusters,
+					struct ocfs2_extent_tree *et,
+					struct ocfs2_caching_info *ref_ci,
+					struct buffer_head *ref_root_bh,
+					struct ocfs2_alloc_context **meta_ac,
+					struct ocfs2_alloc_context **data_ac,
+					int *credits)
+{
+	int ret = 0, meta_add = 0;
+	int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et);
+
+	if (num_free_extents < 0) {
+		ret = num_free_extents;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (num_free_extents < num_clusters + 2)
+		meta_add =
+			ocfs2_extend_meta_needed(et->et_root_el);
+
+	*credits += ocfs2_calc_extend_credits(sb, et->et_root_el,
+					      num_clusters + 2);
+
+	ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh,
+					       p_cluster, num_clusters,
+					       &meta_add, credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	mlog(0, "reserve new metadata %d, clusters %u, credits = %d\n",
+	     meta_add, num_clusters, *credits);
+	ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add,
+						meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (data_ac) {
+		ret = ocfs2_reserve_clusters(OCFS2_SB(sb), num_clusters,
+					     data_ac);
+		if (ret)
+			mlog_errno(ret);
+	}
+
+out:
+	if (ret) {
+		if (*meta_ac) {
+			ocfs2_free_alloc_context(*meta_ac);
+			*meta_ac = NULL;
+		}
+	}
+
+	return ret;
+}
+
+static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
+{
+	BUG_ON(buffer_dirty(bh));
+
+	clear_buffer_mapped(bh);
+
+	return 0;
+}
+
+static int ocfs2_duplicate_clusters(handle_t *handle,
+				    struct ocfs2_cow_context *context,
+				    u32 cpos, u32 old_cluster,
+				    u32 new_cluster, u32 new_len)
+{
+	int ret = 0, partial;
+	struct ocfs2_caching_info *ci = context->di_et.et_ci;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+	u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
+	struct page *page;
+	pgoff_t page_index;
+	unsigned int from, to;
+	loff_t offset, end, map_end;
+	struct address_space *mapping = context->inode->i_mapping;
+
+	mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster,
+	     new_cluster, new_len, cpos);
+
+	offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
+	end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
+
+	while (offset < end) {
+		page_index = offset >> PAGE_CACHE_SHIFT;
+		map_end = (page_index + 1) << PAGE_CACHE_SHIFT;
+		if (map_end > end)
+			map_end = end;
+
+		/* from, to is the offset within the page. */
+		from = offset & (PAGE_CACHE_SIZE - 1);
+		to = PAGE_CACHE_SIZE;
+		if (map_end & (PAGE_CACHE_SIZE - 1))
+			to = map_end & (PAGE_CACHE_SIZE - 1);
+
+		page = grab_cache_page(mapping, page_index);
+
+		/* This page can't be dirtied before we CoW it out. */
+		BUG_ON(PageDirty(page));
+
+		if (!PageUptodate(page)) {
+			ret = block_read_full_page(page, ocfs2_get_block);
+			if (ret) {
+				mlog_errno(ret);
+				goto unlock;
+			}
+			lock_page(page);
+		}
+
+		if (page_has_buffers(page)) {
+			ret = walk_page_buffers(handle, page_buffers(page),
+						from, to, &partial,
+						ocfs2_clear_cow_buffer);
+			if (ret) {
+				mlog_errno(ret);
+				goto unlock;
+			}
+		}
+
+		ocfs2_map_and_dirty_page(context->inode,
+					 handle, from, to,
+					 page, 0, &new_block);
+		mark_page_accessed(page);
+unlock:
+		unlock_page(page);
+		page_cache_release(page);
+		page = NULL;
+		offset = map_end;
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+static int ocfs2_clear_ext_refcount(handle_t *handle,
+				    struct ocfs2_extent_tree *et,
+				    u32 cpos, u32 p_cluster, u32 len,
+				    unsigned int ext_flags,
+				    struct ocfs2_alloc_context *meta_ac,
+				    struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret, index;
+	struct ocfs2_extent_rec replace_rec;
+	struct ocfs2_path *path = NULL;
+	struct ocfs2_extent_list *el;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
+	u64 ino = ocfs2_metadata_cache_owner(et->et_ci);
+
+	mlog(0, "inode %llu cpos %u, len %u, p_cluster %u, ext_flags %u\n",
+	     (unsigned long long)ino, cpos, len, p_cluster, ext_flags);
+
+	memset(&replace_rec, 0, sizeof(replace_rec));
+	replace_rec.e_cpos = cpu_to_le32(cpos);
+	replace_rec.e_leaf_clusters = cpu_to_le16(len);
+	replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(sb,
+								   p_cluster));
+	replace_rec.e_flags = ext_flags;
+	replace_rec.e_flags &= ~OCFS2_EXT_REFCOUNTED;
+
+	path = ocfs2_new_path_from_et(et);
+	if (!path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_find_path(et->et_ci, path, cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	el = path_leaf_el(path);
+
+	index = ocfs2_search_extent_list(el, cpos);
+	if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+		ocfs2_error(sb,
+			    "Inode %llu has an extent at cpos %u which can no "
+			    "longer be found.\n",
+			    (unsigned long long)ino, cpos);
+		ret = -EROFS;
+		goto out;
+	}
+
+	ret = ocfs2_split_extent(handle, et, path, index,
+				 &replace_rec, meta_ac, dealloc);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	ocfs2_free_path(path);
+	return ret;
+}
+
+static int ocfs2_replace_clusters(handle_t *handle,
+				  struct ocfs2_cow_context *context,
+				  u32 cpos, u32 old,
+				  u32 new, u32 len,
+				  unsigned int ext_flags)
+{
+	int ret;
+	struct ocfs2_caching_info *ci = context->di_et.et_ci;
+	u64 ino = ocfs2_metadata_cache_owner(ci);
+
+	mlog(0, "inode %llu, cpos %u, old %u, new %u, len %u, ext_flags %u\n",
+	     (unsigned long long)ino, cpos, old, new, len, ext_flags);
+
+	/*If the old clusters is unwritten, no need to duplicate. */
+	if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
+		ret = ocfs2_duplicate_clusters(handle, context, cpos,
+					       old, new, len);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	ret = ocfs2_clear_ext_refcount(handle, &context->di_et,
+				       cpos, new, len, ext_flags,
+				       context->meta_ac, &context->dealloc);
+	if (ret)
+		mlog_errno(ret);
+out:
+	return ret;
+}
+
+static int ocfs2_cow_sync_writeback(struct super_block *sb,
+				    struct ocfs2_cow_context *context,
+				    u32 cpos, u32 num_clusters)
+{
+	int ret = 0;
+	loff_t offset, end, map_end;
+	pgoff_t page_index;
+	struct page *page;
+
+	if (ocfs2_should_order_data(context->inode))
+		return 0;
+
+	offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
+	end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits);
+
+	ret = filemap_fdatawrite_range(context->inode->i_mapping,
+				       offset, end - 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	while (offset < end) {
+		page_index = offset >> PAGE_CACHE_SHIFT;
+		map_end = (page_index + 1) << PAGE_CACHE_SHIFT;
+		if (map_end > end)
+			map_end = end;
+
+		page = grab_cache_page(context->inode->i_mapping, page_index);
+		BUG_ON(!page);
+
+		wait_on_page_writeback(page);
+		if (PageError(page)) {
+			ret = -EIO;
+			mlog_errno(ret);
+		} else
+			mark_page_accessed(page);
+
+		unlock_page(page);
+		page_cache_release(page);
+		page = NULL;
+		offset = map_end;
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+static int ocfs2_make_clusters_writable(struct super_block *sb,
+					struct ocfs2_cow_context *context,
+					u32 cpos, u32 p_cluster,
+					u32 num_clusters, unsigned int e_flags)
+{
+	int ret, credits =  0;
+	u32 new_bit, new_len;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	handle_t *handle;
+
+	ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters,
+					     &context->di_et,
+					     context->ref_ci,
+					     context->ref_root_bh,
+					     &context->meta_ac,
+					     &context->data_ac, &credits);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	while (num_clusters) {
+		ret = __ocfs2_claim_clusters(osb, handle, context->data_ac,
+					     1, num_clusters,
+					     &new_bit, &new_len);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		ret = ocfs2_replace_clusters(handle, context,
+					     cpos, p_cluster, new_bit,
+					     new_len, e_flags);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		cpos += new_len;
+		p_cluster += new_len;
+		num_clusters -= new_len;
+	}
+
+	ret = __ocfs2_decrease_refcount(handle, context->ref_ci,
+					context->ref_root_bh,
+					p_cluster, num_clusters,
+					context->meta_ac,
+					&context->dealloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	/*
+	 * Here we should write the new page out first if we are
+	 * in write-back mode.
+	 */
+	ret = ocfs2_cow_sync_writeback(sb, context, cpos, num_clusters);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out:
+	if (context->data_ac) {
+		ocfs2_free_alloc_context(context->data_ac);
+		context->data_ac = NULL;
+	}
+	if (context->meta_ac) {
+		ocfs2_free_alloc_context(context->meta_ac);
+		context->meta_ac = NULL;
+	}
+
+	return ret;
+}
+
+static int ocfs2_replace_cow(struct inode *inode,
+			     struct buffer_head *di_bh,
+			     struct buffer_head *ref_root_bh,
+			     struct ocfs2_caching_info *ref_ci,
+			     u32 cow_start, u32 cow_len)
+{
+	int ret = 0;
+	u32 p_cluster, num_clusters, start = cow_start;
+	unsigned int ext_flags;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_cow_context *context;
+
+	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
+		ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
+			    "tree, but the feature bit is not set in the "
+			    "super block.", inode->i_ino);
+		return -EROFS;
+	}
+
+	context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
+	if (!context) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		return ret;
+	}
+
+	context->inode = inode;
+	context->cow_start = cow_start;
+	context->cow_len = cow_len;
+	context->ref_ci = ref_ci;
+	context->ref_root_bh = ref_root_bh;
+
+	ocfs2_init_dealloc_ctxt(&context->dealloc);
+	ocfs2_init_dinode_extent_tree(&context->di_et,
+				      INODE_CACHE(inode), di_bh);
+
+	while (cow_len) {
+		ret = ocfs2_get_clusters(inode, cow_start, &p_cluster,
+					 &num_clusters, &ext_flags);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		BUG_ON(!(ext_flags & OCFS2_EXT_REFCOUNTED));
+
+		if (cow_len < num_clusters)
+			num_clusters = cow_len;
+
+		ret = ocfs2_make_clusters_writable(inode->i_sb, context,
+						   cow_start, p_cluster,
+						   num_clusters, ext_flags);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		cow_len -= num_clusters;
+		cow_start += num_clusters;
+	}
+
+
+	/*
+	 * truncate the extent map here since no matter whether we meet with
+	 * any error during the action, we shouldn't trust cached extent map
+	 * any more.
+	 */
+	ocfs2_extent_map_trunc(inode, start);
+
+	if (ocfs2_dealloc_has_cluster(&context->dealloc)) {
+		ocfs2_schedule_truncate_log_flush(osb, 1);
+		ocfs2_run_deallocs(osb, &context->dealloc);
+	}
+
+	kfree(context);
+	return ret;
+}
+
+/*
+ * Starting at cpos, try to CoW write_len clusters.
+ * This will stop when it runs into a hole or an unrefcounted extent.
+ */
+static int ocfs2_refcount_cow_hunk(struct inode *inode,
+				   struct buffer_head *di_bh,
+				   u32 cpos, u32 write_len)
+{
+	int ret;
+	u32 cow_start = 0, cow_len = 0;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct buffer_head *ref_root_bh = NULL;
+	struct ocfs2_refcount_tree *ref_tree;
+
+	BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+
+	ret = ocfs2_refcount_cal_cow_clusters(inode, di_bh, cpos, write_len,
+					      &cow_start, &cow_len);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	mlog(0, "CoW inode %lu, cpos %u, write_len %u, cow_start %u, "
+	     "cow_len %u\n", inode->i_ino,
+	     cpos, write_len, cow_start, cow_len);
+
+	BUG_ON(cow_len == 0);
+
+	ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
+				       1, &ref_tree, &ref_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_replace_cow(inode, di_bh, ref_root_bh, &ref_tree->rf_ci,
+				cow_start, cow_len);
+	if (ret)
+		mlog_errno(ret);
+
+	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+	brelse(ref_root_bh);
+out:
+	return ret;
+}
+
+/*
+ * CoW any and all clusters between cpos and cpos+write_len.
+ * If this returns successfully, all clusters between cpos and
+ * cpos+write_len are safe to modify.
+ */
+int ocfs2_refcount_cow(struct inode *inode,
+		       struct buffer_head *di_bh,
+		       u32 cpos, u32 write_len)
+{
+	int ret = 0;
+	u32 p_cluster, num_clusters;
+	unsigned int ext_flags;
+
+	while (write_len) {
+		ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
+					 &num_clusters, &ext_flags);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		if (write_len < num_clusters)
+			num_clusters = write_len;
+
+		if (ext_flags & OCFS2_EXT_REFCOUNTED) {
+			ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos,
+						      num_clusters);
+			if (ret) {
+				mlog_errno(ret);
+				break;
+			}
+		}
+
+		write_len -= num_clusters;
+		cpos += num_clusters;
+	}
+
+	return ret;
+}
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index b8c9ed7..9960878 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -51,4 +51,6 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
 					  u32 clusters,
 					  int *credits,
 					  struct ocfs2_alloc_context **meta_ac);
+int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh,
+		       u32 cpos, u32 write_len);
 #endif /* OCFS2_REFCOUNTTREE_H */
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 19/41] ocfs2: Integrate CoW in file write.
  2009-08-21 23:42       ` Joel Becker
  2009-08-22  0:31         ` Tao Ma
@ 2009-08-24 15:06         ` Tao Ma
  2009-08-24 18:32           ` Joel Becker
  1 sibling, 1 reply; 75+ messages in thread
From: Tao Ma @ 2009-08-24 15:06 UTC (permalink / raw)
  To: ocfs2-devel

Hi Joel,
	This patch resolve the problem of directIO by CoW in
ocfs2_prepare_inode_for_write. please review.

Regards,
Tao

When we use mmap, we CoW the refcountd clusters in
ocfs2_write_begin_nolock. While for normal file
io(including directio), we do CoW in
ocfs2_prepare_inode_for_write.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/aops.c |   15 ++++++++++
 fs/ocfs2/file.c |   80 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 94 insertions(+), 1 deletions(-)

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 65e17b3..9329f8c 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -44,6 +44,7 @@
 #include "suballoc.h"
 #include "super.h"
 #include "symlink.h"
+#include "refcounttree.h"
 
 #include "buffer_head_io.h"
 
@@ -590,6 +591,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 		goto bail;
 	}
 
+	/* We should already CoW the refcounted extent. */
+	BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
 	/*
 	 * get_more_blocks() expects us to describe a hole by clearing
 	 * the mapped bit on bh_result().
@@ -1449,6 +1452,9 @@ static int ocfs2_populate_write_desc(struct inode *inode,
 				goto out;
 			}
 
+			/* We should already CoW the refcountd extent. */
+			BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
+
 			/*
 			 * Assume worst case - that we're writing in
 			 * the middle of the extent.
@@ -1700,6 +1706,15 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 		goto out;
 	}
 
+	if (ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
+		ret = ocfs2_refcount_cow(inode, di_bh,
+					 wc->w_cpos, wc->w_clen);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
 	ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
 					&extents_to_split);
 	if (ret) {
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 4921b4e..46c256d 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -59,6 +59,7 @@
 #include "xattr.h"
 #include "acl.h"
 #include "quota.h"
+#include "refcounttree.h"
 
 #include "buffer_head_io.h"
 
@@ -1656,6 +1657,66 @@ static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset,
 					 OCFS2_IOC_RESVSP64, &sr, change_size);
 }
 
+static int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
+					  size_t count)
+{
+	int ret = 0;
+	unsigned int extent_flags;
+	u32 cpos, clusters, extent_len, phys_cpos;
+	struct super_block *sb = inode->i_sb;
+
+	cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
+	clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
+
+	while (clusters) {
+		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
+					 &extent_flags);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) {
+			ret = 1;
+			break;
+		}
+
+		if (extent_len > clusters)
+			extent_len = clusters;
+
+		clusters -= extent_len;
+		cpos += extent_len;
+	}
+out:
+	return ret;
+}
+
+static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
+					    loff_t pos, size_t count,
+					    int *meta_level)
+{
+	int ret;
+	struct buffer_head *di_bh = NULL;
+	u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+	u32 clusters =
+		ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
+
+	ret = ocfs2_inode_lock(inode, &di_bh, 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	*meta_level = 1;
+
+	ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters);
+	if (ret)
+		mlog_errno(ret);
+out:
+	brelse(di_bh);
+	return ret;
+}
+
 static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 					 loff_t *ppos,
 					 size_t count,
@@ -1712,6 +1773,22 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 
 		end = saved_pos + count;
 
+		ret = ocfs2_check_range_for_refcount(inode, saved_pos, count);
+		if (ret == 1) {
+			ocfs2_inode_unlock(inode, meta_level);
+			meta_level = -1;
+
+			ret = ocfs2_prepare_inode_for_refcount(inode,
+							       saved_pos,
+							       count,
+							       &meta_level);
+		}
+
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_unlock;
+		}
+
 		/*
 		 * Skip the O_DIRECT checks if we don't need
 		 * them.
@@ -1758,7 +1835,8 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 		*ppos = saved_pos;
 
 out_unlock:
-	ocfs2_inode_unlock(inode, meta_level);
+	if (meta_level >= 0)
+		ocfs2_inode_unlock(inode, meta_level);
 
 out:
 	return ret;
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 17/41] ocfs2: Add CoW support.
  2009-08-24 15:04                     ` Tao Ma
@ 2009-08-24 18:20                       ` Joel Becker
  2009-08-25 19:30                       ` Joel Becker
  1 sibling, 0 replies; 75+ messages in thread
From: Joel Becker @ 2009-08-24 18:20 UTC (permalink / raw)
  To: ocfs2-devel

On Mon, Aug 24, 2009 at 11:04:12PM +0800, Tao Ma wrote:
> This patch try CoW support for a refcounted record.
> 
> the whole process will be:
> 1. Calculate how much clusters we need to CoW and where we start.
>    Currently we will CoW 1MB at most.
> 2. Do CoW for the clusters with the help of page cache.
> 3. Change the b-tree structure with the new allocated clusters.

	Looks good except for the commit header.  (1) is really
"Calculate how many clusters we need to CoW and where we start.  Extents
that are not completely encompassed by the write will be broken on 1MB
boundaries" or something like it.

Joel

-- 

Life's Little Instruction Book #69

	"Whistle"

Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127

^ permalink raw reply	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 19/41] ocfs2: Integrate CoW in file write.
  2009-08-24 15:06         ` Tao Ma
@ 2009-08-24 18:32           ` Joel Becker
  2009-08-25  0:12             ` [Ocfs2-devel] [PATCH 19/41] ocfs2: Integrate CoW in file write(add refcount check) Tao Ma
  0 siblings, 1 reply; 75+ messages in thread
From: Joel Becker @ 2009-08-24 18:32 UTC (permalink / raw)
  To: ocfs2-devel

On Mon, Aug 24, 2009 at 11:06:39PM +0800, Tao Ma wrote:
> @@ -1449,6 +1452,9 @@ static int ocfs2_populate_write_desc(struct inode *inode,
>  				goto out;
>  			}
>  
> +			/* We should already CoW the refcountd extent. */
> +			BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
> +
>  			/*
>  			 * Assume worst case - that we're writing in
>  			 * the middle of the extent.

	What is this patch against?  I don't see the removal of the
refcounted_cpos lines.

> +	if (ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
> +		ret = ocfs2_refcount_cow(inode, di_bh,
> +					 wc->w_cpos, wc->w_clen);
> +		if (ret) {
> +			mlog_errno(ret);
> +			goto out;
> +		}
> +	}

	Probably can check (ocfs2_refcount_tree() &&
(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FLAG)).  Why walk
the tree here if we don't need to? :-)  Actually, why not use
ocfs2_check_range_for_refcount()?

> +static int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
> +					  size_t count)
> +{
> +	int ret = 0;
> +	unsigned int extent_flags;
> +	u32 cpos, clusters, extent_len, phys_cpos;
> +	struct super_block *sb = inode->i_sb;

	And here, check ocfs2_refcount_tree(OCFS2_SB(inode->i_sb) &&
OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL), returning 0 if
not set.

> @@ -1712,6 +1773,22 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
>  
>  		end = saved_pos + count;
>  
> +		ret = ocfs2_check_range_for_refcount(inode, saved_pos, count);
> +		if (ret == 1) {
> +			ocfs2_inode_unlock(inode, meta_level);
> +			meta_level = -1;
> +
> +			ret = ocfs2_prepare_inode_for_refcount(inode,
> +							       saved_pos,
> +							       count,
> +							       &meta_level);
> +		}

	Because here you didn't check for refcount support.  If
ocfs2_check_range_for_refcount() has that check, then both places
benefit.

Joel

-- 

"There are only two ways to live your life. One is as though nothing
 is a miracle. The other is as though everything is a miracle."
        - Albert Einstein

Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127

^ permalink raw reply	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4
  2009-08-21  1:24 ` [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Joel Becker
  2009-08-21  1:39   ` Tao Ma
@ 2009-08-24 23:11   ` TaoMa
  1 sibling, 0 replies; 75+ messages in thread
From: TaoMa @ 2009-08-24 23:11 UTC (permalink / raw)
  To: ocfs2-devel

Hi Joel,
Joel Becker wrote:
> On Tue, Aug 18, 2009 at 02:19:11PM +0800, Tao Ma wrote:
>   
>> Hi all,
>>     So I have finally finished the v4 of reflink for ocfs2. Great
>> thanks for Joel's review.
>>     
>
> 	Ok, I'm done with the review.  I basically skimmed or skipped
> all the patches I already liked.  You didn't change anything outside of
> responding to reviews of v3, right?
> 	We're in great shape.  Hopefully you'll be able to answer my
> questions and comments tomorrow and we can get this sucker into
> merge-window Monday or Tuesday!
>   
I have sent updated 17 and 19 patches. Please review.

btw I have also done some minor change(add the comments and add an error 
checker) etc and push the latest one into
my git tree in git://oss.oracle.com/git/tma/linux-2.6.git refcount.
So do I need to re-send the whole patch set or you can just pull from my 
tree?

Regards,
Tao

^ permalink raw reply	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 19/41] ocfs2: Integrate CoW in file write(add refcount check).
  2009-08-24 18:32           ` Joel Becker
@ 2009-08-25  0:12             ` Tao Ma
  0 siblings, 0 replies; 75+ messages in thread
From: Tao Ma @ 2009-08-25  0:12 UTC (permalink / raw)
  To: ocfs2-devel

changelog:
1. add refcount check in ocfs2_check_range_for_refcount.
2. export ocfs2_check_range_for_refcount and use it in
   ocfs2_write_begin_no_lock.

When we use mmap, we CoW the refcountd clusters in
ocfs2_write_begin_nolock. While for normal file
io(including directio), we do CoW in
ocfs2_prepare_inode_for_write.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
---
 fs/ocfs2/aops.c |   19 ++++++++++++
 fs/ocfs2/file.c |   84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/ocfs2/file.h |    2 +
 3 files changed, 104 insertions(+), 1 deletions(-)

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 65e17b3..0bc1942 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -44,6 +44,7 @@
 #include "suballoc.h"
 #include "super.h"
 #include "symlink.h"
+#include "refcounttree.h"
 
 #include "buffer_head_io.h"
 
@@ -590,6 +591,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 		goto bail;
 	}
 
+	/* We should already CoW the refcounted extent. */
+	BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
 	/*
 	 * get_more_blocks() expects us to describe a hole by clearing
 	 * the mapped bit on bh_result().
@@ -1449,6 +1452,9 @@ static int ocfs2_populate_write_desc(struct inode *inode,
 				goto out;
 			}
 
+			/* We should already CoW the refcountd extent. */
+			BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
+
 			/*
 			 * Assume worst case - that we're writing in
 			 * the middle of the extent.
@@ -1700,6 +1706,19 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 		goto out;
 	}
 
+	ret = ocfs2_check_range_for_refcount(inode, pos, len);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	} else if (ret == 1) {
+		ret = ocfs2_refcount_cow(inode, di_bh,
+					 wc->w_cpos, wc->w_clen);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
 	ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
 					&extents_to_split);
 	if (ret) {
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 4921b4e..6ee20e8 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -59,6 +59,7 @@
 #include "xattr.h"
 #include "acl.h"
 #include "quota.h"
+#include "refcounttree.h"
 
 #include "buffer_head_io.h"
 
@@ -1656,6 +1657,70 @@ static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset,
 					 OCFS2_IOC_RESVSP64, &sr, change_size);
 }
 
+int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
+				   size_t count)
+{
+	int ret = 0;
+	unsigned int extent_flags;
+	u32 cpos, clusters, extent_len, phys_cpos;
+	struct super_block *sb = inode->i_sb;
+
+	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) ||
+	    !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL))
+		return 0;
+
+	cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
+	clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
+
+	while (clusters) {
+		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
+					 &extent_flags);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) {
+			ret = 1;
+			break;
+		}
+
+		if (extent_len > clusters)
+			extent_len = clusters;
+
+		clusters -= extent_len;
+		cpos += extent_len;
+	}
+out:
+	return ret;
+}
+
+static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
+					    loff_t pos, size_t count,
+					    int *meta_level)
+{
+	int ret;
+	struct buffer_head *di_bh = NULL;
+	u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+	u32 clusters =
+		ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
+
+	ret = ocfs2_inode_lock(inode, &di_bh, 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	*meta_level = 1;
+
+	ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters);
+	if (ret)
+		mlog_errno(ret);
+out:
+	brelse(di_bh);
+	return ret;
+}
+
 static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 					 loff_t *ppos,
 					 size_t count,
@@ -1712,6 +1777,22 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 
 		end = saved_pos + count;
 
+		ret = ocfs2_check_range_for_refcount(inode, saved_pos, count);
+		if (ret == 1) {
+			ocfs2_inode_unlock(inode, meta_level);
+			meta_level = -1;
+
+			ret = ocfs2_prepare_inode_for_refcount(inode,
+							       saved_pos,
+							       count,
+							       &meta_level);
+		}
+
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_unlock;
+		}
+
 		/*
 		 * Skip the O_DIRECT checks if we don't need
 		 * them.
@@ -1758,7 +1839,8 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 		*ppos = saved_pos;
 
 out_unlock:
-	ocfs2_inode_unlock(inode, meta_level);
+	if (meta_level >= 0)
+		ocfs2_inode_unlock(inode, meta_level);
 
 out:
 	return ret;
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 172f9fb..d66cf4f 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -69,4 +69,6 @@ int ocfs2_update_inode_atime(struct inode *inode,
 int ocfs2_change_file_space(struct file *file, unsigned int cmd,
 			    struct ocfs2_space_resv *sr);
 
+int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
+				   size_t count);
 #endif /* OCFS2_FILE_H */
-- 
1.6.2.rc2.16.gf474c

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 17/41] ocfs2: Add CoW support.
  2009-08-24 15:04                     ` Tao Ma
  2009-08-24 18:20                       ` Joel Becker
@ 2009-08-25 19:30                       ` Joel Becker
  2009-08-26  8:17                         ` TaoMa
  1 sibling, 1 reply; 75+ messages in thread
From: Joel Becker @ 2009-08-25 19:30 UTC (permalink / raw)
  To: ocfs2-devel

On Mon, Aug 24, 2009 at 11:04:12PM +0800, Tao Ma wrote:
> This patch try CoW support for a refcounted record.

	Overall the series looks good.  It needs to be rebased against
the latest merge-window branch.
	I was wondering, though.  In ocfs2_refcount_cow(), should we
make sure that write_len is sane wrt max_cpos?  You added that (correct)
check for max_cpos in cal_cow_clusters().  but ocfs2_refcount_cow() is
defined as ensuring cpos+write_len is safe.
	Now, I think this is correct in the code.  The truncate CoW
passes a write_len of 1 and a max_cpos of cpos+1.  So perhaps at the top
of ocfs2_refcount_cal_cow_clusters() we should BUG_ON(cpos+write_len >
max_cpos)?  Does that sound correct to you?
	If yes, add it.  If no, don't.  Rebase against merge-window and
I'll pull later tonight or in the morning (my time) depending on when I
see it.
	It's ready to go, thank you for all the hard work!

Joel

-- 

Life's Little Instruction Book #510

	"Count your blessings."

Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127

^ permalink raw reply	[flat|nested] 75+ messages in thread

* [Ocfs2-devel] [PATCH 17/41] ocfs2: Add CoW support.
  2009-08-25 19:30                       ` Joel Becker
@ 2009-08-26  8:17                         ` TaoMa
  0 siblings, 0 replies; 75+ messages in thread
From: TaoMa @ 2009-08-26  8:17 UTC (permalink / raw)
  To: ocfs2-devel

Joel Becker wrote:
> On Mon, Aug 24, 2009 at 11:04:12PM +0800, Tao Ma wrote:
>   
>> This patch try CoW support for a refcounted record.
>>     
>
> 	Overall the series looks good.  It needs to be rebased against
> the latest merge-window branch.
> 	I was wondering, though.  In ocfs2_refcount_cow(), should we
> make sure that write_len is sane wrt max_cpos?  You added that (correct)
> check for max_cpos in cal_cow_clusters().  but ocfs2_refcount_cow() is
> defined as ensuring cpos+write_len is safe.
> 	Now, I think this is correct in the code.  The truncate CoW
> passes a write_len of 1 and a max_cpos of cpos+1.  So perhaps at the top
> of ocfs2_refcount_cal_cow_clusters() we should BUG_ON(cpos+write_len >
> max_cpos)?  Does that sound correct to you?
>   
should be. Although our current code handle this gracefully, we can't 
tell whether future caller forget this basic condition.  So I have no 
objection to it. I will add it later.
> 	If yes, add it.  If no, don't.  Rebase against merge-window and
> I'll pull later tonight or in the morning (my time) depending on when I
> see it.
>   
Sure. thanks.

Regards,
Tao

^ permalink raw reply	[flat|nested] 75+ messages in thread

end of thread, other threads:[~2009-08-26  8:17 UTC | newest]

Thread overview: 75+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-08-18  6:19 [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 01/41] ocfs2: Define refcount tree structure Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 02/41] ocfs2: Add metaecc for ocfs2_refcount_block Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 03/41] ocfs2: Add ocfs2_read_refcount_block Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 04/41] ocfs2: Abstract caching info checkpoint Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 05/41] ocfs2: Add new refcount tree lock resource in dlmglue Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 06/41] ocfs2: Add caching info for refcount tree Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 07/41] ocfs2: Add refcount tree lock mechanism Tao Ma
2009-08-19 23:25   ` Joel Becker
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 08/41] ocfs2: Basic tree root operation Tao Ma
2009-08-19 23:30   ` Joel Becker
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 09/41] ocfs2: Wrap ocfs2_extent_contig in ocfs2_extent_tree Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 10/41] ocfs2: Abstract extent split process Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 11/41] ocfs2: Add refcount b-tree as a new extent tree Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 12/41] ocfs2: move tree path functions to alloc.h Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 13/41] ocfs2: Add support for incrementing refcount in the tree Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 14/41] ocfs2: Add support of decrementing refcount for delete Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 15/41] ocfs2: Add functions for extents refcounted Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 16/41] ocfs2: Decrement refcount when truncating refcounted extents Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 17/41] ocfs2: Add CoW support Tao Ma
2009-08-21  0:59   ` Joel Becker
2009-08-21  2:04     ` Tao Ma
2009-08-21  2:51       ` Joel Becker
2009-08-21  3:04         ` Tao Ma
2009-08-21  7:10           ` Joel Becker
2009-08-21  3:55         ` Joel Becker
2009-08-21  6:25           ` Tao Ma
2009-08-21  7:07             ` Joel Becker
2009-08-21  8:24               ` Tao Ma
2009-08-21 18:39                 ` Joel Becker
2009-08-21 20:58                   ` Joel Becker
2009-08-24 15:04                     ` Tao Ma
2009-08-24 18:20                       ` Joel Becker
2009-08-25 19:30                       ` Joel Becker
2009-08-26  8:17                         ` TaoMa
2009-08-21 23:07                   ` Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 18/41] ocfs2: CoW refcount tree improvement Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 19/41] ocfs2: Integrate CoW in file write Tao Ma
2009-08-21  1:04   ` Joel Becker
2009-08-21  2:12     ` Tao Ma
2009-08-21 14:55       ` Tao Ma
2009-08-21 20:43         ` Joel Becker
2009-08-21 21:12   ` Joel Becker
2009-08-21 23:17     ` Tao Ma
2009-08-21 23:42       ` Joel Becker
2009-08-22  0:31         ` Tao Ma
2009-08-24 15:06         ` Tao Ma
2009-08-24 18:32           ` Joel Becker
2009-08-25  0:12             ` [Ocfs2-devel] [PATCH 19/41] ocfs2: Integrate CoW in file write(add refcount check) Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 20/41] ocfs2: CoW a reflinked cluster when it is truncated Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 21/41] ocfs2: Add normal functions for reflink a normal file's extents Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 22/41] ocfs2: handle file attributes issue for reflink Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 23/41] ocfs2: Return extent flags for xattr value tree Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 24/41] ocfs2: Abstract duplicate clusters process in CoW Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 25/41] ocfs2: Add CoW support for xattr Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 26/41] ocfs2: Remove inode from ocfs2_xattr_bucket_get_name_value Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 27/41] ocfs2: Abstract the creation of xattr block Tao Ma
2009-08-21  1:22   ` Joel Becker
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 28/41] ocfs2: Abstract ocfs2 xattr tree extend rec iteration process Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 29/41] ocfs2: Attach xattr clusters to refcount tree Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 30/41] ocfs2: Call refcount tree remove process properly Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 31/41] ocfs2: Create an xattr indexed block if needed Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 32/41] ocfs2: Add reflink support for xattr Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 33/41] ocfs2: Modify removing xattr process for refcount Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 34/41] ocfs2: Don't merge in 1st refcount ops of reflink Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 35/41] ocfs2: Make transaction extend more efficient Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 36/41] ocfs2: Use proper parameter for some inode operation Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 37/41] ocfs2: Create reflinked file in orphan dir Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 38/41] ocfs2: Add preserve to reflink Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 39/41] ocfs2: Implement ocfs2_reflink Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 40/41] ocfs2: Enable refcount tree support Tao Ma
2009-08-18  6:19 ` [Ocfs2-devel] [PATCH 41/41] ocfs2: Add ioctl for reflink Tao Ma
2009-08-21  1:24 ` [Ocfs2-devel] [PATCH 00/41] ocfs2: Add reflink file support. V4 Joel Becker
2009-08-21  1:39   ` Tao Ma
2009-08-24 23:11   ` TaoMa

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.