* [Ocfs2-devel] [PATCH 6/6] ocfs2: implement the VFS clone_range, copy_range, and dedupe_range features
@ 2016-11-09 22:51 ` Darrick J. Wong
0 siblings, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2016-11-09 22:51 UTC (permalink / raw)
To: mfasheh, jlbec, darrick.wong; +Cc: linux-fsdevel, ocfs2-devel
Connect the new VFS clone_range, copy_range, and dedupe_range features
to the existing reflink capability of ocfs2. Compared to the existing
ocfs2 reflink ioctl, we have to do things a little differently to support
the VFS semantics (we can clone subranges of a file but we don't clone
xattrs), but the VFS ioctls are more broadly supported.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
fs/ocfs2/file.c | 62 ++++-
fs/ocfs2/file.h | 3
fs/ocfs2/refcounttree.c | 619 +++++++++++++++++++++++++++++++++++++++++++++++
fs/ocfs2/refcounttree.h | 7 +
4 files changed, 688 insertions(+), 3 deletions(-)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 000c234..d5a022d 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1667,9 +1667,9 @@ static void ocfs2_calc_trunc_pos(struct inode *inode,
*done = ret;
}
-static int ocfs2_remove_inode_range(struct inode *inode,
- struct buffer_head *di_bh, u64 byte_start,
- u64 byte_len)
+int ocfs2_remove_inode_range(struct inode *inode,
+ struct buffer_head *di_bh, u64 byte_start,
+ u64 byte_len)
{
int ret = 0, flags = 0, done = 0, i;
u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
@@ -2440,6 +2440,56 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
return offset;
}
+static ssize_t ocfs2_file_copy_range(struct file *file_in,
+ loff_t pos_in,
+ struct file *file_out,
+ loff_t pos_out,
+ size_t len,
+ unsigned int flags)
+{
+ int error;
+
+ error = ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
+ len, false);
+ if (error)
+ return error;
+ return len;
+}
+
+static int ocfs2_file_clone_range(struct file *file_in,
+ loff_t pos_in,
+ struct file *file_out,
+ loff_t pos_out,
+ u64 len)
+{
+ return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
+ len, false);
+}
+
+#define OCFS2_MAX_DEDUPE_LEN (16 * 1024 * 1024)
+static ssize_t ocfs2_file_dedupe_range(struct file *src_file,
+ u64 loff,
+ u64 len,
+ struct file *dst_file,
+ u64 dst_loff)
+{
+ int error;
+
+ /*
+ * Limit the total length we will dedupe for each operation.
+ * This is intended to bound the total time spent in this
+ * ioctl to something sane.
+ */
+ if (len > OCFS2_MAX_DEDUPE_LEN)
+ len = OCFS2_MAX_DEDUPE_LEN;
+
+ error = ocfs2_reflink_remap_range(src_file, loff, dst_file, dst_loff,
+ len, true);
+ if (error)
+ return error;
+ return len;
+}
+
const struct inode_operations ocfs2_file_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
@@ -2479,6 +2529,9 @@ const struct file_operations ocfs2_fops = {
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = ocfs2_fallocate,
+ .copy_file_range = ocfs2_file_copy_range,
+ .clone_file_range = ocfs2_file_clone_range,
+ .dedupe_file_range = ocfs2_file_dedupe_range,
};
const struct file_operations ocfs2_dops = {
@@ -2524,6 +2577,9 @@ const struct file_operations ocfs2_fops_no_plocks = {
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = ocfs2_fallocate,
+ .copy_file_range = ocfs2_file_copy_range,
+ .clone_file_range = ocfs2_file_clone_range,
+ .dedupe_file_range = ocfs2_file_dedupe_range,
};
const struct file_operations ocfs2_dops_no_plocks = {
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index e8c62f2..897fd9a 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -82,4 +82,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
size_t count);
+int ocfs2_remove_inode_range(struct inode *inode,
+ struct buffer_head *di_bh, u64 byte_start,
+ u64 byte_len);
#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index d92b6c6..3e2198c 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -34,6 +34,7 @@
#include "xattr.h"
#include "namei.h"
#include "ocfs2_trace.h"
+#include "file.h"
#include <linux/bio.h>
#include <linux/blkdev.h>
@@ -4447,3 +4448,621 @@ int ocfs2_reflink_ioctl(struct inode *inode,
return error;
}
+
+/* Update destination inode size, if necessary. */
+static int ocfs2_reflink_update_dest(struct inode *dest,
+ struct buffer_head *d_bh,
+ loff_t newlen)
+{
+ handle_t *handle;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)d_bh->b_data;
+ int ret;
+
+ if (newlen <= i_size_read(dest))
+ return 0;
+
+ handle = ocfs2_start_trans(OCFS2_SB(dest->i_sb),
+ OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ return ret;
+ }
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(dest), d_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ spin_lock(&OCFS2_I(dest)->ip_lock);
+ if (newlen > i_size_read(dest)) {
+ i_size_write(dest, newlen);
+ di->i_size = newlen;
+ }
+ spin_unlock(&OCFS2_I(dest)->ip_lock);
+
+ ocfs2_journal_dirty(handle, d_bh);
+
+out_commit:
+ ocfs2_commit_trans(OCFS2_SB(dest->i_sb), handle);
+ return ret;
+}
+
+/* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */
+static int ocfs2_reflink_remap_extent(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ loff_t pos_in,
+ struct inode *t_inode,
+ struct buffer_head *t_bh,
+ loff_t pos_out,
+ loff_t len,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ struct ocfs2_extent_tree s_et;
+ struct ocfs2_extent_tree t_et;
+ struct ocfs2_dinode *dis;
+ struct buffer_head *ref_root_bh = NULL;
+ struct ocfs2_refcount_tree *ref_tree;
+ struct ocfs2_super *osb;
+ loff_t pstart, plen;
+ u32 p_cluster, num_clusters, slast, spos, tpos;
+ unsigned int ext_flags;
+ int ret = 0;
+
+ osb = OCFS2_SB(s_inode->i_sb);
+ dis = (struct ocfs2_dinode *)s_bh->b_data;
+ ocfs2_init_dinode_extent_tree(&s_et, INODE_CACHE(s_inode), s_bh);
+ ocfs2_init_dinode_extent_tree(&t_et, INODE_CACHE(t_inode), t_bh);
+
+ spos = ocfs2_bytes_to_clusters(s_inode->i_sb, pos_in);
+ tpos = ocfs2_bytes_to_clusters(t_inode->i_sb, pos_out);
+ slast = ocfs2_clusters_for_bytes(s_inode->i_sb, pos_in + len);
+
+ while (spos < slast) {
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ goto out;
+ }
+
+ /* Look up the extent. */
+ ret = ocfs2_get_clusters(s_inode, spos, &p_cluster,
+ &num_clusters, &ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ num_clusters = min_t(u32, num_clusters, slast - spos);
+
+ /* Punch out the dest range. */
+ pstart = ocfs2_clusters_to_bytes(t_inode->i_sb, tpos);
+ plen = ocfs2_clusters_to_bytes(t_inode->i_sb, num_clusters);
+ ret = ocfs2_remove_inode_range(t_inode, t_bh, pstart, plen);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (p_cluster == 0)
+ goto next_loop;
+
+ /* Lock the refcount btree... */
+ ret = ocfs2_lock_refcount_tree(osb,
+ le64_to_cpu(dis->i_refcount_loc),
+ 1, &ref_tree, &ref_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* Mark s_inode's extent as refcounted. */
+ if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) {
+ ret = ocfs2_add_refcount_flag(s_inode, &s_et,
+ &ref_tree->rf_ci,
+ ref_root_bh, spos,
+ p_cluster, num_clusters,
+ dealloc, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock_refcount;
+ }
+ }
+
+ /* Map in the new extent. */
+ ext_flags |= OCFS2_EXT_REFCOUNTED;
+ ret = ocfs2_add_refcounted_extent(t_inode, &t_et,
+ &ref_tree->rf_ci,
+ ref_root_bh,
+ tpos, p_cluster,
+ num_clusters,
+ ext_flags,
+ dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock_refcount;
+ }
+
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+ brelse(ref_root_bh);
+next_loop:
+ spos += num_clusters;
+ tpos += num_clusters;
+ }
+
+out:
+ return ret;
+out_unlock_refcount:
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+ brelse(ref_root_bh);
+ return ret;
+}
+
+/* Set up refcount tree and remap s_inode to t_inode. */
+static int ocfs2_reflink_remap_blocks(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ loff_t pos_in,
+ struct inode *t_inode,
+ struct buffer_head *t_bh,
+ loff_t pos_out,
+ loff_t len)
+{
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+ struct ocfs2_super *osb;
+ struct ocfs2_dinode *dis;
+ struct ocfs2_dinode *dit;
+ int ret;
+
+ osb = OCFS2_SB(s_inode->i_sb);
+ dis = (struct ocfs2_dinode *)s_bh->b_data;
+ dit = (struct ocfs2_dinode *)t_bh->b_data;
+ ocfs2_init_dealloc_ctxt(&dealloc);
+
+ /*
+ * If both inodes belong to two different refcount groups then
+ * forget it because we don't know how (or want) to go merging
+ * refcount trees.
+ */
+ ret = -EOPNOTSUPP;
+ if (ocfs2_is_refcount_inode(s_inode) &&
+ ocfs2_is_refcount_inode(t_inode) &&
+ le64_to_cpu(dis->i_refcount_loc) !=
+ le64_to_cpu(dit->i_refcount_loc))
+ goto out;
+
+ /* Neither inode has a refcount tree. Add one to s_inode. */
+ if (!ocfs2_is_refcount_inode(s_inode) &&
+ !ocfs2_is_refcount_inode(t_inode)) {
+ ret = ocfs2_create_refcount_tree(s_inode, s_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ /* Ensure that both inodes end up with the same refcount tree. */
+ if (!ocfs2_is_refcount_inode(s_inode)) {
+ ret = ocfs2_set_refcount_tree(s_inode, s_bh,
+ le64_to_cpu(dit->i_refcount_loc));
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+ if (!ocfs2_is_refcount_inode(t_inode)) {
+ ret = ocfs2_set_refcount_tree(t_inode, t_bh,
+ le64_to_cpu(dis->i_refcount_loc));
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ /*
+ * If we're reflinking the entire file and the source is inline
+ * data, just copy the contents.
+ */
+ if (pos_in == pos_out && pos_in == 0 && len == i_size_read(s_inode) &&
+ i_size_read(t_inode) <= len &&
+ (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) {
+ ret = ocfs2_duplicate_inline_data(s_inode, s_bh, t_inode, t_bh);
+ if (ret)
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh,
+ pos_out, len, &dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+out:
+ if (ocfs2_dealloc_has_cluster(&dealloc)) {
+ ocfs2_schedule_truncate_log_flush(osb, 1);
+ ocfs2_run_deallocs(osb, &dealloc);
+ }
+
+ return ret;
+}
+
+/* Lock both inodes and grab a bh pointing to each inode. */
+static int ocfs2_reflink_inodes_lock(struct inode *s_inode,
+ struct buffer_head **bh1,
+ struct inode *t_inode,
+ struct buffer_head **bh2)
+{
+ struct inode *inode1;
+ struct inode *inode2;
+ struct ocfs2_inode_info *oi1;
+ struct ocfs2_inode_info *oi2;
+ bool same_inode = (s_inode == t_inode);
+ int status;
+
+ /* First grab the VFS and rw locks. */
+ inode1 = s_inode;
+ inode2 = t_inode;
+ if (inode1->i_ino > inode2->i_ino)
+ swap(inode1, inode2);
+
+ inode_lock(inode1);
+ status = ocfs2_rw_lock(inode1, 1);
+ if (status) {
+ mlog_errno(status);
+ goto out_i1;
+ }
+ if (!same_inode) {
+ inode_lock_nested(inode2, I_MUTEX_CHILD);
+ status = ocfs2_rw_lock(inode2, 1);
+ if (status) {
+ mlog_errno(status);
+ goto out_i2;
+ }
+ }
+
+ /* Now go for the cluster locks */
+ oi1 = OCFS2_I(inode1);
+ oi2 = OCFS2_I(inode2);
+
+ trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
+ (unsigned long long)oi2->ip_blkno);
+
+ if (*bh1)
+ *bh1 = NULL;
+ if (*bh2)
+ *bh2 = NULL;
+
+ /* We always want to lock the one with the lower lockid first. */
+ if (oi1->ip_blkno > oi2->ip_blkno)
+ mlog_errno(-ENOLCK);
+
+ /* lock id1 */
+ status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_REFLINK_TARGET);
+ if (status < 0) {
+ if (status != -ENOENT)
+ mlog_errno(status);
+ goto out_rw2;
+ }
+
+ /* lock id2 */
+ if (!same_inode) {
+ status = ocfs2_inode_lock_nested(inode2, bh2, 1,
+ OI_LS_REFLINK_TARGET);
+ if (status < 0) {
+ if (status != -ENOENT)
+ mlog_errno(status);
+ goto out_cl1;
+ }
+ } else
+ *bh2 = *bh1;
+
+ trace_ocfs2_double_lock_end(
+ (unsigned long long)OCFS2_I(inode1)->ip_blkno,
+ (unsigned long long)OCFS2_I(inode2)->ip_blkno);
+
+ return 0;
+
+out_cl1:
+ ocfs2_inode_unlock(inode1, 1);
+ brelse(*bh1);
+ *bh1 = NULL;
+out_rw2:
+ ocfs2_rw_unlock(inode2, 1);
+out_i2:
+ inode_unlock(inode2);
+ ocfs2_rw_unlock(inode1, 1);
+out_i1:
+ inode_unlock(inode1);
+ return status;
+}
+
+/* Unlock both inodes and release buffers. */
+static void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ struct inode *t_inode,
+ struct buffer_head *t_bh)
+{
+ ocfs2_inode_unlock(s_inode, 1);
+ ocfs2_rw_unlock(s_inode, 1);
+ inode_unlock(s_inode);
+ brelse(s_bh);
+
+ if (s_inode == t_inode)
+ return;
+
+ ocfs2_inode_unlock(t_inode, 1);
+ ocfs2_rw_unlock(t_inode, 1);
+ inode_unlock(t_inode);
+ brelse(t_bh);
+}
+
+/*
+ * Read a page's worth of file data into the page cache. Return the page
+ * locked.
+ */
+static struct page *ocfs2_reflink_get_page(struct inode *inode,
+ loff_t offset)
+{
+ struct address_space *mapping;
+ struct page *page;
+ pgoff_t n;
+
+ n = offset >> PAGE_SHIFT;
+ mapping = inode->i_mapping;
+ page = read_mapping_page(mapping, n, NULL);
+ if (IS_ERR(page))
+ return page;
+ if (!PageUptodate(page)) {
+ put_page(page);
+ return ERR_PTR(-EIO);
+ }
+ lock_page(page);
+ return page;
+}
+
+/*
+ * Compare extents of two files to see if they are the same.
+ */
+static int ocfs2_reflink_compare_extents(struct inode *src,
+ loff_t srcoff,
+ struct inode *dest,
+ loff_t destoff,
+ loff_t len,
+ bool *is_same)
+{
+ loff_t src_poff;
+ loff_t dest_poff;
+ void *src_addr;
+ void *dest_addr;
+ struct page *src_page;
+ struct page *dest_page;
+ loff_t cmp_len;
+ bool same;
+ int error;
+
+ error = -EINVAL;
+ same = true;
+ while (len) {
+ src_poff = srcoff & (PAGE_SIZE - 1);
+ dest_poff = destoff & (PAGE_SIZE - 1);
+ cmp_len = min(PAGE_SIZE - src_poff,
+ PAGE_SIZE - dest_poff);
+ cmp_len = min(cmp_len, len);
+ if (cmp_len <= 0) {
+ mlog_errno(-EUCLEAN);
+ goto out_error;
+ }
+
+ src_page = ocfs2_reflink_get_page(src, srcoff);
+ if (IS_ERR(src_page)) {
+ error = PTR_ERR(src_page);
+ goto out_error;
+ }
+ dest_page = ocfs2_reflink_get_page(dest, destoff);
+ if (IS_ERR(dest_page)) {
+ error = PTR_ERR(dest_page);
+ unlock_page(src_page);
+ put_page(src_page);
+ goto out_error;
+ }
+ src_addr = kmap_atomic(src_page);
+ dest_addr = kmap_atomic(dest_page);
+
+ flush_dcache_page(src_page);
+ flush_dcache_page(dest_page);
+
+ if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
+ same = false;
+
+ kunmap_atomic(dest_addr);
+ kunmap_atomic(src_addr);
+ unlock_page(dest_page);
+ unlock_page(src_page);
+ put_page(dest_page);
+ put_page(src_page);
+
+ if (!same)
+ break;
+
+ srcoff += cmp_len;
+ destoff += cmp_len;
+ len -= cmp_len;
+ }
+
+ *is_same = same;
+ return 0;
+
+out_error:
+ return error;
+}
+
+/* Link a range of blocks from one file to another. */
+int ocfs2_reflink_remap_range(struct file *file_in,
+ loff_t pos_in,
+ struct file *file_out,
+ loff_t pos_out,
+ u64 len,
+ bool is_dedupe)
+{
+ struct inode *inode_in = file_inode(file_in);
+ struct inode *inode_out = file_inode(file_out);
+ struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
+ struct buffer_head *in_bh = NULL, *out_bh = NULL;
+ loff_t bs = 1 << OCFS2_SB(inode_in->i_sb)->s_clustersize_bits;
+ bool same_inode = (inode_in == inode_out);
+ bool is_same = false;
+ loff_t isize;
+ ssize_t ret;
+ loff_t blen;
+
+ if (!ocfs2_refcount_tree(osb))
+ return -EOPNOTSUPP;
+ if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+ return -EROFS;
+
+ /* Lock both files against IO */
+ ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh);
+ if (ret)
+ return ret;
+
+ ret = -EINVAL;
+ if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
+ (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
+ goto out_unlock;
+
+ /* Don't touch certain kinds of inodes */
+ ret = -EPERM;
+ if (IS_IMMUTABLE(inode_out))
+ goto out_unlock;
+
+ ret = -ETXTBSY;
+ if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
+ goto out_unlock;
+
+ /* Don't reflink dirs, pipes, sockets... */
+ ret = -EISDIR;
+ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
+ goto out_unlock;
+ ret = -EINVAL;
+ if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode))
+ goto out_unlock;
+ if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
+ goto out_unlock;
+
+ /* Are we going all the way to the end? */
+ isize = i_size_read(inode_in);
+ if (isize == 0) {
+ ret = 0;
+ goto out_unlock;
+ }
+
+ if (len == 0)
+ len = isize - pos_in;
+
+ /* Ensure offsets don't wrap and the input is inside i_size */
+ if (pos_in + len < pos_in || pos_out + len < pos_out ||
+ pos_in + len > isize)
+ goto out_unlock;
+
+ /* Don't allow dedupe past EOF in the dest file */
+ if (is_dedupe) {
+ loff_t disize;
+
+ disize = i_size_read(inode_out);
+ if (pos_out >= disize || pos_out + len > disize)
+ goto out_unlock;
+ }
+
+ /* If we're linking to EOF, continue to the block boundary. */
+ if (pos_in + len == isize)
+ blen = ALIGN(isize, bs) - pos_in;
+ else
+ blen = len;
+
+ /* Only reflink if we're aligned to block boundaries */
+ if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
+ !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
+ goto out_unlock;
+
+ /* Don't allow overlapped reflink within the same file */
+ if (same_inode) {
+ if (pos_out + blen > pos_in && pos_out < pos_in + blen)
+ goto out_unlock;
+ }
+
+ /* Wait for the completion of any pending IOs on both files */
+ inode_dio_wait(inode_in);
+ if (!same_inode)
+ inode_dio_wait(inode_out);
+
+ ret = filemap_write_and_wait_range(inode_in->i_mapping,
+ pos_in, pos_in + len - 1);
+ if (ret)
+ goto out_unlock;
+
+ ret = filemap_write_and_wait_range(inode_out->i_mapping,
+ pos_out, pos_out + len - 1);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * Check that the extents are the same.
+ */
+ if (is_dedupe) {
+ ret = ocfs2_reflink_compare_extents(inode_in, pos_in,
+ inode_out, pos_out,
+ len, &is_same);
+ if (ret)
+ goto out_unlock;
+ if (!is_same) {
+ ret = -EBADE;
+ goto out_unlock;
+ }
+ }
+
+ /* Lock out changes to the allocation maps */
+ down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
+ if (!same_inode)
+ down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
+ SINGLE_DEPTH_NESTING);
+
+ /*
+ * Invalidate the page cache so that we can clear any CoW mappings
+ * in the destination file.
+ */
+ truncate_inode_pages_range(&inode_out->i_data, pos_out,
+ PAGE_ALIGN(pos_out + len) - 1);
+
+ ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out,
+ out_bh, pos_out, len);
+
+ up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
+ if (!same_inode)
+ up_write(&OCFS2_I(inode_out)->ip_alloc_sem);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ /*
+ * Empty the extent map so that we may get the right extent
+ * record from the disk.
+ */
+ ocfs2_extent_map_trunc(inode_in, 0);
+ ocfs2_extent_map_trunc(inode_out, 0);
+
+ ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
+ return 0;
+
+out_unlock:
+ ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
+ return ret;
+}
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 553edfb..c023e88 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -117,4 +117,11 @@ int ocfs2_reflink_ioctl(struct inode *inode,
const char __user *oldname,
const char __user *newname,
bool preserve);
+int ocfs2_reflink_remap_range(struct file *file_in,
+ loff_t pos_in,
+ struct file *file_out,
+ loff_t pos_out,
+ u64 len,
+ bool is_dedupe);
+
#endif /* OCFS2_REFCOUNTTREE_H */
^ permalink raw reply related [flat|nested] 42+ messages in thread
* Re: [Ocfs2-devel] [PATCH 6/6] ocfs2: implement the VFS clone_range, copy_range, and dedupe_range features
2016-11-09 22:51 ` [Ocfs2-devel] " Darrick J. Wong
@ 2016-11-11 5:49 ` Eric Ren
-1 siblings, 0 replies; 42+ messages in thread
From: Eric Ren @ 2016-11-11 5:49 UTC (permalink / raw)
To: Darrick J. Wong, mfasheh, jlbec; +Cc: linux-fsdevel, ocfs2-devel
Hi,
A few issues obvious to me:
On 11/10/2016 06:51 AM, Darrick J. Wong wrote:
> Connect the new VFS clone_range, copy_range, and dedupe_range features
> to the existing reflink capability of ocfs2. Compared to the existing
> ocfs2 reflink ioctl We have to do things a little differently to support
> the VFS semantics (we can clone subranges of a file but we don't clone
> xattrs), but the VFS ioctls are more broadly supported.
How can I test the new ocfs2 reflink (with this patch) manually? What commands should I
use to exercise the clone_range, copy_range, and dedupe_range operations?
>
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
> fs/ocfs2/file.c | 62 ++++-
> fs/ocfs2/file.h | 3
> fs/ocfs2/refcounttree.c | 619 +++++++++++++++++++++++++++++++++++++++++++++++
> fs/ocfs2/refcounttree.h | 7 +
> 4 files changed, 688 insertions(+), 3 deletions(-)
>
>
> diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
> index 000c234..d5a022d 100644
> --- a/fs/ocfs2/file.c
> +++ b/fs/ocfs2/file.c
> @@ -1667,9 +1667,9 @@ static void ocfs2_calc_trunc_pos(struct inode *inode,
> *done = ret;
> }
>
> -static int ocfs2_remove_inode_range(struct inode *inode,
> - struct buffer_head *di_bh, u64 byte_start,
> - u64 byte_len)
> +int ocfs2_remove_inode_range(struct inode *inode,
> + struct buffer_head *di_bh, u64 byte_start,
> + u64 byte_len)
> {
> int ret = 0, flags = 0, done = 0, i;
> u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
> @@ -2440,6 +2440,56 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
> return offset;
> }
>
> +static ssize_t ocfs2_file_copy_range(struct file *file_in,
> + loff_t pos_in,
> + struct file *file_out,
> + loff_t pos_out,
> + size_t len,
> + unsigned int flags)
> +{
> + int error;
> +
> + error = ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
> + len, false);
> + if (error)
> + return error;
> + return len;
> +}
> +
> +static int ocfs2_file_clone_range(struct file *file_in,
> + loff_t pos_in,
> + struct file *file_out,
> + loff_t pos_out,
> + u64 len)
> +{
> + return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
> + len, false);
> +}
> +
> +#define OCFS2_MAX_DEDUPE_LEN (16 * 1024 * 1024)
> +static ssize_t ocfs2_file_dedupe_range(struct file *src_file,
> + u64 loff,
> + u64 len,
> + struct file *dst_file,
> + u64 dst_loff)
> +{
> + int error;
> +
> + /*
> + * Limit the total length we will dedupe for each operation.
> + * This is intended to bound the total time spent in this
> + * ioctl to something sane.
> + */
> + if (len > OCFS2_MAX_DEDUPE_LEN)
> + len = OCFS2_MAX_DEDUPE_LEN;
> +
> + error = ocfs2_reflink_remap_range(src_file, loff, dst_file, dst_loff,
> + len, true);
> + if (error)
> + return error;
> + return len;
> +}
> +
> const struct inode_operations ocfs2_file_iops = {
> .setattr = ocfs2_setattr,
> .getattr = ocfs2_getattr,
> @@ -2479,6 +2529,9 @@ const struct file_operations ocfs2_fops = {
> .splice_read = generic_file_splice_read,
> .splice_write = iter_file_splice_write,
> .fallocate = ocfs2_fallocate,
> + .copy_file_range = ocfs2_file_copy_range,
> + .clone_file_range = ocfs2_file_clone_range,
> + .dedupe_file_range = ocfs2_file_dedupe_range,
> };
>
> const struct file_operations ocfs2_dops = {
> @@ -2524,6 +2577,9 @@ const struct file_operations ocfs2_fops_no_plocks = {
> .splice_read = generic_file_splice_read,
> .splice_write = iter_file_splice_write,
> .fallocate = ocfs2_fallocate,
> + .copy_file_range = ocfs2_file_copy_range,
> + .clone_file_range = ocfs2_file_clone_range,
> + .dedupe_file_range = ocfs2_file_dedupe_range,
> };
>
> const struct file_operations ocfs2_dops_no_plocks = {
> diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
> index e8c62f2..897fd9a 100644
> --- a/fs/ocfs2/file.h
> +++ b/fs/ocfs2/file.h
> @@ -82,4 +82,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
>
> int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
> size_t count);
> +int ocfs2_remove_inode_range(struct inode *inode,
> + struct buffer_head *di_bh, u64 byte_start,
> + u64 byte_len);
> #endif /* OCFS2_FILE_H */
> diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
> index d92b6c6..3e2198c 100644
> --- a/fs/ocfs2/refcounttree.c
> +++ b/fs/ocfs2/refcounttree.c
> @@ -34,6 +34,7 @@
> #include "xattr.h"
> #include "namei.h"
> #include "ocfs2_trace.h"
> +#include "file.h"
>
> #include <linux/bio.h>
> #include <linux/blkdev.h>
> @@ -4447,3 +4448,621 @@ int ocfs2_reflink_ioctl(struct inode *inode,
>
> return error;
> }
> +
> +/* Update destination inode size, if necessary. */
> +static int ocfs2_reflink_update_dest(struct inode *dest,
> + struct buffer_head *d_bh,
> + loff_t newlen)
> +{
> + handle_t *handle;
> + struct ocfs2_dinode *di = (struct ocfs2_dinode *)d_bh->b_data;
> + int ret;
> +
> + if (newlen <= i_size_read(dest))
> + return 0;
> +
> + handle = ocfs2_start_trans(OCFS2_SB(dest->i_sb),
> + OCFS2_INODE_UPDATE_CREDITS);
> + if (IS_ERR(handle)) {
> + ret = PTR_ERR(handle);
> + mlog_errno(ret);
> + return ret;
> + }
> +
> + ret = ocfs2_journal_access_di(handle, INODE_CACHE(dest), d_bh,
> + OCFS2_JOURNAL_ACCESS_WRITE);
> + if (ret) {
> + mlog_errno(ret);
> + goto out_commit;
> + }
> +
> + spin_lock(&OCFS2_I(dest)->ip_lock);
> + if (newlen > i_size_read(dest)) {
> + i_size_write(dest, newlen);
> + di->i_size = newlen;
This should be: di->i_size = cpu_to_le64(newlen); (on-disk dinode fields are little-endian)
> + }
> + spin_unlock(&OCFS2_I(dest)->ip_lock);
> +
Should ocfs2_update_inode_fsync_trans() be added here? It looks like this function was
introduced by you to improve efficiency.
I just want to jog your memory about this, though I don't know the details of why it
would be needed.
Eric
> + ocfs2_journal_dirty(handle, d_bh);
> +
> +out_commit:
> + ocfs2_commit_trans(OCFS2_SB(dest->i_sb), handle);
> + return ret;
> +}
> +
> +/* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */
> +static int ocfs2_reflink_remap_extent(struct inode *s_inode,
> + struct buffer_head *s_bh,
> + loff_t pos_in,
> + struct inode *t_inode,
> + struct buffer_head *t_bh,
> + loff_t pos_out,
> + loff_t len,
> + struct ocfs2_cached_dealloc_ctxt *dealloc)
> +{
> + struct ocfs2_extent_tree s_et;
> + struct ocfs2_extent_tree t_et;
> + struct ocfs2_dinode *dis;
> + struct buffer_head *ref_root_bh = NULL;
> + struct ocfs2_refcount_tree *ref_tree;
> + struct ocfs2_super *osb;
> + loff_t pstart, plen;
> + u32 p_cluster, num_clusters, slast, spos, tpos;
> + unsigned int ext_flags;
> + int ret = 0;
> +
> + osb = OCFS2_SB(s_inode->i_sb);
> + dis = (struct ocfs2_dinode *)s_bh->b_data;
> + ocfs2_init_dinode_extent_tree(&s_et, INODE_CACHE(s_inode), s_bh);
> + ocfs2_init_dinode_extent_tree(&t_et, INODE_CACHE(t_inode), t_bh);
> +
> + spos = ocfs2_bytes_to_clusters(s_inode->i_sb, pos_in);
> + tpos = ocfs2_bytes_to_clusters(t_inode->i_sb, pos_out);
> + slast = ocfs2_clusters_for_bytes(s_inode->i_sb, pos_in + len);
> +
> + while (spos < slast) {
> + if (fatal_signal_pending(current)) {
> + ret = -EINTR;
> + goto out;
> + }
> +
> + /* Look up the extent. */
> + ret = ocfs2_get_clusters(s_inode, spos, &p_cluster,
> + &num_clusters, &ext_flags);
> + if (ret) {
> + mlog_errno(ret);
> + goto out;
> + }
> +
> + num_clusters = min_t(u32, num_clusters, slast - spos);
> +
> + /* Punch out the dest range. */
> + pstart = ocfs2_clusters_to_bytes(t_inode->i_sb, tpos);
> + plen = ocfs2_clusters_to_bytes(t_inode->i_sb, num_clusters);
> + ret = ocfs2_remove_inode_range(t_inode, t_bh, pstart, plen);
> + if (ret) {
> + mlog_errno(ret);
> + goto out;
> + }
> +
> + if (p_cluster == 0)
> + goto next_loop;
> +
> + /* Lock the refcount btree... */
> + ret = ocfs2_lock_refcount_tree(osb,
> + le64_to_cpu(dis->i_refcount_loc),
> + 1, &ref_tree, &ref_root_bh);
> + if (ret) {
> + mlog_errno(ret);
> + goto out;
> + }
> +
> + /* Mark s_inode's extent as refcounted. */
> + if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) {
> + ret = ocfs2_add_refcount_flag(s_inode, &s_et,
> + &ref_tree->rf_ci,
> + ref_root_bh, spos,
> + p_cluster, num_clusters,
> + dealloc, NULL);
> + if (ret) {
> + mlog_errno(ret);
> + goto out_unlock_refcount;
> + }
> + }
> +
> + /* Map in the new extent. */
> + ext_flags |= OCFS2_EXT_REFCOUNTED;
> + ret = ocfs2_add_refcounted_extent(t_inode, &t_et,
> + &ref_tree->rf_ci,
> + ref_root_bh,
> + tpos, p_cluster,
> + num_clusters,
> + ext_flags,
> + dealloc);
> + if (ret) {
> + mlog_errno(ret);
> + goto out_unlock_refcount;
> + }
> +
> + ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
> + brelse(ref_root_bh);
> +next_loop:
> + spos += num_clusters;
> + tpos += num_clusters;
> + }
> +
> +out:
> + return ret;
> +out_unlock_refcount:
> + ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
> + brelse(ref_root_bh);
> + return ret;
> +}
> +
> +/* Set up refcount tree and remap s_inode to t_inode. */
> +static int ocfs2_reflink_remap_blocks(struct inode *s_inode,
> + struct buffer_head *s_bh,
> + loff_t pos_in,
> + struct inode *t_inode,
> + struct buffer_head *t_bh,
> + loff_t pos_out,
> + loff_t len)
> +{
> + struct ocfs2_cached_dealloc_ctxt dealloc;
> + struct ocfs2_super *osb;
> + struct ocfs2_dinode *dis;
> + struct ocfs2_dinode *dit;
> + int ret;
> +
> + osb = OCFS2_SB(s_inode->i_sb);
> + dis = (struct ocfs2_dinode *)s_bh->b_data;
> + dit = (struct ocfs2_dinode *)t_bh->b_data;
> + ocfs2_init_dealloc_ctxt(&dealloc);
> +
> + /*
> + * If both inodes belong to two different refcount groups then
> + * forget it because we don't know how (or want) to go merging
> + * refcount trees.
> + */
> + ret = -EOPNOTSUPP;
> + if (ocfs2_is_refcount_inode(s_inode) &&
> + ocfs2_is_refcount_inode(t_inode) &&
> + le64_to_cpu(dis->i_refcount_loc) !=
> + le64_to_cpu(dit->i_refcount_loc))
> + goto out;
> +
> + /* Neither inode has a refcount tree. Add one to s_inode. */
> + if (!ocfs2_is_refcount_inode(s_inode) &&
> + !ocfs2_is_refcount_inode(t_inode)) {
> + ret = ocfs2_create_refcount_tree(s_inode, s_bh);
> + if (ret) {
> + mlog_errno(ret);
> + goto out;
> + }
> + }
> +
> + /* Ensure that both inodes end up with the same refcount tree. */
> + if (!ocfs2_is_refcount_inode(s_inode)) {
> + ret = ocfs2_set_refcount_tree(s_inode, s_bh,
> + le64_to_cpu(dit->i_refcount_loc));
> + if (ret) {
> + mlog_errno(ret);
> + goto out;
> + }
> + }
> + if (!ocfs2_is_refcount_inode(t_inode)) {
> + ret = ocfs2_set_refcount_tree(t_inode, t_bh,
> + le64_to_cpu(dis->i_refcount_loc));
> + if (ret) {
> + mlog_errno(ret);
> + goto out;
> + }
> + }
> +
> + /*
> + * If we're reflinking the entire file and the source is inline
> + * data, just copy the contents.
> + */
> + if (pos_in == pos_out && pos_in == 0 && len == i_size_read(s_inode) &&
> + i_size_read(t_inode) <= len &&
> + (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) {
> + ret = ocfs2_duplicate_inline_data(s_inode, s_bh, t_inode, t_bh);
> + if (ret)
> + mlog_errno(ret);
> + goto out;
> + }
> +
> + ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh,
> + pos_out, len, &dealloc);
> + if (ret) {
> + mlog_errno(ret);
> + goto out;
> + }
> +
> +out:
> + if (ocfs2_dealloc_has_cluster(&dealloc)) {
> + ocfs2_schedule_truncate_log_flush(osb, 1);
> + ocfs2_run_deallocs(osb, &dealloc);
> + }
> +
> + return ret;
> +}
> +
> +/* Lock an inode and grab a bh pointing to the inode. */
> +static int ocfs2_reflink_inodes_lock(struct inode *s_inode,
> + struct buffer_head **bh1,
> + struct inode *t_inode,
> + struct buffer_head **bh2)
> +{
> + struct inode *inode1;
> + struct inode *inode2;
> + struct ocfs2_inode_info *oi1;
> + struct ocfs2_inode_info *oi2;
> + bool same_inode = (s_inode == t_inode);
> + int status;
> +
> + /* First grab the VFS and rw locks. */
> + inode1 = s_inode;
> + inode2 = t_inode;
> + if (inode1->i_ino > inode2->i_ino)
> + swap(inode1, inode2);
> +
> + inode_lock(inode1);
> + status = ocfs2_rw_lock(inode1, 1);
> + if (status) {
> + mlog_errno(status);
> + goto out_i1;
> + }
> + if (!same_inode) {
> + inode_lock_nested(inode2, I_MUTEX_CHILD);
> + status = ocfs2_rw_lock(inode2, 1);
> + if (status) {
> + mlog_errno(status);
> + goto out_i2;
> + }
> + }
> +
> + /* Now go for the cluster locks */
> + oi1 = OCFS2_I(inode1);
> + oi2 = OCFS2_I(inode2);
> +
> + trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
> + (unsigned long long)oi2->ip_blkno);
> +
> + if (*bh1)
> + *bh1 = NULL;
> + if (*bh2)
> + *bh2 = NULL;
> +
> + /* We always want to lock the one with the lower lockid first. */
> + if (oi1->ip_blkno > oi2->ip_blkno)
> + mlog_errno(-ENOLCK);
> +
> + /* lock id1 */
> + status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_REFLINK_TARGET);
> + if (status < 0) {
> + if (status != -ENOENT)
> + mlog_errno(status);
> + goto out_rw2;
> + }
> +
> + /* lock id2 */
> + if (!same_inode) {
> + status = ocfs2_inode_lock_nested(inode2, bh2, 1,
> + OI_LS_REFLINK_TARGET);
> + if (status < 0) {
> + if (status != -ENOENT)
> + mlog_errno(status);
> + goto out_cl1;
> + }
> + } else
> + *bh2 = *bh1;
> +
> + trace_ocfs2_double_lock_end(
> + (unsigned long long)OCFS2_I(inode1)->ip_blkno,
> + (unsigned long long)OCFS2_I(inode2)->ip_blkno);
> +
> + return 0;
> +
> +out_cl1:
> + ocfs2_inode_unlock(inode1, 1);
> + brelse(*bh1);
> + *bh1 = NULL;
> +out_rw2:
> + ocfs2_rw_unlock(inode2, 1);
> +out_i2:
> + inode_unlock(inode2);
> + ocfs2_rw_unlock(inode1, 1);
> +out_i1:
> + inode_unlock(inode1);
> + return status;
> +}
> +
> +/* Unlock both inodes and release buffers. */
> +static void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
> + struct buffer_head *s_bh,
> + struct inode *t_inode,
> + struct buffer_head *t_bh)
> +{
> + ocfs2_inode_unlock(s_inode, 1);
> + ocfs2_rw_unlock(s_inode, 1);
> + inode_unlock(s_inode);
> + brelse(s_bh);
> +
> + if (s_inode == t_inode)
> + return;
> +
> + ocfs2_inode_unlock(t_inode, 1);
> + ocfs2_rw_unlock(t_inode, 1);
> + inode_unlock(t_inode);
> + brelse(t_bh);
> +}
> +
> +/*
> + * Read a page's worth of file data into the page cache. Return the page
> + * locked.
> + */
> +static struct page *ocfs2_reflink_get_page(struct inode *inode,
> + loff_t offset)
> +{
> + struct address_space *mapping;
> + struct page *page;
> + pgoff_t n;
> +
> + n = offset >> PAGE_SHIFT;
> + mapping = inode->i_mapping;
> + page = read_mapping_page(mapping, n, NULL);
> + if (IS_ERR(page))
> + return page;
> + if (!PageUptodate(page)) {
> + put_page(page);
> + return ERR_PTR(-EIO);
> + }
> + lock_page(page);
> + return page;
> +}
> +
> +/*
> + * Compare extents of two files to see if they are the same.
> + */
> +static int ocfs2_reflink_compare_extents(struct inode *src,
> + loff_t srcoff,
> + struct inode *dest,
> + loff_t destoff,
> + loff_t len,
> + bool *is_same)
> +{
> + loff_t src_poff;
> + loff_t dest_poff;
> + void *src_addr;
> + void *dest_addr;
> + struct page *src_page;
> + struct page *dest_page;
> + loff_t cmp_len;
> + bool same;
> + int error;
> +
> + error = -EINVAL;
> + same = true;
> + while (len) {
> + src_poff = srcoff & (PAGE_SIZE - 1);
> + dest_poff = destoff & (PAGE_SIZE - 1);
> + cmp_len = min(PAGE_SIZE - src_poff,
> + PAGE_SIZE - dest_poff);
> + cmp_len = min(cmp_len, len);
> + if (cmp_len <= 0) {
> + mlog_errno(-EUCLEAN);
> + goto out_error;
> + }
> +
> + src_page = ocfs2_reflink_get_page(src, srcoff);
> + if (IS_ERR(src_page)) {
> + error = PTR_ERR(src_page);
> + goto out_error;
> + }
> + dest_page = ocfs2_reflink_get_page(dest, destoff);
> + if (IS_ERR(dest_page)) {
> + error = PTR_ERR(dest_page);
> + unlock_page(src_page);
> + put_page(src_page);
> + goto out_error;
> + }
> + src_addr = kmap_atomic(src_page);
> + dest_addr = kmap_atomic(dest_page);
> +
> + flush_dcache_page(src_page);
> + flush_dcache_page(dest_page);
> +
> + if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
> + same = false;
> +
> + kunmap_atomic(dest_addr);
> + kunmap_atomic(src_addr);
> + unlock_page(dest_page);
> + unlock_page(src_page);
> + put_page(dest_page);
> + put_page(src_page);
> +
> + if (!same)
> + break;
> +
> + srcoff += cmp_len;
> + destoff += cmp_len;
> + len -= cmp_len;
> + }
> +
> + *is_same = same;
> + return 0;
> +
> +out_error:
> + return error;
> +}
> +
> +/* Link a range of blocks from one file to another. */
> +int ocfs2_reflink_remap_range(struct file *file_in,
> + loff_t pos_in,
> + struct file *file_out,
> + loff_t pos_out,
> + u64 len,
> + bool is_dedupe)
> +{
> + struct inode *inode_in = file_inode(file_in);
> + struct inode *inode_out = file_inode(file_out);
> + struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
> + struct buffer_head *in_bh = NULL, *out_bh = NULL;
> + loff_t bs = 1 << OCFS2_SB(inode_in->i_sb)->s_clustersize_bits;
> + bool same_inode = (inode_in == inode_out);
> + bool is_same = false;
> + loff_t isize;
> + ssize_t ret;
> + loff_t blen;
> +
> + if (!ocfs2_refcount_tree(osb))
> + return -EOPNOTSUPP;
> + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
> + return -EROFS;
> +
> + /* Lock both files against IO */
> + ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh);
> + if (ret)
> + return ret;
> +
> + ret = -EINVAL;
> + if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
> + (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
> + goto out_unlock;
> +
> + /* Don't touch certain kinds of inodes */
> + ret = -EPERM;
> + if (IS_IMMUTABLE(inode_out))
> + goto out_unlock;
> +
> + ret = -ETXTBSY;
> + if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
> + goto out_unlock;
> +
> + /* Don't reflink dirs, pipes, sockets... */
> + ret = -EISDIR;
> + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
> + goto out_unlock;
> + ret = -EINVAL;
> + if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode))
> + goto out_unlock;
> + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
> + goto out_unlock;
> +
> + /* Are we going all the way to the end? */
> + isize = i_size_read(inode_in);
> + if (isize == 0) {
> + ret = 0;
> + goto out_unlock;
> + }
> +
> + if (len == 0)
> + len = isize - pos_in;
> +
> + /* Ensure offsets don't wrap and the input is inside i_size */
> + if (pos_in + len < pos_in || pos_out + len < pos_out ||
> + pos_in + len > isize)
> + goto out_unlock;
> +
> + /* Don't allow dedupe past EOF in the dest file */
> + if (is_dedupe) {
> + loff_t disize;
> +
> + disize = i_size_read(inode_out);
> + if (pos_out >= disize || pos_out + len > disize)
> + goto out_unlock;
> + }
> +
> + /* If we're linking to EOF, continue to the block boundary. */
> + if (pos_in + len == isize)
> + blen = ALIGN(isize, bs) - pos_in;
> + else
> + blen = len;
> +
> + /* Only reflink if we're aligned to block boundaries */
> + if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
> + !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
> + goto out_unlock;
> +
> + /* Don't allow overlapped reflink within the same file */
> + if (same_inode) {
> + if (pos_out + blen > pos_in && pos_out < pos_in + blen)
> + goto out_unlock;
> + }
> +
> + /* Wait for the completion of any pending IOs on both files */
> + inode_dio_wait(inode_in);
> + if (!same_inode)
> + inode_dio_wait(inode_out);
> +
> + ret = filemap_write_and_wait_range(inode_in->i_mapping,
> + pos_in, pos_in + len - 1);
> + if (ret)
> + goto out_unlock;
> +
> + ret = filemap_write_and_wait_range(inode_out->i_mapping,
> + pos_out, pos_out + len - 1);
> + if (ret)
> + goto out_unlock;
> +
> + /*
> + * Check that the extents are the same.
> + */
> + if (is_dedupe) {
> + ret = ocfs2_reflink_compare_extents(inode_in, pos_in,
> + inode_out, pos_out,
> + len, &is_same);
> + if (ret)
> + goto out_unlock;
> + if (!is_same) {
> + ret = -EBADE;
> + goto out_unlock;
> + }
> + }
> +
> + /* Lock out changes to the allocation maps */
> + down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
> + if (!same_inode)
> + down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
> + SINGLE_DEPTH_NESTING);
> +
> + /*
> + * Invalidate the page cache so that we can clear any CoW mappings
> + * in the destination file.
> + */
> + truncate_inode_pages_range(&inode_out->i_data, pos_out,
> + PAGE_ALIGN(pos_out + len) - 1);
> +
> + ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out,
> + out_bh, pos_out, len);
> +
> + up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
> + if (!same_inode)
> + up_write(&OCFS2_I(inode_out)->ip_alloc_sem);
> + if (ret) {
> + mlog_errno(ret);
> + goto out_unlock;
> + }
> +
> + /*
> + * Empty the extent map so that we may get the right extent
> + * record from the disk.
> + */
> + ocfs2_extent_map_trunc(inode_in, 0);
> + ocfs2_extent_map_trunc(inode_out, 0);
> +
> + ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
> + if (ret) {
> + mlog_errno(ret);
> + goto out_unlock;
> + }
> +
> + ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
> + return 0;
> +
> +out_unlock:
> + ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
> + return ret;
> +}
> diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
> index 553edfb..c023e88 100644
> --- a/fs/ocfs2/refcounttree.h
> +++ b/fs/ocfs2/refcounttree.h
> @@ -117,4 +117,11 @@ int ocfs2_reflink_ioctl(struct inode *inode,
> const char __user *oldname,
> const char __user *newname,
> bool preserve);
> +int ocfs2_reflink_remap_range(struct file *file_in,
> + loff_t pos_in,
> + struct file *file_out,
> + loff_t pos_out,
> + u64 len,
> + bool is_dedupe);
> +
> #endif /* OCFS2_REFCOUNTTREE_H */
>
>
> _______________________________________________
> Ocfs2-devel mailing list
> Ocfs2-devel@oss.oracle.com
> https://oss.oracle.com/mailman/listinfo/ocfs2-devel
>
^ permalink raw reply [flat|nested] 42+ messages in thread
* [Ocfs2-devel] [PATCH 6/6] ocfs2: implement the VFS clone_range, copy_range, and dedupe_range features
@ 2016-11-11 5:49 ` Eric Ren
0 siblings, 0 replies; 42+ messages in thread
From: Eric Ren @ 2016-11-11 5:49 UTC (permalink / raw)
To: Darrick J. Wong, mfasheh, jlbec; +Cc: linux-fsdevel, ocfs2-devel
Hi,
A few issues that stood out to me:
On 11/10/2016 06:51 AM, Darrick J. Wong wrote:
> Connect the new VFS clone_range, copy_range, and dedupe_range features
> to the existing reflink capability of ocfs2. Compared to the existing
> ocfs2 reflink ioctl We have to do things a little differently to support
> the VFS semantics (we can clone subranges of a file but we don't clone
> xattrs), but the VFS ioctls are more broadly supported.
How can I test the new ocfs2 reflink support (with this patch applied) manually? Which
commands should I use to exercise the clone_range, copy_range, and dedupe_range operations?
>
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
> fs/ocfs2/file.c | 62 ++++-
> fs/ocfs2/file.h | 3
> fs/ocfs2/refcounttree.c | 619 +++++++++++++++++++++++++++++++++++++++++++++++
> fs/ocfs2/refcounttree.h | 7 +
> 4 files changed, 688 insertions(+), 3 deletions(-)
>
>
> diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
> index 000c234..d5a022d 100644
> --- a/fs/ocfs2/file.c
> +++ b/fs/ocfs2/file.c
> @@ -1667,9 +1667,9 @@ static void ocfs2_calc_trunc_pos(struct inode *inode,
> *done = ret;
> }
>
> -static int ocfs2_remove_inode_range(struct inode *inode,
> - struct buffer_head *di_bh, u64 byte_start,
> - u64 byte_len)
> +int ocfs2_remove_inode_range(struct inode *inode,
> + struct buffer_head *di_bh, u64 byte_start,
> + u64 byte_len)
> {
> int ret = 0, flags = 0, done = 0, i;
> u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
> @@ -2440,6 +2440,56 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
> return offset;
> }
>
> +static ssize_t ocfs2_file_copy_range(struct file *file_in,
> + loff_t pos_in,
> + struct file *file_out,
> + loff_t pos_out,
> + size_t len,
> + unsigned int flags)
> +{
> + int error;
> +
> + error = ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
> + len, false);
> + if (error)
> + return error;
> + return len;
> +}
> +
> +static int ocfs2_file_clone_range(struct file *file_in,
> + loff_t pos_in,
> + struct file *file_out,
> + loff_t pos_out,
> + u64 len)
> +{
> + return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
> + len, false);
> +}
> +
> +#define OCFS2_MAX_DEDUPE_LEN (16 * 1024 * 1024)
> +static ssize_t ocfs2_file_dedupe_range(struct file *src_file,
> + u64 loff,
> + u64 len,
> + struct file *dst_file,
> + u64 dst_loff)
> +{
> + int error;
> +
> + /*
> + * Limit the total length we will dedupe for each operation.
> + * This is intended to bound the total time spent in this
> + * ioctl to something sane.
> + */
> + if (len > OCFS2_MAX_DEDUPE_LEN)
> + len = OCFS2_MAX_DEDUPE_LEN;
> +
> + error = ocfs2_reflink_remap_range(src_file, loff, dst_file, dst_loff,
> + len, true);
> + if (error)
> + return error;
> + return len;
> +}
> +
> const struct inode_operations ocfs2_file_iops = {
> .setattr = ocfs2_setattr,
> .getattr = ocfs2_getattr,
> @@ -2479,6 +2529,9 @@ const struct file_operations ocfs2_fops = {
> .splice_read = generic_file_splice_read,
> .splice_write = iter_file_splice_write,
> .fallocate = ocfs2_fallocate,
> + .copy_file_range = ocfs2_file_copy_range,
> + .clone_file_range = ocfs2_file_clone_range,
> + .dedupe_file_range = ocfs2_file_dedupe_range,
> };
>
> const struct file_operations ocfs2_dops = {
> @@ -2524,6 +2577,9 @@ const struct file_operations ocfs2_fops_no_plocks = {
> .splice_read = generic_file_splice_read,
> .splice_write = iter_file_splice_write,
> .fallocate = ocfs2_fallocate,
> + .copy_file_range = ocfs2_file_copy_range,
> + .clone_file_range = ocfs2_file_clone_range,
> + .dedupe_file_range = ocfs2_file_dedupe_range,
> };
>
> const struct file_operations ocfs2_dops_no_plocks = {
> diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
> index e8c62f2..897fd9a 100644
> --- a/fs/ocfs2/file.h
> +++ b/fs/ocfs2/file.h
> @@ -82,4 +82,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
>
> int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
> size_t count);
> +int ocfs2_remove_inode_range(struct inode *inode,
> + struct buffer_head *di_bh, u64 byte_start,
> + u64 byte_len);
> #endif /* OCFS2_FILE_H */
> diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
> index d92b6c6..3e2198c 100644
> --- a/fs/ocfs2/refcounttree.c
> +++ b/fs/ocfs2/refcounttree.c
> @@ -34,6 +34,7 @@
> #include "xattr.h"
> #include "namei.h"
> #include "ocfs2_trace.h"
> +#include "file.h"
>
> #include <linux/bio.h>
> #include <linux/blkdev.h>
> @@ -4447,3 +4448,621 @@ int ocfs2_reflink_ioctl(struct inode *inode,
>
> return error;
> }
> +
> +/* Update destination inode size, if necessary. */
> +static int ocfs2_reflink_update_dest(struct inode *dest,
> + struct buffer_head *d_bh,
> + loff_t newlen)
> +{
> + handle_t *handle;
> + struct ocfs2_dinode *di = (struct ocfs2_dinode *)d_bh->b_data;
> + int ret;
> +
> + if (newlen <= i_size_read(dest))
> + return 0;
> +
> + handle = ocfs2_start_trans(OCFS2_SB(dest->i_sb),
> + OCFS2_INODE_UPDATE_CREDITS);
> + if (IS_ERR(handle)) {
> + ret = PTR_ERR(handle);
> + mlog_errno(ret);
> + return ret;
> + }
> +
> + ret = ocfs2_journal_access_di(handle, INODE_CACHE(dest), d_bh,
> + OCFS2_JOURNAL_ACCESS_WRITE);
> + if (ret) {
> + mlog_errno(ret);
> + goto out_commit;
> + }
> +
> + spin_lock(&OCFS2_I(dest)->ip_lock);
> + if (newlen > i_size_read(dest)) {
> + i_size_write(dest, newlen);
> + di->i_size = newlen;
The on-disk i_size field is little-endian, so this assignment needs an endian conversion:
di->i_size = cpu_to_le64(newlen);
> + }
> + spin_unlock(&OCFS2_I(dest)->ip_lock);
> +
Should ocfs2_update_inode_fsync_trans() be called here? It looks like you introduced this
function to improve efficiency.
I just want to jog your memory about it, though I don't know the details of why it would be
needed here.
Eric
> + ocfs2_journal_dirty(handle, d_bh);
> +
> +out_commit:
> + ocfs2_commit_trans(OCFS2_SB(dest->i_sb), handle);
> + return ret;
> +}
> +
> +/* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */
> +static int ocfs2_reflink_remap_extent(struct inode *s_inode,
> + struct buffer_head *s_bh,
> + loff_t pos_in,
> + struct inode *t_inode,
> + struct buffer_head *t_bh,
> + loff_t pos_out,
> + loff_t len,
> + struct ocfs2_cached_dealloc_ctxt *dealloc)
> +{
> + struct ocfs2_extent_tree s_et;
> + struct ocfs2_extent_tree t_et;
> + struct ocfs2_dinode *dis;
> + struct buffer_head *ref_root_bh = NULL;
> + struct ocfs2_refcount_tree *ref_tree;
> + struct ocfs2_super *osb;
> + loff_t pstart, plen;
> + u32 p_cluster, num_clusters, slast, spos, tpos;
> + unsigned int ext_flags;
> + int ret = 0;
> +
> + osb = OCFS2_SB(s_inode->i_sb);
> + dis = (struct ocfs2_dinode *)s_bh->b_data;
> + ocfs2_init_dinode_extent_tree(&s_et, INODE_CACHE(s_inode), s_bh);
> + ocfs2_init_dinode_extent_tree(&t_et, INODE_CACHE(t_inode), t_bh);
> +
> + spos = ocfs2_bytes_to_clusters(s_inode->i_sb, pos_in);
> + tpos = ocfs2_bytes_to_clusters(t_inode->i_sb, pos_out);
> + slast = ocfs2_clusters_for_bytes(s_inode->i_sb, pos_in + len);
> +
> + while (spos < slast) {
> + if (fatal_signal_pending(current)) {
> + ret = -EINTR;
> + goto out;
> + }
> +
> + /* Look up the extent. */
> + ret = ocfs2_get_clusters(s_inode, spos, &p_cluster,
> + &num_clusters, &ext_flags);
> + if (ret) {
> + mlog_errno(ret);
> + goto out;
> + }
> +
> + num_clusters = min_t(u32, num_clusters, slast - spos);
> +
> + /* Punch out the dest range. */
> + pstart = ocfs2_clusters_to_bytes(t_inode->i_sb, tpos);
> + plen = ocfs2_clusters_to_bytes(t_inode->i_sb, num_clusters);
> + ret = ocfs2_remove_inode_range(t_inode, t_bh, pstart, plen);
> + if (ret) {
> + mlog_errno(ret);
> + goto out;
> + }
> +
> + if (p_cluster == 0)
> + goto next_loop;
> +
> + /* Lock the refcount btree... */
> + ret = ocfs2_lock_refcount_tree(osb,
> + le64_to_cpu(dis->i_refcount_loc),
> + 1, &ref_tree, &ref_root_bh);
> + if (ret) {
> + mlog_errno(ret);
> + goto out;
> + }
> +
> + /* Mark s_inode's extent as refcounted. */
> + if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) {
> + ret = ocfs2_add_refcount_flag(s_inode, &s_et,
> + &ref_tree->rf_ci,
> + ref_root_bh, spos,
> + p_cluster, num_clusters,
> + dealloc, NULL);
> + if (ret) {
> + mlog_errno(ret);
> + goto out_unlock_refcount;
> + }
> + }
> +
> + /* Map in the new extent. */
> + ext_flags |= OCFS2_EXT_REFCOUNTED;
> + ret = ocfs2_add_refcounted_extent(t_inode, &t_et,
> + &ref_tree->rf_ci,
> + ref_root_bh,
> + tpos, p_cluster,
> + num_clusters,
> + ext_flags,
> + dealloc);
> + if (ret) {
> + mlog_errno(ret);
> + goto out_unlock_refcount;
> + }
> +
> + ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
> + brelse(ref_root_bh);
> +next_loop:
> + spos += num_clusters;
> + tpos += num_clusters;
> + }
> +
> +out:
> + return ret;
> +out_unlock_refcount:
> + ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
> + brelse(ref_root_bh);
> + return ret;
> +}
> +
> +/* Set up refcount tree and remap s_inode to t_inode. */
> +static int ocfs2_reflink_remap_blocks(struct inode *s_inode,
> + struct buffer_head *s_bh,
> + loff_t pos_in,
> + struct inode *t_inode,
> + struct buffer_head *t_bh,
> + loff_t pos_out,
> + loff_t len)
> +{
> + struct ocfs2_cached_dealloc_ctxt dealloc;
> + struct ocfs2_super *osb;
> + struct ocfs2_dinode *dis;
> + struct ocfs2_dinode *dit;
> + int ret;
> +
> + osb = OCFS2_SB(s_inode->i_sb);
> + dis = (struct ocfs2_dinode *)s_bh->b_data;
> + dit = (struct ocfs2_dinode *)t_bh->b_data;
> + ocfs2_init_dealloc_ctxt(&dealloc);
> +
> + /*
> + * If both inodes belong to two different refcount groups then
> + * forget it because we don't know how (or want) to go merging
> + * refcount trees.
> + */
> + ret = -EOPNOTSUPP;
> + if (ocfs2_is_refcount_inode(s_inode) &&
> + ocfs2_is_refcount_inode(t_inode) &&
> + le64_to_cpu(dis->i_refcount_loc) !=
> + le64_to_cpu(dit->i_refcount_loc))
> + goto out;
> +
> + /* Neither inode has a refcount tree. Add one to s_inode. */
> + if (!ocfs2_is_refcount_inode(s_inode) &&
> + !ocfs2_is_refcount_inode(t_inode)) {
> + ret = ocfs2_create_refcount_tree(s_inode, s_bh);
> + if (ret) {
> + mlog_errno(ret);
> + goto out;
> + }
> + }
> +
> + /* Ensure that both inodes end up with the same refcount tree. */
> + if (!ocfs2_is_refcount_inode(s_inode)) {
> + ret = ocfs2_set_refcount_tree(s_inode, s_bh,
> + le64_to_cpu(dit->i_refcount_loc));
> + if (ret) {
> + mlog_errno(ret);
> + goto out;
> + }
> + }
> + if (!ocfs2_is_refcount_inode(t_inode)) {
> + ret = ocfs2_set_refcount_tree(t_inode, t_bh,
> + le64_to_cpu(dis->i_refcount_loc));
> + if (ret) {
> + mlog_errno(ret);
> + goto out;
> + }
> + }
> +
> + /*
> + * If we're reflinking the entire file and the source is inline
> + * data, just copy the contents.
> + */
> + if (pos_in == pos_out && pos_in == 0 && len == i_size_read(s_inode) &&
> + i_size_read(t_inode) <= len &&
> + (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) {
> + ret = ocfs2_duplicate_inline_data(s_inode, s_bh, t_inode, t_bh);
> + if (ret)
> + mlog_errno(ret);
> + goto out;
> + }
> +
> + ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh,
> + pos_out, len, &dealloc);
> + if (ret) {
> + mlog_errno(ret);
> + goto out;
> + }
> +
> +out:
> + if (ocfs2_dealloc_has_cluster(&dealloc)) {
> + ocfs2_schedule_truncate_log_flush(osb, 1);
> + ocfs2_run_deallocs(osb, &dealloc);
> + }
> +
> + return ret;
> +}
> +
> +/* Lock an inode and grab a bh pointing to the inode. */
> +static int ocfs2_reflink_inodes_lock(struct inode *s_inode,
> + struct buffer_head **bh1,
> + struct inode *t_inode,
> + struct buffer_head **bh2)
> +{
> + struct inode *inode1;
> + struct inode *inode2;
> + struct ocfs2_inode_info *oi1;
> + struct ocfs2_inode_info *oi2;
> + bool same_inode = (s_inode == t_inode);
> + int status;
> +
> + /* First grab the VFS and rw locks. */
> + inode1 = s_inode;
> + inode2 = t_inode;
> + if (inode1->i_ino > inode2->i_ino)
> + swap(inode1, inode2);
> +
> + inode_lock(inode1);
> + status = ocfs2_rw_lock(inode1, 1);
> + if (status) {
> + mlog_errno(status);
> + goto out_i1;
> + }
> + if (!same_inode) {
> + inode_lock_nested(inode2, I_MUTEX_CHILD);
> + status = ocfs2_rw_lock(inode2, 1);
> + if (status) {
> + mlog_errno(status);
> + goto out_i2;
> + }
> + }
> +
> + /* Now go for the cluster locks */
> + oi1 = OCFS2_I(inode1);
> + oi2 = OCFS2_I(inode2);
> +
> + trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
> + (unsigned long long)oi2->ip_blkno);
> +
> + if (*bh1)
> + *bh1 = NULL;
> + if (*bh2)
> + *bh2 = NULL;
> +
> + /* We always want to lock the one with the lower lockid first. */
> + if (oi1->ip_blkno > oi2->ip_blkno)
> + mlog_errno(-ENOLCK);
> +
> + /* lock id1 */
> + status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_REFLINK_TARGET);
> + if (status < 0) {
> + if (status != -ENOENT)
> + mlog_errno(status);
> + goto out_rw2;
> + }
> +
> + /* lock id2 */
> + if (!same_inode) {
> + status = ocfs2_inode_lock_nested(inode2, bh2, 1,
> + OI_LS_REFLINK_TARGET);
> + if (status < 0) {
> + if (status != -ENOENT)
> + mlog_errno(status);
> + goto out_cl1;
> + }
> + } else
> + *bh2 = *bh1;
> +
> + trace_ocfs2_double_lock_end(
> + (unsigned long long)OCFS2_I(inode1)->ip_blkno,
> + (unsigned long long)OCFS2_I(inode2)->ip_blkno);
> +
> + return 0;
> +
> +out_cl1:
> + ocfs2_inode_unlock(inode1, 1);
> + brelse(*bh1);
> + *bh1 = NULL;
> +out_rw2:
> + ocfs2_rw_unlock(inode2, 1);
> +out_i2:
> + inode_unlock(inode2);
> + ocfs2_rw_unlock(inode1, 1);
> +out_i1:
> + inode_unlock(inode1);
> + return status;
> +}
> +
> +/* Unlock both inodes and release buffers. */
> +static void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
> + struct buffer_head *s_bh,
> + struct inode *t_inode,
> + struct buffer_head *t_bh)
> +{
> + ocfs2_inode_unlock(s_inode, 1);
> + ocfs2_rw_unlock(s_inode, 1);
> + inode_unlock(s_inode);
> + brelse(s_bh);
> +
> + if (s_inode == t_inode)
> + return;
> +
> + ocfs2_inode_unlock(t_inode, 1);
> + ocfs2_rw_unlock(t_inode, 1);
> + inode_unlock(t_inode);
> + brelse(t_bh);
> +}
> +
> +/*
> + * Read a page's worth of file data into the page cache. Return the page
> + * locked.
> + */
> +static struct page *ocfs2_reflink_get_page(struct inode *inode,
> + loff_t offset)
> +{
> + struct address_space *mapping;
> + struct page *page;
> + pgoff_t n;
> +
> + n = offset >> PAGE_SHIFT;
> + mapping = inode->i_mapping;
> + page = read_mapping_page(mapping, n, NULL);
> + if (IS_ERR(page))
> + return page;
> + if (!PageUptodate(page)) {
> + put_page(page);
> + return ERR_PTR(-EIO);
> + }
> + lock_page(page);
> + return page;
> +}
> +
> +/*
> + * Compare extents of two files to see if they are the same.
> + */
> +static int ocfs2_reflink_compare_extents(struct inode *src,
> + loff_t srcoff,
> + struct inode *dest,
> + loff_t destoff,
> + loff_t len,
> + bool *is_same)
> +{
> + loff_t src_poff;
> + loff_t dest_poff;
> + void *src_addr;
> + void *dest_addr;
> + struct page *src_page;
> + struct page *dest_page;
> + loff_t cmp_len;
> + bool same;
> + int error;
> +
> + error = -EINVAL;
> + same = true;
> + while (len) {
> + src_poff = srcoff & (PAGE_SIZE - 1);
> + dest_poff = destoff & (PAGE_SIZE - 1);
> + cmp_len = min(PAGE_SIZE - src_poff,
> + PAGE_SIZE - dest_poff);
> + cmp_len = min(cmp_len, len);
> + if (cmp_len <= 0) {
> + mlog_errno(-EUCLEAN);
> + goto out_error;
> + }
> +
> + src_page = ocfs2_reflink_get_page(src, srcoff);
> + if (IS_ERR(src_page)) {
> + error = PTR_ERR(src_page);
> + goto out_error;
> + }
> + dest_page = ocfs2_reflink_get_page(dest, destoff);
> + if (IS_ERR(dest_page)) {
> + error = PTR_ERR(dest_page);
> + unlock_page(src_page);
> + put_page(src_page);
> + goto out_error;
> + }
> + src_addr = kmap_atomic(src_page);
> + dest_addr = kmap_atomic(dest_page);
> +
> + flush_dcache_page(src_page);
> + flush_dcache_page(dest_page);
> +
> + if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
> + same = false;
> +
> + kunmap_atomic(dest_addr);
> + kunmap_atomic(src_addr);
> + unlock_page(dest_page);
> + unlock_page(src_page);
> + put_page(dest_page);
> + put_page(src_page);
> +
> + if (!same)
> + break;
> +
> + srcoff += cmp_len;
> + destoff += cmp_len;
> + len -= cmp_len;
> + }
> +
> + *is_same = same;
> + return 0;
> +
> +out_error:
> + return error;
> +}
> +
> +/* Link a range of blocks from one file to another. */
> +int ocfs2_reflink_remap_range(struct file *file_in,
> + loff_t pos_in,
> + struct file *file_out,
> + loff_t pos_out,
> + u64 len,
> + bool is_dedupe)
> +{
> + struct inode *inode_in = file_inode(file_in);
> + struct inode *inode_out = file_inode(file_out);
> + struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
> + struct buffer_head *in_bh = NULL, *out_bh = NULL;
> + loff_t bs = 1 << OCFS2_SB(inode_in->i_sb)->s_clustersize_bits;
> + bool same_inode = (inode_in == inode_out);
> + bool is_same = false;
> + loff_t isize;
> + ssize_t ret;
> + loff_t blen;
> +
> + if (!ocfs2_refcount_tree(osb))
> + return -EOPNOTSUPP;
> + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
> + return -EROFS;
> +
> + /* Lock both files against IO */
> + ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh);
> + if (ret)
> + return ret;
> +
> + ret = -EINVAL;
> + if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
> + (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
> + goto out_unlock;
> +
> + /* Don't touch certain kinds of inodes */
> + ret = -EPERM;
> + if (IS_IMMUTABLE(inode_out))
> + goto out_unlock;
> +
> + ret = -ETXTBSY;
> + if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
> + goto out_unlock;
> +
> + /* Don't reflink dirs, pipes, sockets... */
> + ret = -EISDIR;
> + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
> + goto out_unlock;
> + ret = -EINVAL;
> + if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode))
> + goto out_unlock;
> + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
> + goto out_unlock;
> +
> + /* Are we going all the way to the end? */
> + isize = i_size_read(inode_in);
> + if (isize == 0) {
> + ret = 0;
> + goto out_unlock;
> + }
> +
> + if (len == 0)
> + len = isize - pos_in;
> +
> + /* Ensure offsets don't wrap and the input is inside i_size */
> + if (pos_in + len < pos_in || pos_out + len < pos_out ||
> + pos_in + len > isize)
> + goto out_unlock;
> +
> + /* Don't allow dedupe past EOF in the dest file */
> + if (is_dedupe) {
> + loff_t disize;
> +
> + disize = i_size_read(inode_out);
> + if (pos_out >= disize || pos_out + len > disize)
> + goto out_unlock;
> + }
> +
> + /* If we're linking to EOF, continue to the block boundary. */
> + if (pos_in + len == isize)
> + blen = ALIGN(isize, bs) - pos_in;
> + else
> + blen = len;
> +
> + /* Only reflink if we're aligned to block boundaries */
> + if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
> + !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
> + goto out_unlock;
> +
> + /* Don't allow overlapped reflink within the same file */
> + if (same_inode) {
> + if (pos_out + blen > pos_in && pos_out < pos_in + blen)
> + goto out_unlock;
> + }
> +
> + /* Wait for the completion of any pending IOs on both files */
> + inode_dio_wait(inode_in);
> + if (!same_inode)
> + inode_dio_wait(inode_out);
> +
> + ret = filemap_write_and_wait_range(inode_in->i_mapping,
> + pos_in, pos_in + len - 1);
> + if (ret)
> + goto out_unlock;
> +
> + ret = filemap_write_and_wait_range(inode_out->i_mapping,
> + pos_out, pos_out + len - 1);
> + if (ret)
> + goto out_unlock;
> +
> + /*
> + * Check that the extents are the same.
> + */
> + if (is_dedupe) {
> + ret = ocfs2_reflink_compare_extents(inode_in, pos_in,
> + inode_out, pos_out,
> + len, &is_same);
> + if (ret)
> + goto out_unlock;
> + if (!is_same) {
> + ret = -EBADE;
> + goto out_unlock;
> + }
> + }
> +
> + /* Lock out changes to the allocation maps */
> + down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
> + if (!same_inode)
> + down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
> + SINGLE_DEPTH_NESTING);
> +
> + /*
> + * Invalidate the page cache so that we can clear any CoW mappings
> + * in the destination file.
> + */
> + truncate_inode_pages_range(&inode_out->i_data, pos_out,
> + PAGE_ALIGN(pos_out + len) - 1);
> +
> + ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out,
> + out_bh, pos_out, len);
> +
> + up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
> + if (!same_inode)
> + up_write(&OCFS2_I(inode_out)->ip_alloc_sem);
> + if (ret) {
> + mlog_errno(ret);
> + goto out_unlock;
> + }
> +
> + /*
> + * Empty the extent map so that we may get the right extent
> + * record from the disk.
> + */
> + ocfs2_extent_map_trunc(inode_in, 0);
> + ocfs2_extent_map_trunc(inode_out, 0);
> +
> + ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
> + if (ret) {
> + mlog_errno(ret);
> + goto out_unlock;
> + }
> +
> + ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
> + return 0;
> +
> +out_unlock:
> + ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
> + return ret;
> +}
> diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
> index 553edfb..c023e88 100644
> --- a/fs/ocfs2/refcounttree.h
> +++ b/fs/ocfs2/refcounttree.h
> @@ -117,4 +117,11 @@ int ocfs2_reflink_ioctl(struct inode *inode,
> const char __user *oldname,
> const char __user *newname,
> bool preserve);
> +int ocfs2_reflink_remap_range(struct file *file_in,
> + loff_t pos_in,
> + struct file *file_out,
> + loff_t pos_out,
> + u64 len,
> + bool is_dedupe);
> +
> #endif /* OCFS2_REFCOUNTTREE_H */
>
>
> _______________________________________________
> Ocfs2-devel mailing list
> Ocfs2-devel at oss.oracle.com
> https://oss.oracle.com/mailman/listinfo/ocfs2-devel
>
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [Ocfs2-devel] [PATCH 6/6] ocfs2: implement the VFS clone_range, copy_range, and dedupe_range features
2016-11-11 5:49 ` Eric Ren
@ 2016-11-11 6:20 ` Darrick J. Wong
-1 siblings, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2016-11-11 6:20 UTC (permalink / raw)
To: Eric Ren; +Cc: mfasheh, jlbec, linux-fsdevel, ocfs2-devel
On Fri, Nov 11, 2016 at 01:49:48PM +0800, Eric Ren wrote:
> Hi,
>
> A few issues obvious to me:
>
> On 11/10/2016 06:51 AM, Darrick J. Wong wrote:
> >Connect the new VFS clone_range, copy_range, and dedupe_range features
> >to the existing reflink capability of ocfs2. Compared to the existing
> >ocfs2 reflink ioctl, we have to do things a little differently to support
> >the VFS semantics (we can clone subranges of a file but we don't clone
> >xattrs), but the VFS ioctls are more broadly supported.
>
> How can I test the new ocfs2 reflink (with this patch) manually? What
> commands should I use to do xxx_range things?
See the 'reflink', 'dedupe', and 'copy_range' commands in xfs_io.
The first two were added in xfsprogs 4.3, and copy_range in 4.7.
--D
>
> >
> >Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> >---
> > fs/ocfs2/file.c | 62 ++++-
> > fs/ocfs2/file.h | 3
> > fs/ocfs2/refcounttree.c | 619 +++++++++++++++++++++++++++++++++++++++++++++++
> > fs/ocfs2/refcounttree.h | 7 +
> > 4 files changed, 688 insertions(+), 3 deletions(-)
> >
> >
> >diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
> >index 000c234..d5a022d 100644
> >--- a/fs/ocfs2/file.c
> >+++ b/fs/ocfs2/file.c
> >@@ -1667,9 +1667,9 @@ static void ocfs2_calc_trunc_pos(struct inode *inode,
> > *done = ret;
> > }
> >-static int ocfs2_remove_inode_range(struct inode *inode,
> >- struct buffer_head *di_bh, u64 byte_start,
> >- u64 byte_len)
> >+int ocfs2_remove_inode_range(struct inode *inode,
> >+ struct buffer_head *di_bh, u64 byte_start,
> >+ u64 byte_len)
> > {
> > int ret = 0, flags = 0, done = 0, i;
> > u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
> >@@ -2440,6 +2440,56 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
> > return offset;
> > }
> >+static ssize_t ocfs2_file_copy_range(struct file *file_in,
> >+ loff_t pos_in,
> >+ struct file *file_out,
> >+ loff_t pos_out,
> >+ size_t len,
> >+ unsigned int flags)
> >+{
> >+ int error;
> >+
> >+ error = ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
> >+ len, false);
> >+ if (error)
> >+ return error;
> >+ return len;
> >+}
> >+
> >+static int ocfs2_file_clone_range(struct file *file_in,
> >+ loff_t pos_in,
> >+ struct file *file_out,
> >+ loff_t pos_out,
> >+ u64 len)
> >+{
> >+ return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
> >+ len, false);
> >+}
> >+
> >+#define OCFS2_MAX_DEDUPE_LEN (16 * 1024 * 1024)
> >+static ssize_t ocfs2_file_dedupe_range(struct file *src_file,
> >+ u64 loff,
> >+ u64 len,
> >+ struct file *dst_file,
> >+ u64 dst_loff)
> >+{
> >+ int error;
> >+
> >+ /*
> >+ * Limit the total length we will dedupe for each operation.
> >+ * This is intended to bound the total time spent in this
> >+ * ioctl to something sane.
> >+ */
> >+ if (len > OCFS2_MAX_DEDUPE_LEN)
> >+ len = OCFS2_MAX_DEDUPE_LEN;
> >+
> >+ error = ocfs2_reflink_remap_range(src_file, loff, dst_file, dst_loff,
> >+ len, true);
> >+ if (error)
> >+ return error;
> >+ return len;
> >+}
> >+
> > const struct inode_operations ocfs2_file_iops = {
> > .setattr = ocfs2_setattr,
> > .getattr = ocfs2_getattr,
> >@@ -2479,6 +2529,9 @@ const struct file_operations ocfs2_fops = {
> > .splice_read = generic_file_splice_read,
> > .splice_write = iter_file_splice_write,
> > .fallocate = ocfs2_fallocate,
> >+ .copy_file_range = ocfs2_file_copy_range,
> >+ .clone_file_range = ocfs2_file_clone_range,
> >+ .dedupe_file_range = ocfs2_file_dedupe_range,
> > };
> > const struct file_operations ocfs2_dops = {
> >@@ -2524,6 +2577,9 @@ const struct file_operations ocfs2_fops_no_plocks = {
> > .splice_read = generic_file_splice_read,
> > .splice_write = iter_file_splice_write,
> > .fallocate = ocfs2_fallocate,
> >+ .copy_file_range = ocfs2_file_copy_range,
> >+ .clone_file_range = ocfs2_file_clone_range,
> >+ .dedupe_file_range = ocfs2_file_dedupe_range,
> > };
> > const struct file_operations ocfs2_dops_no_plocks = {
> >diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
> >index e8c62f2..897fd9a 100644
> >--- a/fs/ocfs2/file.h
> >+++ b/fs/ocfs2/file.h
> >@@ -82,4 +82,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
> > int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
> > size_t count);
> >+int ocfs2_remove_inode_range(struct inode *inode,
> >+ struct buffer_head *di_bh, u64 byte_start,
> >+ u64 byte_len);
> > #endif /* OCFS2_FILE_H */
> >diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
> >index d92b6c6..3e2198c 100644
> >--- a/fs/ocfs2/refcounttree.c
> >+++ b/fs/ocfs2/refcounttree.c
> >@@ -34,6 +34,7 @@
> > #include "xattr.h"
> > #include "namei.h"
> > #include "ocfs2_trace.h"
> >+#include "file.h"
> > #include <linux/bio.h>
> > #include <linux/blkdev.h>
> >@@ -4447,3 +4448,621 @@ int ocfs2_reflink_ioctl(struct inode *inode,
> > return error;
> > }
> >+
> >+/* Update destination inode size, if necessary. */
> >+static int ocfs2_reflink_update_dest(struct inode *dest,
> >+ struct buffer_head *d_bh,
> >+ loff_t newlen)
> >+{
> >+ handle_t *handle;
> >+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)d_bh->b_data;
> >+ int ret;
> >+
> >+ if (newlen <= i_size_read(dest))
> >+ return 0;
> >+
> >+ handle = ocfs2_start_trans(OCFS2_SB(dest->i_sb),
> >+ OCFS2_INODE_UPDATE_CREDITS);
> >+ if (IS_ERR(handle)) {
> >+ ret = PTR_ERR(handle);
> >+ mlog_errno(ret);
> >+ return ret;
> >+ }
> >+
> >+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(dest), d_bh,
> >+ OCFS2_JOURNAL_ACCESS_WRITE);
> >+ if (ret) {
> >+ mlog_errno(ret);
> >+ goto out_commit;
> >+ }
> >+
> >+ spin_lock(&OCFS2_I(dest)->ip_lock);
> >+ if (newlen > i_size_read(dest)) {
> >+ i_size_write(dest, newlen);
> >+ di->i_size = newlen;
>
> di->i_size = cpu_to_le64(newlen);
>
> >+ }
> >+ spin_unlock(&OCFS2_I(dest)->ip_lock);
> >+
>
> Add ocfs2_update_inode_fsync_trans() here? It looks like this function was
> introduced by you to improve efficiency.
> I just want to jog your memory about this, though I don't know the
> details of why it should be there.
>
> Eric
>
> >+ ocfs2_journal_dirty(handle, d_bh);
> >+
> >+out_commit:
> >+ ocfs2_commit_trans(OCFS2_SB(dest->i_sb), handle);
> >+ return ret;
> >+}
> >+
> >+/* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */
> >+static int ocfs2_reflink_remap_extent(struct inode *s_inode,
> >+ struct buffer_head *s_bh,
> >+ loff_t pos_in,
> >+ struct inode *t_inode,
> >+ struct buffer_head *t_bh,
> >+ loff_t pos_out,
> >+ loff_t len,
> >+ struct ocfs2_cached_dealloc_ctxt *dealloc)
> >+{
> >+ struct ocfs2_extent_tree s_et;
> >+ struct ocfs2_extent_tree t_et;
> >+ struct ocfs2_dinode *dis;
> >+ struct buffer_head *ref_root_bh = NULL;
> >+ struct ocfs2_refcount_tree *ref_tree;
> >+ struct ocfs2_super *osb;
> >+ loff_t pstart, plen;
> >+ u32 p_cluster, num_clusters, slast, spos, tpos;
> >+ unsigned int ext_flags;
> >+ int ret = 0;
> >+
> >+ osb = OCFS2_SB(s_inode->i_sb);
> >+ dis = (struct ocfs2_dinode *)s_bh->b_data;
> >+ ocfs2_init_dinode_extent_tree(&s_et, INODE_CACHE(s_inode), s_bh);
> >+ ocfs2_init_dinode_extent_tree(&t_et, INODE_CACHE(t_inode), t_bh);
> >+
> >+ spos = ocfs2_bytes_to_clusters(s_inode->i_sb, pos_in);
> >+ tpos = ocfs2_bytes_to_clusters(t_inode->i_sb, pos_out);
> >+ slast = ocfs2_clusters_for_bytes(s_inode->i_sb, pos_in + len);
> >+
> >+ while (spos < slast) {
> >+ if (fatal_signal_pending(current)) {
> >+ ret = -EINTR;
> >+ goto out;
> >+ }
> >+
> >+ /* Look up the extent. */
> >+ ret = ocfs2_get_clusters(s_inode, spos, &p_cluster,
> >+ &num_clusters, &ext_flags);
> >+ if (ret) {
> >+ mlog_errno(ret);
> >+ goto out;
> >+ }
> >+
> >+ num_clusters = min_t(u32, num_clusters, slast - spos);
> >+
> >+ /* Punch out the dest range. */
> >+ pstart = ocfs2_clusters_to_bytes(t_inode->i_sb, tpos);
> >+ plen = ocfs2_clusters_to_bytes(t_inode->i_sb, num_clusters);
> >+ ret = ocfs2_remove_inode_range(t_inode, t_bh, pstart, plen);
> >+ if (ret) {
> >+ mlog_errno(ret);
> >+ goto out;
> >+ }
> >+
> >+ if (p_cluster == 0)
> >+ goto next_loop;
> >+
> >+ /* Lock the refcount btree... */
> >+ ret = ocfs2_lock_refcount_tree(osb,
> >+ le64_to_cpu(dis->i_refcount_loc),
> >+ 1, &ref_tree, &ref_root_bh);
> >+ if (ret) {
> >+ mlog_errno(ret);
> >+ goto out;
> >+ }
> >+
> >+ /* Mark s_inode's extent as refcounted. */
> >+ if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) {
> >+ ret = ocfs2_add_refcount_flag(s_inode, &s_et,
> >+ &ref_tree->rf_ci,
> >+ ref_root_bh, spos,
> >+ p_cluster, num_clusters,
> >+ dealloc, NULL);
> >+ if (ret) {
> >+ mlog_errno(ret);
> >+ goto out_unlock_refcount;
> >+ }
> >+ }
> >+
> >+ /* Map in the new extent. */
> >+ ext_flags |= OCFS2_EXT_REFCOUNTED;
> >+ ret = ocfs2_add_refcounted_extent(t_inode, &t_et,
> >+ &ref_tree->rf_ci,
> >+ ref_root_bh,
> >+ tpos, p_cluster,
> >+ num_clusters,
> >+ ext_flags,
> >+ dealloc);
> >+ if (ret) {
> >+ mlog_errno(ret);
> >+ goto out_unlock_refcount;
> >+ }
> >+
> >+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
> >+ brelse(ref_root_bh);
> >+next_loop:
> >+ spos += num_clusters;
> >+ tpos += num_clusters;
> >+ }
> >+
> >+out:
> >+ return ret;
> >+out_unlock_refcount:
> >+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
> >+ brelse(ref_root_bh);
> >+ return ret;
> >+}
> >+
> >+/* Set up refcount tree and remap s_inode to t_inode. */
> >+static int ocfs2_reflink_remap_blocks(struct inode *s_inode,
> >+ struct buffer_head *s_bh,
> >+ loff_t pos_in,
> >+ struct inode *t_inode,
> >+ struct buffer_head *t_bh,
> >+ loff_t pos_out,
> >+ loff_t len)
> >+{
> >+ struct ocfs2_cached_dealloc_ctxt dealloc;
> >+ struct ocfs2_super *osb;
> >+ struct ocfs2_dinode *dis;
> >+ struct ocfs2_dinode *dit;
> >+ int ret;
> >+
> >+ osb = OCFS2_SB(s_inode->i_sb);
> >+ dis = (struct ocfs2_dinode *)s_bh->b_data;
> >+ dit = (struct ocfs2_dinode *)t_bh->b_data;
> >+ ocfs2_init_dealloc_ctxt(&dealloc);
> >+
> >+ /*
> >+ * If both inodes belong to two different refcount groups then
> >+ * forget it because we don't know how (or want) to go merging
> >+ * refcount trees.
> >+ */
> >+ ret = -EOPNOTSUPP;
> >+ if (ocfs2_is_refcount_inode(s_inode) &&
> >+ ocfs2_is_refcount_inode(t_inode) &&
> >+ le64_to_cpu(dis->i_refcount_loc) !=
> >+ le64_to_cpu(dit->i_refcount_loc))
> >+ goto out;
> >+
> >+ /* Neither inode has a refcount tree. Add one to s_inode. */
> >+ if (!ocfs2_is_refcount_inode(s_inode) &&
> >+ !ocfs2_is_refcount_inode(t_inode)) {
> >+ ret = ocfs2_create_refcount_tree(s_inode, s_bh);
> >+ if (ret) {
> >+ mlog_errno(ret);
> >+ goto out;
> >+ }
> >+ }
> >+
> >+ /* Ensure that both inodes end up with the same refcount tree. */
> >+ if (!ocfs2_is_refcount_inode(s_inode)) {
> >+ ret = ocfs2_set_refcount_tree(s_inode, s_bh,
> >+ le64_to_cpu(dit->i_refcount_loc));
> >+ if (ret) {
> >+ mlog_errno(ret);
> >+ goto out;
> >+ }
> >+ }
> >+ if (!ocfs2_is_refcount_inode(t_inode)) {
> >+ ret = ocfs2_set_refcount_tree(t_inode, t_bh,
> >+ le64_to_cpu(dis->i_refcount_loc));
> >+ if (ret) {
> >+ mlog_errno(ret);
> >+ goto out;
> >+ }
> >+ }
> >+
> >+ /*
> >+ * If we're reflinking the entire file and the source is inline
> >+ * data, just copy the contents.
> >+ */
> >+ if (pos_in == pos_out && pos_in == 0 && len == i_size_read(s_inode) &&
> >+ i_size_read(t_inode) <= len &&
> >+ (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) {
> >+ ret = ocfs2_duplicate_inline_data(s_inode, s_bh, t_inode, t_bh);
> >+ if (ret)
> >+ mlog_errno(ret);
> >+ goto out;
> >+ }
> >+
> >+ ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh,
> >+ pos_out, len, &dealloc);
> >+ if (ret) {
> >+ mlog_errno(ret);
> >+ goto out;
> >+ }
> >+
> >+out:
> >+ if (ocfs2_dealloc_has_cluster(&dealloc)) {
> >+ ocfs2_schedule_truncate_log_flush(osb, 1);
> >+ ocfs2_run_deallocs(osb, &dealloc);
> >+ }
> >+
> >+ return ret;
> >+}
> >+
> >+/* Lock an inode and grab a bh pointing to the inode. */
> >+static int ocfs2_reflink_inodes_lock(struct inode *s_inode,
> >+ struct buffer_head **bh1,
> >+ struct inode *t_inode,
> >+ struct buffer_head **bh2)
> >+{
> >+ struct inode *inode1;
> >+ struct inode *inode2;
> >+ struct ocfs2_inode_info *oi1;
> >+ struct ocfs2_inode_info *oi2;
> >+ bool same_inode = (s_inode == t_inode);
> >+ int status;
> >+
> >+ /* First grab the VFS and rw locks. */
> >+ inode1 = s_inode;
> >+ inode2 = t_inode;
> >+ if (inode1->i_ino > inode2->i_ino)
> >+ swap(inode1, inode2);
> >+
> >+ inode_lock(inode1);
> >+ status = ocfs2_rw_lock(inode1, 1);
> >+ if (status) {
> >+ mlog_errno(status);
> >+ goto out_i1;
> >+ }
> >+ if (!same_inode) {
> >+ inode_lock_nested(inode2, I_MUTEX_CHILD);
> >+ status = ocfs2_rw_lock(inode2, 1);
> >+ if (status) {
> >+ mlog_errno(status);
> >+ goto out_i2;
> >+ }
> >+ }
> >+
> >+ /* Now go for the cluster locks */
> >+ oi1 = OCFS2_I(inode1);
> >+ oi2 = OCFS2_I(inode2);
> >+
> >+ trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
> >+ (unsigned long long)oi2->ip_blkno);
> >+
> >+ if (*bh1)
> >+ *bh1 = NULL;
> >+ if (*bh2)
> >+ *bh2 = NULL;
> >+
> >+ /* We always want to lock the one with the lower lockid first. */
> >+ if (oi1->ip_blkno > oi2->ip_blkno)
> >+ mlog_errno(-ENOLCK);
> >+
> >+ /* lock id1 */
> >+ status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_REFLINK_TARGET);
> >+ if (status < 0) {
> >+ if (status != -ENOENT)
> >+ mlog_errno(status);
> >+ goto out_rw2;
> >+ }
> >+
> >+ /* lock id2 */
> >+ if (!same_inode) {
> >+ status = ocfs2_inode_lock_nested(inode2, bh2, 1,
> >+ OI_LS_REFLINK_TARGET);
> >+ if (status < 0) {
> >+ if (status != -ENOENT)
> >+ mlog_errno(status);
> >+ goto out_cl1;
> >+ }
> >+ } else
> >+ *bh2 = *bh1;
> >+
> >+ trace_ocfs2_double_lock_end(
> >+ (unsigned long long)OCFS2_I(inode1)->ip_blkno,
> >+ (unsigned long long)OCFS2_I(inode2)->ip_blkno);
> >+
> >+ return 0;
> >+
> >+out_cl1:
> >+ ocfs2_inode_unlock(inode1, 1);
> >+ brelse(*bh1);
> >+ *bh1 = NULL;
> >+out_rw2:
> >+ ocfs2_rw_unlock(inode2, 1);
> >+out_i2:
> >+ inode_unlock(inode2);
> >+ ocfs2_rw_unlock(inode1, 1);
> >+out_i1:
> >+ inode_unlock(inode1);
> >+ return status;
> >+}
> >+
> >+/* Unlock both inodes and release buffers. */
> >+static void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
> >+ struct buffer_head *s_bh,
> >+ struct inode *t_inode,
> >+ struct buffer_head *t_bh)
> >+{
> >+ ocfs2_inode_unlock(s_inode, 1);
> >+ ocfs2_rw_unlock(s_inode, 1);
> >+ inode_unlock(s_inode);
> >+ brelse(s_bh);
> >+
> >+ if (s_inode == t_inode)
> >+ return;
> >+
> >+ ocfs2_inode_unlock(t_inode, 1);
> >+ ocfs2_rw_unlock(t_inode, 1);
> >+ inode_unlock(t_inode);
> >+ brelse(t_bh);
> >+}
> >+
> >+/*
> >+ * Read a page's worth of file data into the page cache. Return the page
> >+ * locked.
> >+ */
> >+static struct page *ocfs2_reflink_get_page(struct inode *inode,
> >+ loff_t offset)
> >+{
> >+ struct address_space *mapping;
> >+ struct page *page;
> >+ pgoff_t n;
> >+
> >+ n = offset >> PAGE_SHIFT;
> >+ mapping = inode->i_mapping;
> >+ page = read_mapping_page(mapping, n, NULL);
> >+ if (IS_ERR(page))
> >+ return page;
> >+ if (!PageUptodate(page)) {
> >+ put_page(page);
> >+ return ERR_PTR(-EIO);
> >+ }
> >+ lock_page(page);
> >+ return page;
> >+}
> >+
> >+/*
> >+ * Compare extents of two files to see if they are the same.
> >+ */
> >+static int ocfs2_reflink_compare_extents(struct inode *src,
> >+ loff_t srcoff,
> >+ struct inode *dest,
> >+ loff_t destoff,
> >+ loff_t len,
> >+ bool *is_same)
> >+{
> >+ loff_t src_poff;
> >+ loff_t dest_poff;
> >+ void *src_addr;
> >+ void *dest_addr;
> >+ struct page *src_page;
> >+ struct page *dest_page;
> >+ loff_t cmp_len;
> >+ bool same;
> >+ int error;
> >+
> >+ error = -EINVAL;
> >+ same = true;
> >+ while (len) {
> >+ src_poff = srcoff & (PAGE_SIZE - 1);
> >+ dest_poff = destoff & (PAGE_SIZE - 1);
> >+ cmp_len = min(PAGE_SIZE - src_poff,
> >+ PAGE_SIZE - dest_poff);
> >+ cmp_len = min(cmp_len, len);
> >+ if (cmp_len <= 0) {
> >+ mlog_errno(-EUCLEAN);
> >+ goto out_error;
> >+ }
> >+
> >+ src_page = ocfs2_reflink_get_page(src, srcoff);
> >+ if (IS_ERR(src_page)) {
> >+ error = PTR_ERR(src_page);
> >+ goto out_error;
> >+ }
> >+ dest_page = ocfs2_reflink_get_page(dest, destoff);
> >+ if (IS_ERR(dest_page)) {
> >+ error = PTR_ERR(dest_page);
> >+ unlock_page(src_page);
> >+ put_page(src_page);
> >+ goto out_error;
> >+ }
> >+ src_addr = kmap_atomic(src_page);
> >+ dest_addr = kmap_atomic(dest_page);
> >+
> >+ flush_dcache_page(src_page);
> >+ flush_dcache_page(dest_page);
> >+
> >+ if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
> >+ same = false;
> >+
> >+ kunmap_atomic(dest_addr);
> >+ kunmap_atomic(src_addr);
> >+ unlock_page(dest_page);
> >+ unlock_page(src_page);
> >+ put_page(dest_page);
> >+ put_page(src_page);
> >+
> >+ if (!same)
> >+ break;
> >+
> >+ srcoff += cmp_len;
> >+ destoff += cmp_len;
> >+ len -= cmp_len;
> >+ }
> >+
> >+ *is_same = same;
> >+ return 0;
> >+
> >+out_error:
> >+ return error;
> >+}
> >+
> >+/* Link a range of blocks from one file to another. */
> >+int ocfs2_reflink_remap_range(struct file *file_in,
> >+ loff_t pos_in,
> >+ struct file *file_out,
> >+ loff_t pos_out,
> >+ u64 len,
> >+ bool is_dedupe)
> >+{
> >+ struct inode *inode_in = file_inode(file_in);
> >+ struct inode *inode_out = file_inode(file_out);
> >+ struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
> >+ struct buffer_head *in_bh = NULL, *out_bh = NULL;
> >+ loff_t bs = 1 << OCFS2_SB(inode_in->i_sb)->s_clustersize_bits;
> >+ bool same_inode = (inode_in == inode_out);
> >+ bool is_same = false;
> >+ loff_t isize;
> >+ ssize_t ret;
> >+ loff_t blen;
> >+
> >+ if (!ocfs2_refcount_tree(osb))
> >+ return -EOPNOTSUPP;
> >+ if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
> >+ return -EROFS;
> >+
> >+ /* Lock both files against IO */
> >+ ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh);
> >+ if (ret)
> >+ return ret;
> >+
> >+ ret = -EINVAL;
> >+ if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
> >+ (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
> >+ goto out_unlock;
> >+
> >+ /* Don't touch certain kinds of inodes */
> >+ ret = -EPERM;
> >+ if (IS_IMMUTABLE(inode_out))
> >+ goto out_unlock;
> >+
> >+ ret = -ETXTBSY;
> >+ if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
> >+ goto out_unlock;
> >+
> >+ /* Don't reflink dirs, pipes, sockets... */
> >+ ret = -EISDIR;
> >+ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
> >+ goto out_unlock;
> >+ ret = -EINVAL;
> >+ if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode))
> >+ goto out_unlock;
> >+ if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
> >+ goto out_unlock;
> >+
> >+ /* Are we going all the way to the end? */
> >+ isize = i_size_read(inode_in);
> >+ if (isize == 0) {
> >+ ret = 0;
> >+ goto out_unlock;
> >+ }
> >+
> >+ if (len == 0)
> >+ len = isize - pos_in;
> >+
> >+ /* Ensure offsets don't wrap and the input is inside i_size */
> >+ if (pos_in + len < pos_in || pos_out + len < pos_out ||
> >+ pos_in + len > isize)
> >+ goto out_unlock;
> >+
> >+ /* Don't allow dedupe past EOF in the dest file */
> >+ if (is_dedupe) {
> >+ loff_t disize;
> >+
> >+ disize = i_size_read(inode_out);
> >+ if (pos_out >= disize || pos_out + len > disize)
> >+ goto out_unlock;
> >+ }
> >+
> >+ /* If we're linking to EOF, continue to the block boundary. */
> >+ if (pos_in + len == isize)
> >+ blen = ALIGN(isize, bs) - pos_in;
> >+ else
> >+ blen = len;
> >+
> >+ /* Only reflink if we're aligned to block boundaries */
> >+ if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
> >+ !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
> >+ goto out_unlock;
> >+
> >+ /* Don't allow overlapped reflink within the same file */
> >+ if (same_inode) {
> >+ if (pos_out + blen > pos_in && pos_out < pos_in + blen)
> >+ goto out_unlock;
> >+ }
> >+
> >+ /* Wait for the completion of any pending IOs on both files */
> >+ inode_dio_wait(inode_in);
> >+ if (!same_inode)
> >+ inode_dio_wait(inode_out);
> >+
> >+ ret = filemap_write_and_wait_range(inode_in->i_mapping,
> >+ pos_in, pos_in + len - 1);
> >+ if (ret)
> >+ goto out_unlock;
> >+
> >+ ret = filemap_write_and_wait_range(inode_out->i_mapping,
> >+ pos_out, pos_out + len - 1);
> >+ if (ret)
> >+ goto out_unlock;
> >+
> >+ /*
> >+ * Check that the extents are the same.
> >+ */
> >+ if (is_dedupe) {
> >+ ret = ocfs2_reflink_compare_extents(inode_in, pos_in,
> >+ inode_out, pos_out,
> >+ len, &is_same);
> >+ if (ret)
> >+ goto out_unlock;
> >+ if (!is_same) {
> >+ ret = -EBADE;
> >+ goto out_unlock;
> >+ }
> >+ }
> >+
> >+ /* Lock out changes to the allocation maps */
> >+ down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
> >+ if (!same_inode)
> >+ down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
> >+ SINGLE_DEPTH_NESTING);
> >+
> >+ /*
> >+ * Invalidate the page cache so that we can clear any CoW mappings
> >+ * in the destination file.
> >+ */
> >+ truncate_inode_pages_range(&inode_out->i_data, pos_out,
> >+ PAGE_ALIGN(pos_out + len) - 1);
> >+
> >+ ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out,
> >+ out_bh, pos_out, len);
> >+
> >+ up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
> >+ if (!same_inode)
> >+ up_write(&OCFS2_I(inode_out)->ip_alloc_sem);
> >+ if (ret) {
> >+ mlog_errno(ret);
> >+ goto out_unlock;
> >+ }
> >+
> >+ /*
> >+ * Empty the extent map so that we may get the right extent
> >+ * record from the disk.
> >+ */
> >+ ocfs2_extent_map_trunc(inode_in, 0);
> >+ ocfs2_extent_map_trunc(inode_out, 0);
> >+
> >+ ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
> >+ if (ret) {
> >+ mlog_errno(ret);
> >+ goto out_unlock;
> >+ }
> >+
> >+ ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
> >+ return 0;
> >+
> >+out_unlock:
> >+ ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
> >+ return ret;
> >+}
> >diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
> >index 553edfb..c023e88 100644
> >--- a/fs/ocfs2/refcounttree.h
> >+++ b/fs/ocfs2/refcounttree.h
> >@@ -117,4 +117,11 @@ int ocfs2_reflink_ioctl(struct inode *inode,
> > const char __user *oldname,
> > const char __user *newname,
> > bool preserve);
> >+int ocfs2_reflink_remap_range(struct file *file_in,
> >+ loff_t pos_in,
> >+ struct file *file_out,
> >+ loff_t pos_out,
> >+ u64 len,
> >+ bool is_dedupe);
> >+
> > #endif /* OCFS2_REFCOUNTTREE_H */
> >
> >
> >_______________________________________________
> >Ocfs2-devel mailing list
> >Ocfs2-devel@oss.oracle.com
> >https://oss.oracle.com/mailman/listinfo/ocfs2-devel
> >
>
^ permalink raw reply [flat|nested] 42+ messages in thread
* [Ocfs2-devel] [PATCH 6/6] ocfs2: implement the VFS clone_range, copy_range, and dedupe_range features
@ 2016-11-11 6:20 ` Darrick J. Wong
0 siblings, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2016-11-11 6:20 UTC (permalink / raw)
To: Eric Ren; +Cc: mfasheh, jlbec, linux-fsdevel, ocfs2-devel
On Fri, Nov 11, 2016 at 01:49:48PM +0800, Eric Ren wrote:
> Hi,
>
> A few issues obvious to me:
>
> On 11/10/2016 06:51 AM, Darrick J. Wong wrote:
> >Connect the new VFS clone_range, copy_range, and dedupe_range features
> >to the existing reflink capability of ocfs2. Compared to the existing
> >ocfs2 reflink ioctl, we have to do things a little differently to support
> >the VFS semantics (we can clone subranges of a file but we don't clone
> >xattrs), but the VFS ioctls are more broadly supported.
>
> How can I test the new ocfs2 reflink (with this patch) manually? What
> commands should I use to do xxx_range things?
See the 'reflink', 'dedupe', and 'copy_range' commands in xfs_io.
The first two were added in xfsprogs 4.3, and copy_range in 4.7.
--D
>
> >
> >Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> >---
> > fs/ocfs2/file.c | 62 ++++-
> > fs/ocfs2/file.h | 3
> > fs/ocfs2/refcounttree.c | 619 +++++++++++++++++++++++++++++++++++++++++++++++
> > fs/ocfs2/refcounttree.h | 7 +
> > 4 files changed, 688 insertions(+), 3 deletions(-)
> >
> >
> >diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
> >index 000c234..d5a022d 100644
> >--- a/fs/ocfs2/file.c
> >+++ b/fs/ocfs2/file.c
> >@@ -1667,9 +1667,9 @@ static void ocfs2_calc_trunc_pos(struct inode *inode,
> > *done = ret;
> > }
> >-static int ocfs2_remove_inode_range(struct inode *inode,
> >- struct buffer_head *di_bh, u64 byte_start,
> >- u64 byte_len)
> >+int ocfs2_remove_inode_range(struct inode *inode,
> >+ struct buffer_head *di_bh, u64 byte_start,
> >+ u64 byte_len)
> > {
> > int ret = 0, flags = 0, done = 0, i;
> > u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
> >@@ -2440,6 +2440,56 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
> > return offset;
> > }
> >+static ssize_t ocfs2_file_copy_range(struct file *file_in,
> >+ loff_t pos_in,
> >+ struct file *file_out,
> >+ loff_t pos_out,
> >+ size_t len,
> >+ unsigned int flags)
> >+{
> >+ int error;
> >+
> >+ error = ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
> >+ len, false);
> >+ if (error)
> >+ return error;
> >+ return len;
> >+}
> >+
> >+static int ocfs2_file_clone_range(struct file *file_in,
> >+ loff_t pos_in,
> >+ struct file *file_out,
> >+ loff_t pos_out,
> >+ u64 len)
> >+{
> >+ return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
> >+ len, false);
> >+}
> >+
> >+#define OCFS2_MAX_DEDUPE_LEN (16 * 1024 * 1024)
> >+static ssize_t ocfs2_file_dedupe_range(struct file *src_file,
> >+ u64 loff,
> >+ u64 len,
> >+ struct file *dst_file,
> >+ u64 dst_loff)
> >+{
> >+ int error;
> >+
> >+ /*
> >+ * Limit the total length we will dedupe for each operation.
> >+ * This is intended to bound the total time spent in this
> >+ * ioctl to something sane.
> >+ */
> >+ if (len > OCFS2_MAX_DEDUPE_LEN)
> >+ len = OCFS2_MAX_DEDUPE_LEN;
> >+
> >+ error = ocfs2_reflink_remap_range(src_file, loff, dst_file, dst_loff,
> >+ len, true);
> >+ if (error)
> >+ return error;
> >+ return len;
> >+}
> >+
> > const struct inode_operations ocfs2_file_iops = {
> > .setattr = ocfs2_setattr,
> > .getattr = ocfs2_getattr,
> >@@ -2479,6 +2529,9 @@ const struct file_operations ocfs2_fops = {
> > .splice_read = generic_file_splice_read,
> > .splice_write = iter_file_splice_write,
> > .fallocate = ocfs2_fallocate,
> >+ .copy_file_range = ocfs2_file_copy_range,
> >+ .clone_file_range = ocfs2_file_clone_range,
> >+ .dedupe_file_range = ocfs2_file_dedupe_range,
> > };
> > const struct file_operations ocfs2_dops = {
> >@@ -2524,6 +2577,9 @@ const struct file_operations ocfs2_fops_no_plocks = {
> > .splice_read = generic_file_splice_read,
> > .splice_write = iter_file_splice_write,
> > .fallocate = ocfs2_fallocate,
> >+ .copy_file_range = ocfs2_file_copy_range,
> >+ .clone_file_range = ocfs2_file_clone_range,
> >+ .dedupe_file_range = ocfs2_file_dedupe_range,
> > };
> > const struct file_operations ocfs2_dops_no_plocks = {
> >diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
> >index e8c62f2..897fd9a 100644
> >--- a/fs/ocfs2/file.h
> >+++ b/fs/ocfs2/file.h
> >@@ -82,4 +82,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
> > int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
> > size_t count);
> >+int ocfs2_remove_inode_range(struct inode *inode,
> >+ struct buffer_head *di_bh, u64 byte_start,
> >+ u64 byte_len);
> > #endif /* OCFS2_FILE_H */
> >diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
> >index d92b6c6..3e2198c 100644
> >--- a/fs/ocfs2/refcounttree.c
> >+++ b/fs/ocfs2/refcounttree.c
> >@@ -34,6 +34,7 @@
> > #include "xattr.h"
> > #include "namei.h"
> > #include "ocfs2_trace.h"
> >+#include "file.h"
> > #include <linux/bio.h>
> > #include <linux/blkdev.h>
> >@@ -4447,3 +4448,621 @@ int ocfs2_reflink_ioctl(struct inode *inode,
> > return error;
> > }
> >+
> >+/* Update destination inode size, if necessary. */
> >+static int ocfs2_reflink_update_dest(struct inode *dest,
> >+ struct buffer_head *d_bh,
> >+ loff_t newlen)
> >+{
> >+ handle_t *handle;
> >+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)d_bh->b_data;
> >+ int ret;
> >+
> >+ if (newlen <= i_size_read(dest))
> >+ return 0;
> >+
> >+ handle = ocfs2_start_trans(OCFS2_SB(dest->i_sb),
> >+ OCFS2_INODE_UPDATE_CREDITS);
> >+ if (IS_ERR(handle)) {
> >+ ret = PTR_ERR(handle);
> >+ mlog_errno(ret);
> >+ return ret;
> >+ }
> >+
> >+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(dest), d_bh,
> >+ OCFS2_JOURNAL_ACCESS_WRITE);
> >+ if (ret) {
> >+ mlog_errno(ret);
> >+ goto out_commit;
> >+ }
> >+
> >+ spin_lock(&OCFS2_I(dest)->ip_lock);
> >+ if (newlen > i_size_read(dest)) {
> >+ i_size_write(dest, newlen);
> >+ di->i_size = newlen;
>
> di->i_size = cpu_to_le64(newlen);
>
> >+ }
> >+ spin_unlock(&OCFS2_I(dest)->ip_lock);
> >+
>
> Add ocfs2_update_inode_fsync_trans() here? It looks like this function was
> introduced by you to improve efficiency.
> I just want to jog your memory about this, though I don't know the
> details of why it should be there.
>
> Eric
>
> >+ ocfs2_journal_dirty(handle, d_bh);
> >+
> >+out_commit:
> >+ ocfs2_commit_trans(OCFS2_SB(dest->i_sb), handle);
> >+ return ret;
> >+}
> >+
> >+/* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */
> >+static int ocfs2_reflink_remap_extent(struct inode *s_inode,
> >+ struct buffer_head *s_bh,
> >+ loff_t pos_in,
> >+ struct inode *t_inode,
> >+ struct buffer_head *t_bh,
> >+ loff_t pos_out,
> >+ loff_t len,
> >+ struct ocfs2_cached_dealloc_ctxt *dealloc)
> >+{
> >+ struct ocfs2_extent_tree s_et;
> >+ struct ocfs2_extent_tree t_et;
> >+ struct ocfs2_dinode *dis;
> >+ struct buffer_head *ref_root_bh = NULL;
> >+ struct ocfs2_refcount_tree *ref_tree;
> >+ struct ocfs2_super *osb;
> >+ loff_t pstart, plen;
> >+ u32 p_cluster, num_clusters, slast, spos, tpos;
> >+ unsigned int ext_flags;
> >+ int ret = 0;
> >+
> >+ osb = OCFS2_SB(s_inode->i_sb);
> >+ dis = (struct ocfs2_dinode *)s_bh->b_data;
> >+ ocfs2_init_dinode_extent_tree(&s_et, INODE_CACHE(s_inode), s_bh);
> >+ ocfs2_init_dinode_extent_tree(&t_et, INODE_CACHE(t_inode), t_bh);
> >+
> >+ spos = ocfs2_bytes_to_clusters(s_inode->i_sb, pos_in);
> >+ tpos = ocfs2_bytes_to_clusters(t_inode->i_sb, pos_out);
> >+ slast = ocfs2_clusters_for_bytes(s_inode->i_sb, pos_in + len);
> >+
> >+ while (spos < slast) {
> >+ if (fatal_signal_pending(current)) {
> >+ ret = -EINTR;
> >+ goto out;
> >+ }
> >+
> >+ /* Look up the extent. */
> >+ ret = ocfs2_get_clusters(s_inode, spos, &p_cluster,
> >+ &num_clusters, &ext_flags);
> >+ if (ret) {
> >+ mlog_errno(ret);
> >+ goto out;
> >+ }
> >+
> >+ num_clusters = min_t(u32, num_clusters, slast - spos);
> >+
> >+ /* Punch out the dest range. */
> >+ pstart = ocfs2_clusters_to_bytes(t_inode->i_sb, tpos);
> >+ plen = ocfs2_clusters_to_bytes(t_inode->i_sb, num_clusters);
> >+ ret = ocfs2_remove_inode_range(t_inode, t_bh, pstart, plen);
> >+ if (ret) {
> >+ mlog_errno(ret);
> >+ goto out;
> >+ }
> >+
> >+ if (p_cluster == 0)
> >+ goto next_loop;
> >+
> >+ /* Lock the refcount btree... */
> >+ ret = ocfs2_lock_refcount_tree(osb,
> >+ le64_to_cpu(dis->i_refcount_loc),
> >+ 1, &ref_tree, &ref_root_bh);
> >+ if (ret) {
> >+ mlog_errno(ret);
> >+ goto out;
> >+ }
> >+
> >+ /* Mark s_inode's extent as refcounted. */
> >+ if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) {
> >+ ret = ocfs2_add_refcount_flag(s_inode, &s_et,
> >+ &ref_tree->rf_ci,
> >+ ref_root_bh, spos,
> >+ p_cluster, num_clusters,
> >+ dealloc, NULL);
> >+ if (ret) {
> >+ mlog_errno(ret);
> >+ goto out_unlock_refcount;
> >+ }
> >+ }
> >+
> >+ /* Map in the new extent. */
> >+ ext_flags |= OCFS2_EXT_REFCOUNTED;
> >+ ret = ocfs2_add_refcounted_extent(t_inode, &t_et,
> >+ &ref_tree->rf_ci,
> >+ ref_root_bh,
> >+ tpos, p_cluster,
> >+ num_clusters,
> >+ ext_flags,
> >+ dealloc);
> >+ if (ret) {
> >+ mlog_errno(ret);
> >+ goto out_unlock_refcount;
> >+ }
> >+
> >+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
> >+ brelse(ref_root_bh);
> >+next_loop:
> >+ spos += num_clusters;
> >+ tpos += num_clusters;
> >+ }
> >+
> >+out:
> >+ return ret;
> >+out_unlock_refcount:
> >+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
> >+ brelse(ref_root_bh);
> >+ return ret;
> >+}
> >+
> >+/* Set up refcount tree and remap s_inode to t_inode. */
> >+static int ocfs2_reflink_remap_blocks(struct inode *s_inode,
> >+ struct buffer_head *s_bh,
> >+ loff_t pos_in,
> >+ struct inode *t_inode,
> >+ struct buffer_head *t_bh,
> >+ loff_t pos_out,
> >+ loff_t len)
> >+{
> >+ struct ocfs2_cached_dealloc_ctxt dealloc;
> >+ struct ocfs2_super *osb;
> >+ struct ocfs2_dinode *dis;
> >+ struct ocfs2_dinode *dit;
> >+ int ret;
> >+
> >+ osb = OCFS2_SB(s_inode->i_sb);
> >+ dis = (struct ocfs2_dinode *)s_bh->b_data;
> >+ dit = (struct ocfs2_dinode *)t_bh->b_data;
> >+ ocfs2_init_dealloc_ctxt(&dealloc);
> >+
> >+ /*
> >+ * If both inodes belong to two different refcount groups then
> >+ * forget it because we don't know how (or want) to go merging
> >+ * refcount trees.
> >+ */
> >+ ret = -EOPNOTSUPP;
> >+ if (ocfs2_is_refcount_inode(s_inode) &&
> >+ ocfs2_is_refcount_inode(t_inode) &&
> >+ le64_to_cpu(dis->i_refcount_loc) !=
> >+ le64_to_cpu(dit->i_refcount_loc))
> >+ goto out;
> >+
> >+ /* Neither inode has a refcount tree. Add one to s_inode. */
> >+ if (!ocfs2_is_refcount_inode(s_inode) &&
> >+ !ocfs2_is_refcount_inode(t_inode)) {
> >+ ret = ocfs2_create_refcount_tree(s_inode, s_bh);
> >+ if (ret) {
> >+ mlog_errno(ret);
> >+ goto out;
> >+ }
> >+ }
> >+
> >+ /* Ensure that both inodes end up with the same refcount tree. */
> >+ if (!ocfs2_is_refcount_inode(s_inode)) {
> >+ ret = ocfs2_set_refcount_tree(s_inode, s_bh,
> >+ le64_to_cpu(dit->i_refcount_loc));
> >+ if (ret) {
> >+ mlog_errno(ret);
> >+ goto out;
> >+ }
> >+ }
> >+ if (!ocfs2_is_refcount_inode(t_inode)) {
> >+ ret = ocfs2_set_refcount_tree(t_inode, t_bh,
> >+ le64_to_cpu(dis->i_refcount_loc));
> >+ if (ret) {
> >+ mlog_errno(ret);
> >+ goto out;
> >+ }
> >+ }
> >+
> >+ /*
> >+ * If we're reflinking the entire file and the source is inline
> >+ * data, just copy the contents.
> >+ */
> >+ if (pos_in == pos_out && pos_in == 0 && len == i_size_read(s_inode) &&
> >+ i_size_read(t_inode) <= len &&
> >+ (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) {
> >+ ret = ocfs2_duplicate_inline_data(s_inode, s_bh, t_inode, t_bh);
> >+ if (ret)
> >+ mlog_errno(ret);
> >+ goto out;
> >+ }
> >+
> >+ ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh,
> >+ pos_out, len, &dealloc);
> >+ if (ret) {
> >+ mlog_errno(ret);
> >+ goto out;
> >+ }
> >+
> >+out:
> >+ if (ocfs2_dealloc_has_cluster(&dealloc)) {
> >+ ocfs2_schedule_truncate_log_flush(osb, 1);
> >+ ocfs2_run_deallocs(osb, &dealloc);
> >+ }
> >+
> >+ return ret;
> >+}
> >+
> >+/* Lock an inode and grab a bh pointing to the inode. */
> >+static int ocfs2_reflink_inodes_lock(struct inode *s_inode,
> >+ struct buffer_head **bh1,
> >+ struct inode *t_inode,
> >+ struct buffer_head **bh2)
> >+{
> >+ struct inode *inode1;
> >+ struct inode *inode2;
> >+ struct ocfs2_inode_info *oi1;
> >+ struct ocfs2_inode_info *oi2;
> >+ bool same_inode = (s_inode == t_inode);
> >+ int status;
> >+
> >+ /* First grab the VFS and rw locks. */
> >+ inode1 = s_inode;
> >+ inode2 = t_inode;
> >+ if (inode1->i_ino > inode2->i_ino)
> >+ swap(inode1, inode2);
> >+
> >+ inode_lock(inode1);
> >+ status = ocfs2_rw_lock(inode1, 1);
> >+ if (status) {
> >+ mlog_errno(status);
> >+ goto out_i1;
> >+ }
> >+ if (!same_inode) {
> >+ inode_lock_nested(inode2, I_MUTEX_CHILD);
> >+ status = ocfs2_rw_lock(inode2, 1);
> >+ if (status) {
> >+ mlog_errno(status);
> >+ goto out_i2;
> >+ }
> >+ }
> >+
> >+ /* Now go for the cluster locks */
> >+ oi1 = OCFS2_I(inode1);
> >+ oi2 = OCFS2_I(inode2);
> >+
> >+ trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
> >+ (unsigned long long)oi2->ip_blkno);
> >+
> >+ if (*bh1)
> >+ *bh1 = NULL;
> >+ if (*bh2)
> >+ *bh2 = NULL;
> >+
> >+ /* We always want to lock the one with the lower lockid first. */
> >+ if (oi1->ip_blkno > oi2->ip_blkno)
> >+ mlog_errno(-ENOLCK);
> >+
> >+ /* lock id1 */
> >+ status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_REFLINK_TARGET);
> >+ if (status < 0) {
> >+ if (status != -ENOENT)
> >+ mlog_errno(status);
> >+ goto out_rw2;
> >+ }
> >+
> >+ /* lock id2 */
> >+ if (!same_inode) {
> >+ status = ocfs2_inode_lock_nested(inode2, bh2, 1,
> >+ OI_LS_REFLINK_TARGET);
> >+ if (status < 0) {
> >+ if (status != -ENOENT)
> >+ mlog_errno(status);
> >+ goto out_cl1;
> >+ }
> >+ } else
> >+ *bh2 = *bh1;
> >+
> >+ trace_ocfs2_double_lock_end(
> >+ (unsigned long long)OCFS2_I(inode1)->ip_blkno,
> >+ (unsigned long long)OCFS2_I(inode2)->ip_blkno);
> >+
> >+ return 0;
> >+
> >+out_cl1:
> >+ ocfs2_inode_unlock(inode1, 1);
> >+ brelse(*bh1);
> >+ *bh1 = NULL;
> >+out_rw2:
> >+ ocfs2_rw_unlock(inode2, 1);
> >+out_i2:
> >+ inode_unlock(inode2);
> >+ ocfs2_rw_unlock(inode1, 1);
> >+out_i1:
> >+ inode_unlock(inode1);
> >+ return status;
> >+}
> >+
> >+/* Unlock both inodes and release buffers. */
> >+static void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
> >+ struct buffer_head *s_bh,
> >+ struct inode *t_inode,
> >+ struct buffer_head *t_bh)
> >+{
> >+ ocfs2_inode_unlock(s_inode, 1);
> >+ ocfs2_rw_unlock(s_inode, 1);
> >+ inode_unlock(s_inode);
> >+ brelse(s_bh);
> >+
> >+ if (s_inode == t_inode)
> >+ return;
> >+
> >+ ocfs2_inode_unlock(t_inode, 1);
> >+ ocfs2_rw_unlock(t_inode, 1);
> >+ inode_unlock(t_inode);
> >+ brelse(t_bh);
> >+}
> >+
> >+/*
> >+ * Read a page's worth of file data into the page cache. Return the page
> >+ * locked.
> >+ */
> >+static struct page *ocfs2_reflink_get_page(struct inode *inode,
> >+ loff_t offset)
> >+{
> >+ struct address_space *mapping;
> >+ struct page *page;
> >+ pgoff_t n;
> >+
> >+ n = offset >> PAGE_SHIFT;
> >+ mapping = inode->i_mapping;
> >+ page = read_mapping_page(mapping, n, NULL);
> >+ if (IS_ERR(page))
> >+ return page;
> >+ if (!PageUptodate(page)) {
> >+ put_page(page);
> >+ return ERR_PTR(-EIO);
> >+ }
> >+ lock_page(page);
> >+ return page;
> >+}
> >+
> >+/*
> >+ * Compare extents of two files to see if they are the same.
> >+ */
> >+static int ocfs2_reflink_compare_extents(struct inode *src,
> >+ loff_t srcoff,
> >+ struct inode *dest,
> >+ loff_t destoff,
> >+ loff_t len,
> >+ bool *is_same)
> >+{
> >+ loff_t src_poff;
> >+ loff_t dest_poff;
> >+ void *src_addr;
> >+ void *dest_addr;
> >+ struct page *src_page;
> >+ struct page *dest_page;
> >+ loff_t cmp_len;
> >+ bool same;
> >+ int error;
> >+
> >+ error = -EINVAL;
> >+ same = true;
> >+ while (len) {
> >+ src_poff = srcoff & (PAGE_SIZE - 1);
> >+ dest_poff = destoff & (PAGE_SIZE - 1);
> >+ cmp_len = min(PAGE_SIZE - src_poff,
> >+ PAGE_SIZE - dest_poff);
> >+ cmp_len = min(cmp_len, len);
> >+ if (cmp_len <= 0) {
> >+ mlog_errno(-EUCLEAN);
> >+ goto out_error;
> >+ }
> >+
> >+ src_page = ocfs2_reflink_get_page(src, srcoff);
> >+ if (IS_ERR(src_page)) {
> >+ error = PTR_ERR(src_page);
> >+ goto out_error;
> >+ }
> >+ dest_page = ocfs2_reflink_get_page(dest, destoff);
> >+ if (IS_ERR(dest_page)) {
> >+ error = PTR_ERR(dest_page);
> >+ unlock_page(src_page);
> >+ put_page(src_page);
> >+ goto out_error;
> >+ }
> >+ src_addr = kmap_atomic(src_page);
> >+ dest_addr = kmap_atomic(dest_page);
> >+
> >+ flush_dcache_page(src_page);
> >+ flush_dcache_page(dest_page);
> >+
> >+ if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
> >+ same = false;
> >+
> >+ kunmap_atomic(dest_addr);
> >+ kunmap_atomic(src_addr);
> >+ unlock_page(dest_page);
> >+ unlock_page(src_page);
> >+ put_page(dest_page);
> >+ put_page(src_page);
> >+
> >+ if (!same)
> >+ break;
> >+
> >+ srcoff += cmp_len;
> >+ destoff += cmp_len;
> >+ len -= cmp_len;
> >+ }
> >+
> >+ *is_same = same;
> >+ return 0;
> >+
> >+out_error:
> >+ return error;
> >+}
> >+
> >+/* Link a range of blocks from one file to another. */
> >+int ocfs2_reflink_remap_range(struct file *file_in,
> >+ loff_t pos_in,
> >+ struct file *file_out,
> >+ loff_t pos_out,
> >+ u64 len,
> >+ bool is_dedupe)
> >+{
> >+ struct inode *inode_in = file_inode(file_in);
> >+ struct inode *inode_out = file_inode(file_out);
> >+ struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
> >+ struct buffer_head *in_bh = NULL, *out_bh = NULL;
> >+ loff_t bs = 1 << OCFS2_SB(inode_in->i_sb)->s_clustersize_bits;
> >+ bool same_inode = (inode_in == inode_out);
> >+ bool is_same = false;
> >+ loff_t isize;
> >+ ssize_t ret;
> >+ loff_t blen;
> >+
> >+ if (!ocfs2_refcount_tree(osb))
> >+ return -EOPNOTSUPP;
> >+ if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
> >+ return -EROFS;
> >+
> >+ /* Lock both files against IO */
> >+ ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh);
> >+ if (ret)
> >+ return ret;
> >+
> >+ ret = -EINVAL;
> >+ if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
> >+ (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
> >+ goto out_unlock;
> >+
> >+ /* Don't touch certain kinds of inodes */
> >+ ret = -EPERM;
> >+ if (IS_IMMUTABLE(inode_out))
> >+ goto out_unlock;
> >+
> >+ ret = -ETXTBSY;
> >+ if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
> >+ goto out_unlock;
> >+
> >+ /* Don't reflink dirs, pipes, sockets... */
> >+ ret = -EISDIR;
> >+ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
> >+ goto out_unlock;
> >+ ret = -EINVAL;
> >+ if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode))
> >+ goto out_unlock;
> >+ if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
> >+ goto out_unlock;
> >+
> >+ /* Are we going all the way to the end? */
> >+ isize = i_size_read(inode_in);
> >+ if (isize == 0) {
> >+ ret = 0;
> >+ goto out_unlock;
> >+ }
> >+
> >+ if (len == 0)
> >+ len = isize - pos_in;
> >+
> >+ /* Ensure offsets don't wrap and the input is inside i_size */
> >+ if (pos_in + len < pos_in || pos_out + len < pos_out ||
> >+ pos_in + len > isize)
> >+ goto out_unlock;
> >+
> >+ /* Don't allow dedupe past EOF in the dest file */
> >+ if (is_dedupe) {
> >+ loff_t disize;
> >+
> >+ disize = i_size_read(inode_out);
> >+ if (pos_out >= disize || pos_out + len > disize)
> >+ goto out_unlock;
> >+ }
> >+
> >+ /* If we're linking to EOF, continue to the block boundary. */
> >+ if (pos_in + len == isize)
> >+ blen = ALIGN(isize, bs) - pos_in;
> >+ else
> >+ blen = len;
> >+
> >+ /* Only reflink if we're aligned to block boundaries */
> >+ if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
> >+ !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
> >+ goto out_unlock;
> >+
> >+ /* Don't allow overlapped reflink within the same file */
> >+ if (same_inode) {
> >+ if (pos_out + blen > pos_in && pos_out < pos_in + blen)
> >+ goto out_unlock;
> >+ }
> >+
> >+ /* Wait for the completion of any pending IOs on both files */
> >+ inode_dio_wait(inode_in);
> >+ if (!same_inode)
> >+ inode_dio_wait(inode_out);
> >+
> >+ ret = filemap_write_and_wait_range(inode_in->i_mapping,
> >+ pos_in, pos_in + len - 1);
> >+ if (ret)
> >+ goto out_unlock;
> >+
> >+ ret = filemap_write_and_wait_range(inode_out->i_mapping,
> >+ pos_out, pos_out + len - 1);
> >+ if (ret)
> >+ goto out_unlock;
> >+
> >+ /*
> >+ * Check that the extents are the same.
> >+ */
> >+ if (is_dedupe) {
> >+ ret = ocfs2_reflink_compare_extents(inode_in, pos_in,
> >+ inode_out, pos_out,
> >+ len, &is_same);
> >+ if (ret)
> >+ goto out_unlock;
> >+ if (!is_same) {
> >+ ret = -EBADE;
> >+ goto out_unlock;
> >+ }
> >+ }
> >+
> >+ /* Lock out changes to the allocation maps */
> >+ down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
> >+ if (!same_inode)
> >+ down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
> >+ SINGLE_DEPTH_NESTING);
> >+
> >+ /*
> >+ * Invalidate the page cache so that we can clear any CoW mappings
> >+ * in the destination file.
> >+ */
> >+ truncate_inode_pages_range(&inode_out->i_data, pos_out,
> >+ PAGE_ALIGN(pos_out + len) - 1);
> >+
> >+ ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out,
> >+ out_bh, pos_out, len);
> >+
> >+ up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
> >+ if (!same_inode)
> >+ up_write(&OCFS2_I(inode_out)->ip_alloc_sem);
> >+ if (ret) {
> >+ mlog_errno(ret);
> >+ goto out_unlock;
> >+ }
> >+
> >+ /*
> >+ * Empty the extent map so that we may get the right extent
> >+ * record from the disk.
> >+ */
> >+ ocfs2_extent_map_trunc(inode_in, 0);
> >+ ocfs2_extent_map_trunc(inode_out, 0);
> >+
> >+ ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
> >+ if (ret) {
> >+ mlog_errno(ret);
> >+ goto out_unlock;
> >+ }
> >+
> >+ ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
> >+ return 0;
> >+
> >+out_unlock:
> >+ ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
> >+ return ret;
> >+}
> >diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
> >index 553edfb..c023e88 100644
> >--- a/fs/ocfs2/refcounttree.h
> >+++ b/fs/ocfs2/refcounttree.h
> >@@ -117,4 +117,11 @@ int ocfs2_reflink_ioctl(struct inode *inode,
> > const char __user *oldname,
> > const char __user *newname,
> > bool preserve);
> >+int ocfs2_reflink_remap_range(struct file *file_in,
> >+ loff_t pos_in,
> >+ struct file *file_out,
> >+ loff_t pos_out,
> >+ u64 len,
> >+ bool is_dedupe);
> >+
> > #endif /* OCFS2_REFCOUNTTREE_H */
> >
> >
> >_______________________________________________
> >Ocfs2-devel mailing list
> >Ocfs2-devel at oss.oracle.com
> >https://oss.oracle.com/mailman/listinfo/ocfs2-devel
> >
>
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [Ocfs2-devel] [PATCH 6/6] ocfs2: implement the VFS clone_range, copy_range, and dedupe_range features
2016-11-11 6:20 ` Darrick J. Wong
@ 2016-11-11 6:45 ` Eric Ren
-1 siblings, 0 replies; 42+ messages in thread
From: Eric Ren @ 2016-11-11 6:45 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: mfasheh, jlbec, linux-fsdevel, ocfs2-devel
On 11/11/2016 02:20 PM, Darrick J. Wong wrote:
> On Fri, Nov 11, 2016 at 01:49:48PM +0800, Eric Ren wrote:
>> Hi,
>>
>> A few issues obvious to me:
>>
>> On 11/10/2016 06:51 AM, Darrick J. Wong wrote:
>>> Connect the new VFS clone_range, copy_range, and dedupe_range features
>>> to the existing reflink capability of ocfs2. Compared to the existing
>>> ocfs2 reflink ioctl We have to do things a little differently to support
>>> the VFS semantics (we can clone subranges of a file but we don't clone
>>> xattrs), but the VFS ioctls are more broadly supported.
>> How can I test the new ocfs2 reflink (with this patch) manually? What
>> commands should I use to do xxx_range things?
> See the 'reflink', 'dedupe', and 'copy_range' commands in xfs_io.
>
> The first two were added in xfsprogs 4.3, and copy_range in 4.7.
OK, thanks. I think you are missing the following two inline comments:
>>> + spin_lock(&OCFS2_I(dest)->ip_lock);
>>> + if (newlen > i_size_read(dest)) {
>>> + i_size_write(dest, newlen);
>>> + di->i_size = newlen;
>> di->i_size = cpu_to_le64(newlen);
>>
>>> + }
>>> + spin_unlock(&OCFS2_I(dest)->ip_lock);
>>> +
>> Add ocfs2_update_inode_fsync_trans() here? Looks this function was
>> introduced by you to improve efficiency.
>> Just want to awake your memory about this, though I don't know about the
>> details why it should be.
>>
>> Eric
Thanks,
Eric
^ permalink raw reply [flat|nested] 42+ messages in thread
* [Ocfs2-devel] [PATCH 6/6] ocfs2: implement the VFS clone_range, copy_range, and dedupe_range features
@ 2016-11-11 6:45 ` Eric Ren
0 siblings, 0 replies; 42+ messages in thread
From: Eric Ren @ 2016-11-11 6:45 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: mfasheh, jlbec, linux-fsdevel, ocfs2-devel
On 11/11/2016 02:20 PM, Darrick J. Wong wrote:
> On Fri, Nov 11, 2016 at 01:49:48PM +0800, Eric Ren wrote:
>> Hi,
>>
>> A few issues obvious to me:
>>
>> On 11/10/2016 06:51 AM, Darrick J. Wong wrote:
>>> Connect the new VFS clone_range, copy_range, and dedupe_range features
>>> to the existing reflink capability of ocfs2. Compared to the existing
>>> ocfs2 reflink ioctl We have to do things a little differently to support
>>> the VFS semantics (we can clone subranges of a file but we don't clone
>>> xattrs), but the VFS ioctls are more broadly supported.
>> How can I test the new ocfs2 reflink (with this patch) manually? What
>> commands should I use to do xxx_range things?
> See the 'reflink', 'dedupe', and 'copy_range' commands in xfs_io.
>
> The first two were added in xfsprogs 4.3, and copy_range in 4.7.
OK, thanks. I think you are missing the following two inline comments:
>>> + spin_lock(&OCFS2_I(dest)->ip_lock);
>>> + if (newlen > i_size_read(dest)) {
>>> + i_size_write(dest, newlen);
>>> + di->i_size = newlen;
>> di->i_size = cpu_to_le64(newlen);
>>
>>> + }
>>> + spin_unlock(&OCFS2_I(dest)->ip_lock);
>>> +
>> Add ocfs2_update_inode_fsync_trans() here? Looks this function was
>> introduced by you to improve efficiency.
>> Just want to awake your memory about this, though I don't know about the
>> details why it should be.
>>
>> Eric
Thanks,
Eric
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [Ocfs2-devel] [PATCH 6/6] ocfs2: implement the VFS clone_range, copy_range, and dedupe_range features
2016-11-11 6:45 ` Eric Ren
@ 2016-11-11 9:01 ` Darrick J. Wong
-1 siblings, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2016-11-11 9:01 UTC (permalink / raw)
To: Eric Ren; +Cc: mfasheh, jlbec, linux-fsdevel, ocfs2-devel
On Fri, Nov 11, 2016 at 02:45:54PM +0800, Eric Ren wrote:
> On 11/11/2016 02:20 PM, Darrick J. Wong wrote:
> >On Fri, Nov 11, 2016 at 01:49:48PM +0800, Eric Ren wrote:
> >>Hi,
> >>
> >>A few issues obvious to me:
> >>
> >>On 11/10/2016 06:51 AM, Darrick J. Wong wrote:
> >>>Connect the new VFS clone_range, copy_range, and dedupe_range features
> >>>to the existing reflink capability of ocfs2. Compared to the existing
> >>>ocfs2 reflink ioctl We have to do things a little differently to support
> >>>the VFS semantics (we can clone subranges of a file but we don't clone
> >>>xattrs), but the VFS ioctls are more broadly supported.
> >>How can I test the new ocfs2 reflink (with this patch) manually? What
> >>commands should I use to do xxx_range things?
> >See the 'reflink', 'dedupe', and 'copy_range' commands in xfs_io.
> >
> >The first two were added in xfsprogs 4.3, and copy_range in 4.7.
>
> OK, thanks. I think you are missing the following two inline comments:
>
> >>>+ spin_lock(&OCFS2_I(dest)->ip_lock);
> >>>+ if (newlen > i_size_read(dest)) {
> >>>+ i_size_write(dest, newlen);
> >>>+ di->i_size = newlen;
> >>di->i_size = cpu_to_le64(newlen);
Good catch!
> >>>+ }
> >>>+ spin_unlock(&OCFS2_I(dest)->ip_lock);
> >>>+
> >>Add ocfs2_update_inode_fsync_trans() here? Looks this function was
> >>introduced by you to improve efficiency.
> >>Just want to awake your memory about this, though I don't know about the
> >>details why it should be.
D'oh! Yes, I did miss that.
The function updates the destination inode's information. Specifically,
it updates i_size if we reflinked blocks into the file past EOF.
Looking at it some more, I also need to update i_blocks or the stat(2) info
will be wrong, and I also need to convert inline data to extents prior
to reflinking.
--D
> >>
> >>Eric
> Thanks,
> Eric
^ permalink raw reply [flat|nested] 42+ messages in thread
* [Ocfs2-devel] [PATCH 6/6] ocfs2: implement the VFS clone_range, copy_range, and dedupe_range features
@ 2016-11-11 9:01 ` Darrick J. Wong
0 siblings, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2016-11-11 9:01 UTC (permalink / raw)
To: Eric Ren; +Cc: mfasheh, jlbec, linux-fsdevel, ocfs2-devel
On Fri, Nov 11, 2016 at 02:45:54PM +0800, Eric Ren wrote:
> On 11/11/2016 02:20 PM, Darrick J. Wong wrote:
> >On Fri, Nov 11, 2016 at 01:49:48PM +0800, Eric Ren wrote:
> >>Hi,
> >>
> >>A few issues obvious to me:
> >>
> >>On 11/10/2016 06:51 AM, Darrick J. Wong wrote:
> >>>Connect the new VFS clone_range, copy_range, and dedupe_range features
> >>>to the existing reflink capability of ocfs2. Compared to the existing
> >>>ocfs2 reflink ioctl We have to do things a little differently to support
> >>>the VFS semantics (we can clone subranges of a file but we don't clone
> >>>xattrs), but the VFS ioctls are more broadly supported.
> >>How can I test the new ocfs2 reflink (with this patch) manually? What
> >>commands should I use to do xxx_range things?
> >See the 'reflink', 'dedupe', and 'copy_range' commands in xfs_io.
> >
> >The first two were added in xfsprogs 4.3, and copy_range in 4.7.
>
> OK, thanks. I think you are missing the following two inline comments:
>
> >>>+ spin_lock(&OCFS2_I(dest)->ip_lock);
> >>>+ if (newlen > i_size_read(dest)) {
> >>>+ i_size_write(dest, newlen);
> >>>+ di->i_size = newlen;
> >>di->i_size = cpu_to_le64(newlen);
Good catch!
> >>>+ }
> >>>+ spin_unlock(&OCFS2_I(dest)->ip_lock);
> >>>+
> >>Add ocfs2_update_inode_fsync_trans() here? Looks this function was
> >>introduced by you to improve efficiency.
> >>Just want to awake your memory about this, though I don't know about the
> >>details why it should be.
D'oh! Yes, I did miss that.
The function updates the destination inode's information. Specifically,
it updates i_size if we reflinked blocks into the file past EOF.
Looking at it some more, I also need to update i_blocks or the stat(2) info
will be wrong, and I also need to convert inline data to extents prior
to reflinking.
--D
> >>
> >>Eric
> Thanks,
> Eric
^ permalink raw reply [flat|nested] 42+ messages in thread
* [PATCH v2 6/6] ocfs2: implement the VFS clone_range, copy_range, and dedupe_range features
2016-11-09 22:51 ` [Ocfs2-devel] " Darrick J. Wong
@ 2016-11-11 14:54 ` Darrick J. Wong
-1 siblings, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2016-11-11 14:54 UTC (permalink / raw)
To: mfasheh, jlbec, zren; +Cc: linux-fsdevel, ocfs2-devel
Connect the new VFS clone_range, copy_range, and dedupe_range features
to the existing reflink capability of ocfs2. Compared to the existing
ocfs2 reflink ioctl, we have to do things a little differently to support
the VFS semantics (we can clone subranges of a file but we don't clone
xattrs), but the VFS ioctls are more broadly supported.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
v2: Convert inline data files to extents files before reflinking,
and fix i_blocks so that stat(2) output is correct. fsync the inode
correctly.
---
fs/ocfs2/file.c | 62 ++++-
fs/ocfs2/file.h | 3
fs/ocfs2/refcounttree.c | 627 +++++++++++++++++++++++++++++++++++++++++++++++
fs/ocfs2/refcounttree.h | 7 +
4 files changed, 696 insertions(+), 3 deletions(-)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index d261f3a..71aad0e 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1667,9 +1667,9 @@ static void ocfs2_calc_trunc_pos(struct inode *inode,
*done = ret;
}
-static int ocfs2_remove_inode_range(struct inode *inode,
- struct buffer_head *di_bh, u64 byte_start,
- u64 byte_len)
+int ocfs2_remove_inode_range(struct inode *inode,
+ struct buffer_head *di_bh, u64 byte_start,
+ u64 byte_len)
{
int ret = 0, flags = 0, done = 0, i;
u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
@@ -2439,6 +2439,56 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
return offset;
}
+static ssize_t ocfs2_file_copy_range(struct file *file_in,
+ loff_t pos_in,
+ struct file *file_out,
+ loff_t pos_out,
+ size_t len,
+ unsigned int flags)
+{
+ int error;
+
+ error = ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
+ len, false);
+ if (error)
+ return error;
+ return len;
+}
+
+static int ocfs2_file_clone_range(struct file *file_in,
+ loff_t pos_in,
+ struct file *file_out,
+ loff_t pos_out,
+ u64 len)
+{
+ return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
+ len, false);
+}
+
+#define OCFS2_MAX_DEDUPE_LEN (16 * 1024 * 1024)
+static ssize_t ocfs2_file_dedupe_range(struct file *src_file,
+ u64 loff,
+ u64 len,
+ struct file *dst_file,
+ u64 dst_loff)
+{
+ int error;
+
+ /*
+ * Limit the total length we will dedupe for each operation.
+ * This is intended to bound the total time spent in this
+ * ioctl to something sane.
+ */
+ if (len > OCFS2_MAX_DEDUPE_LEN)
+ len = OCFS2_MAX_DEDUPE_LEN;
+
+ error = ocfs2_reflink_remap_range(src_file, loff, dst_file, dst_loff,
+ len, true);
+ if (error)
+ return error;
+ return len;
+}
+
const struct inode_operations ocfs2_file_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
@@ -2478,6 +2528,9 @@ const struct file_operations ocfs2_fops = {
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = ocfs2_fallocate,
+ .copy_file_range = ocfs2_file_copy_range,
+ .clone_file_range = ocfs2_file_clone_range,
+ .dedupe_file_range = ocfs2_file_dedupe_range,
};
const struct file_operations ocfs2_dops = {
@@ -2523,6 +2576,9 @@ const struct file_operations ocfs2_fops_no_plocks = {
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = ocfs2_fallocate,
+ .copy_file_range = ocfs2_file_copy_range,
+ .clone_file_range = ocfs2_file_clone_range,
+ .dedupe_file_range = ocfs2_file_dedupe_range,
};
const struct file_operations ocfs2_dops_no_plocks = {
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index e8c62f2..897fd9a 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -82,4 +82,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
size_t count);
+int ocfs2_remove_inode_range(struct inode *inode,
+ struct buffer_head *di_bh, u64 byte_start,
+ u64 byte_len);
#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 6c98d56..be51540 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -34,6 +34,7 @@
#include "xattr.h"
#include "namei.h"
#include "ocfs2_trace.h"
+#include "file.h"
#include <linux/bio.h>
#include <linux/blkdev.h>
@@ -4441,3 +4442,629 @@ int ocfs2_reflink_ioctl(struct inode *inode,
return error;
}
+
+/* Update destination inode size, if necessary. */
+static int ocfs2_reflink_update_dest(struct inode *dest,
+ struct buffer_head *d_bh,
+ loff_t newlen)
+{
+ handle_t *handle;
+ int ret;
+
+ dest->i_blocks = ocfs2_inode_sector_count(dest);
+
+ if (newlen <= i_size_read(dest))
+ return 0;
+
+ handle = ocfs2_start_trans(OCFS2_SB(dest->i_sb),
+ OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ return ret;
+ }
+
+ /* Extend i_size if needed. */
+ spin_lock(&OCFS2_I(dest)->ip_lock);
+ if (newlen > i_size_read(dest))
+ i_size_write(dest, newlen);
+ spin_unlock(&OCFS2_I(dest)->ip_lock);
+ dest->i_ctime = dest->i_mtime = current_time(dest);
+
+ ret = ocfs2_mark_inode_dirty(handle, dest, d_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+out_commit:
+ ocfs2_commit_trans(OCFS2_SB(dest->i_sb), handle);
+ return ret;
+}
+
+/* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */
+static int ocfs2_reflink_remap_extent(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ loff_t pos_in,
+ struct inode *t_inode,
+ struct buffer_head *t_bh,
+ loff_t pos_out,
+ loff_t len,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ struct ocfs2_extent_tree s_et;
+ struct ocfs2_extent_tree t_et;
+ struct ocfs2_dinode *dis;
+ struct buffer_head *ref_root_bh = NULL;
+ struct ocfs2_refcount_tree *ref_tree;
+ struct ocfs2_super *osb;
+ loff_t pstart, plen;
+ u32 p_cluster, num_clusters, slast, spos, tpos;
+ unsigned int ext_flags;
+ int ret = 0;
+
+ osb = OCFS2_SB(s_inode->i_sb);
+ dis = (struct ocfs2_dinode *)s_bh->b_data;
+ ocfs2_init_dinode_extent_tree(&s_et, INODE_CACHE(s_inode), s_bh);
+ ocfs2_init_dinode_extent_tree(&t_et, INODE_CACHE(t_inode), t_bh);
+
+ spos = ocfs2_bytes_to_clusters(s_inode->i_sb, pos_in);
+ tpos = ocfs2_bytes_to_clusters(t_inode->i_sb, pos_out);
+ slast = ocfs2_clusters_for_bytes(s_inode->i_sb, pos_in + len);
+
+ while (spos < slast) {
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ goto out;
+ }
+
+ /* Look up the extent. */
+ ret = ocfs2_get_clusters(s_inode, spos, &p_cluster,
+ &num_clusters, &ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ num_clusters = min_t(u32, num_clusters, slast - spos);
+
+ /* Punch out the dest range. */
+ pstart = ocfs2_clusters_to_bytes(t_inode->i_sb, tpos);
+ plen = ocfs2_clusters_to_bytes(t_inode->i_sb, num_clusters);
+ ret = ocfs2_remove_inode_range(t_inode, t_bh, pstart, plen);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (p_cluster == 0)
+ goto next_loop;
+
+ /* Lock the refcount btree... */
+ ret = ocfs2_lock_refcount_tree(osb,
+ le64_to_cpu(dis->i_refcount_loc),
+ 1, &ref_tree, &ref_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* Mark s_inode's extent as refcounted. */
+ if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) {
+ ret = ocfs2_add_refcount_flag(s_inode, &s_et,
+ &ref_tree->rf_ci,
+ ref_root_bh, spos,
+ p_cluster, num_clusters,
+ dealloc, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock_refcount;
+ }
+ }
+
+ /* Map in the new extent. */
+ ext_flags |= OCFS2_EXT_REFCOUNTED;
+ ret = ocfs2_add_refcounted_extent(t_inode, &t_et,
+ &ref_tree->rf_ci,
+ ref_root_bh,
+ tpos, p_cluster,
+ num_clusters,
+ ext_flags,
+ dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock_refcount;
+ }
+
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+ brelse(ref_root_bh);
+next_loop:
+ spos += num_clusters;
+ tpos += num_clusters;
+ }
+
+out:
+ return ret;
+out_unlock_refcount:
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+ brelse(ref_root_bh);
+ return ret;
+}
+
+/* Set up refcount tree and remap s_inode to t_inode. */
+static int ocfs2_reflink_remap_blocks(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ loff_t pos_in,
+ struct inode *t_inode,
+ struct buffer_head *t_bh,
+ loff_t pos_out,
+ loff_t len)
+{
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+ struct ocfs2_super *osb;
+ struct ocfs2_dinode *dis;
+ struct ocfs2_dinode *dit;
+ int ret;
+
+ osb = OCFS2_SB(s_inode->i_sb);
+ dis = (struct ocfs2_dinode *)s_bh->b_data;
+ dit = (struct ocfs2_dinode *)t_bh->b_data;
+ ocfs2_init_dealloc_ctxt(&dealloc);
+
+ /*
+ * If we're reflinking the entire file and the source is inline
+ * data, just copy the contents.
+ */
+ if (pos_in == pos_out && pos_in == 0 && len == i_size_read(s_inode) &&
+ i_size_read(t_inode) <= len &&
+ (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) {
+ ret = ocfs2_duplicate_inline_data(s_inode, s_bh, t_inode, t_bh);
+ if (ret)
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * If both inodes belong to two different refcount groups then
+ * forget it because we don't know how (or want) to go merging
+ * refcount trees.
+ */
+ ret = -EOPNOTSUPP;
+ if (ocfs2_is_refcount_inode(s_inode) &&
+ ocfs2_is_refcount_inode(t_inode) &&
+ le64_to_cpu(dis->i_refcount_loc) !=
+ le64_to_cpu(dit->i_refcount_loc))
+ goto out;
+
+ /* Neither inode has a refcount tree. Add one to s_inode. */
+ if (!ocfs2_is_refcount_inode(s_inode) &&
+ !ocfs2_is_refcount_inode(t_inode)) {
+ ret = ocfs2_create_refcount_tree(s_inode, s_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ /* Ensure that both inodes end up with the same refcount tree. */
+ if (!ocfs2_is_refcount_inode(s_inode)) {
+ ret = ocfs2_set_refcount_tree(s_inode, s_bh,
+ le64_to_cpu(dit->i_refcount_loc));
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+ if (!ocfs2_is_refcount_inode(t_inode)) {
+ ret = ocfs2_set_refcount_tree(t_inode, t_bh,
+ le64_to_cpu(dis->i_refcount_loc));
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ /* Turn off inline data in the dest file. */
+ if (OCFS2_I(t_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ ret = ocfs2_convert_inline_data_to_extents(t_inode, t_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ /* Actually remap extents now. */
+ ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh,
+ pos_out, len, &dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+out:
+ if (ocfs2_dealloc_has_cluster(&dealloc)) {
+ ocfs2_schedule_truncate_log_flush(osb, 1);
+ ocfs2_run_deallocs(osb, &dealloc);
+ }
+
+ return ret;
+}
+
+/* Lock an inode and grab a bh pointing to the inode. */
+static int ocfs2_reflink_inodes_lock(struct inode *s_inode,
+ struct buffer_head **bh1,
+ struct inode *t_inode,
+ struct buffer_head **bh2)
+{
+ struct inode *inode1;
+ struct inode *inode2;
+ struct ocfs2_inode_info *oi1;
+ struct ocfs2_inode_info *oi2;
+ bool same_inode = (s_inode == t_inode);
+ int status;
+
+ /* First grab the VFS and rw locks. */
+ inode1 = s_inode;
+ inode2 = t_inode;
+ if (inode1->i_ino > inode2->i_ino)
+ swap(inode1, inode2);
+
+ inode_lock(inode1);
+ status = ocfs2_rw_lock(inode1, 1);
+ if (status) {
+ mlog_errno(status);
+ goto out_i1;
+ }
+ if (!same_inode) {
+ inode_lock_nested(inode2, I_MUTEX_CHILD);
+ status = ocfs2_rw_lock(inode2, 1);
+ if (status) {
+ mlog_errno(status);
+ goto out_i2;
+ }
+ }
+
+ /* Now go for the cluster locks */
+ oi1 = OCFS2_I(inode1);
+ oi2 = OCFS2_I(inode2);
+
+ trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
+ (unsigned long long)oi2->ip_blkno);
+
+ if (*bh1)
+ *bh1 = NULL;
+ if (*bh2)
+ *bh2 = NULL;
+
+ /* We always want to lock the one with the lower lockid first. */
+ if (oi1->ip_blkno > oi2->ip_blkno)
+ mlog_errno(-ENOLCK);
+
+ /* lock id1 */
+ status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_REFLINK_TARGET);
+ if (status < 0) {
+ if (status != -ENOENT)
+ mlog_errno(status);
+ goto out_rw2;
+ }
+
+ /* lock id2 */
+ if (!same_inode) {
+ status = ocfs2_inode_lock_nested(inode2, bh2, 1,
+ OI_LS_REFLINK_TARGET);
+ if (status < 0) {
+ if (status != -ENOENT)
+ mlog_errno(status);
+ goto out_cl1;
+ }
+ } else
+ *bh2 = *bh1;
+
+ trace_ocfs2_double_lock_end(
+ (unsigned long long)OCFS2_I(inode1)->ip_blkno,
+ (unsigned long long)OCFS2_I(inode2)->ip_blkno);
+
+ return 0;
+
+out_cl1:
+ ocfs2_inode_unlock(inode1, 1);
+ brelse(*bh1);
+ *bh1 = NULL;
+out_rw2:
+ ocfs2_rw_unlock(inode2, 1);
+out_i2:
+ inode_unlock(inode2);
+ ocfs2_rw_unlock(inode1, 1);
+out_i1:
+ inode_unlock(inode1);
+ return status;
+}
+
+/* Unlock both inodes and release buffers. */
+static void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ struct inode *t_inode,
+ struct buffer_head *t_bh)
+{
+ ocfs2_inode_unlock(s_inode, 1);
+ ocfs2_rw_unlock(s_inode, 1);
+ inode_unlock(s_inode);
+ brelse(s_bh);
+
+ if (s_inode == t_inode)
+ return;
+
+ ocfs2_inode_unlock(t_inode, 1);
+ ocfs2_rw_unlock(t_inode, 1);
+ inode_unlock(t_inode);
+ brelse(t_bh);
+}
+
+/*
+ * Read a page's worth of file data into the page cache. Return the page
+ * locked.
+ */
+static struct page *ocfs2_reflink_get_page(struct inode *inode,
+ loff_t offset)
+{
+ struct address_space *mapping;
+ struct page *page;
+ pgoff_t n;
+
+ n = offset >> PAGE_SHIFT;
+ mapping = inode->i_mapping;
+ page = read_mapping_page(mapping, n, NULL);
+ if (IS_ERR(page))
+ return page;
+ if (!PageUptodate(page)) {
+ put_page(page);
+ return ERR_PTR(-EIO);
+ }
+ lock_page(page);
+ return page;
+}
+
+/*
+ * Compare extents of two files to see if they are the same.
+ */
+static int ocfs2_reflink_compare_extents(struct inode *src,
+ loff_t srcoff,
+ struct inode *dest,
+ loff_t destoff,
+ loff_t len,
+ bool *is_same)
+{
+ loff_t src_poff;
+ loff_t dest_poff;
+ void *src_addr;
+ void *dest_addr;
+ struct page *src_page;
+ struct page *dest_page;
+ loff_t cmp_len;
+ bool same;
+ int error;
+
+ error = -EINVAL;
+ same = true;
+ while (len) {
+ src_poff = srcoff & (PAGE_SIZE - 1);
+ dest_poff = destoff & (PAGE_SIZE - 1);
+ cmp_len = min(PAGE_SIZE - src_poff,
+ PAGE_SIZE - dest_poff);
+ cmp_len = min(cmp_len, len);
+ if (cmp_len <= 0) {
+ mlog_errno(-EUCLEAN);
+ goto out_error;
+ }
+
+ src_page = ocfs2_reflink_get_page(src, srcoff);
+ if (IS_ERR(src_page)) {
+ error = PTR_ERR(src_page);
+ goto out_error;
+ }
+ dest_page = ocfs2_reflink_get_page(dest, destoff);
+ if (IS_ERR(dest_page)) {
+ error = PTR_ERR(dest_page);
+ unlock_page(src_page);
+ put_page(src_page);
+ goto out_error;
+ }
+ src_addr = kmap_atomic(src_page);
+ dest_addr = kmap_atomic(dest_page);
+
+ flush_dcache_page(src_page);
+ flush_dcache_page(dest_page);
+
+ if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
+ same = false;
+
+ kunmap_atomic(dest_addr);
+ kunmap_atomic(src_addr);
+ unlock_page(dest_page);
+ unlock_page(src_page);
+ put_page(dest_page);
+ put_page(src_page);
+
+ if (!same)
+ break;
+
+ srcoff += cmp_len;
+ destoff += cmp_len;
+ len -= cmp_len;
+ }
+
+ *is_same = same;
+ return 0;
+
+out_error:
+ return error;
+}
+
+/* Link a range of blocks from one file to another. */
+int ocfs2_reflink_remap_range(struct file *file_in,
+ loff_t pos_in,
+ struct file *file_out,
+ loff_t pos_out,
+ u64 len,
+ bool is_dedupe)
+{
+ struct inode *inode_in = file_inode(file_in);
+ struct inode *inode_out = file_inode(file_out);
+ struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
+ struct buffer_head *in_bh = NULL, *out_bh = NULL;
+ loff_t bs = 1 << OCFS2_SB(inode_in->i_sb)->s_clustersize_bits;
+ bool same_inode = (inode_in == inode_out);
+ bool is_same = false;
+ loff_t isize;
+ ssize_t ret;
+ loff_t blen;
+
+ if (!ocfs2_refcount_tree(osb))
+ return -EOPNOTSUPP;
+ if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+ return -EROFS;
+
+ /* Lock both files against IO */
+ ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh);
+ if (ret)
+ return ret;
+
+ ret = -EINVAL;
+ if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
+ (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
+ goto out_unlock;
+
+ /* Don't touch certain kinds of inodes */
+ ret = -EPERM;
+ if (IS_IMMUTABLE(inode_out))
+ goto out_unlock;
+
+ ret = -ETXTBSY;
+ if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
+ goto out_unlock;
+
+ /* Don't reflink dirs, pipes, sockets... */
+ ret = -EISDIR;
+ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
+ goto out_unlock;
+ ret = -EINVAL;
+ if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode))
+ goto out_unlock;
+ if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
+ goto out_unlock;
+
+ /* Are we going all the way to the end? */
+ isize = i_size_read(inode_in);
+ if (isize == 0) {
+ ret = 0;
+ goto out_unlock;
+ }
+
+ if (len == 0)
+ len = isize - pos_in;
+
+ /* Ensure offsets don't wrap and the input is inside i_size */
+ if (pos_in + len < pos_in || pos_out + len < pos_out ||
+ pos_in + len > isize)
+ goto out_unlock;
+
+ /* Don't allow dedupe past EOF in the dest file */
+ if (is_dedupe) {
+ loff_t disize;
+
+ disize = i_size_read(inode_out);
+ if (pos_out >= disize || pos_out + len > disize)
+ goto out_unlock;
+ }
+
+ /* If we're linking to EOF, continue to the block boundary. */
+ if (pos_in + len == isize)
+ blen = ALIGN(isize, bs) - pos_in;
+ else
+ blen = len;
+
+ /* Only reflink if we're aligned to block boundaries */
+ if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
+ !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
+ goto out_unlock;
+
+ /* Don't allow overlapped reflink within the same file */
+ if (same_inode) {
+ if (pos_out + blen > pos_in && pos_out < pos_in + blen)
+ goto out_unlock;
+ }
+
+ /* Wait for the completion of any pending IOs on both files */
+ inode_dio_wait(inode_in);
+ if (!same_inode)
+ inode_dio_wait(inode_out);
+
+ ret = filemap_write_and_wait_range(inode_in->i_mapping,
+ pos_in, pos_in + len - 1);
+ if (ret)
+ goto out_unlock;
+
+ ret = filemap_write_and_wait_range(inode_out->i_mapping,
+ pos_out, pos_out + len - 1);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * Check that the extents are the same.
+ */
+ if (is_dedupe) {
+ ret = ocfs2_reflink_compare_extents(inode_in, pos_in,
+ inode_out, pos_out,
+ len, &is_same);
+ if (ret)
+ goto out_unlock;
+ if (!is_same) {
+ ret = -EBADE;
+ goto out_unlock;
+ }
+ }
+
+ /* Lock out changes to the allocation maps */
+ down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
+ if (!same_inode)
+ down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
+ SINGLE_DEPTH_NESTING);
+
+ /*
+ * Invalidate the page cache so that we can clear any CoW mappings
+ * in the destination file.
+ */
+ truncate_inode_pages_range(&inode_out->i_data, pos_out,
+ PAGE_ALIGN(pos_out + len) - 1);
+
+ ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out,
+ out_bh, pos_out, len);
+
+ up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
+ if (!same_inode)
+ up_write(&OCFS2_I(inode_out)->ip_alloc_sem);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ /*
+ * Empty the extent map so that we may get the right extent
+ * record from the disk.
+ */
+ ocfs2_extent_map_trunc(inode_in, 0);
+ ocfs2_extent_map_trunc(inode_out, 0);
+
+ ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
+ return 0;
+
+out_unlock:
+ ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
+ return ret;
+}
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 6422bbc..4af55bf 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -115,4 +115,11 @@ int ocfs2_reflink_ioctl(struct inode *inode,
const char __user *oldname,
const char __user *newname,
bool preserve);
+int ocfs2_reflink_remap_range(struct file *file_in,
+ loff_t pos_in,
+ struct file *file_out,
+ loff_t pos_out,
+ u64 len,
+ bool is_dedupe);
+
#endif /* OCFS2_REFCOUNTTREE_H */
^ permalink raw reply related [flat|nested] 42+ messages in thread
* [Ocfs2-devel] [PATCH v2 6/6] ocfs2: implement the VFS clone_range, copy_range, and dedupe_range features
@ 2016-11-11 14:54 ` Darrick J. Wong
0 siblings, 0 replies; 42+ messages in thread
From: Darrick J. Wong @ 2016-11-11 14:54 UTC (permalink / raw)
To: mfasheh, jlbec, zren; +Cc: linux-fsdevel, ocfs2-devel
Connect the new VFS clone_range, copy_range, and dedupe_range features
to the existing reflink capability of ocfs2. Compared to the existing
ocfs2 reflink ioctl, we have to do things a little differently to support
the VFS semantics (we can clone subranges of a file but we don't clone
xattrs), but the VFS ioctls are more broadly supported.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
v2: Convert inline data files to extents files before reflinking,
and fix i_blocks so that stat(2) output is correct. fsync the inode
correctly.
---
fs/ocfs2/file.c | 62 ++++-
fs/ocfs2/file.h | 3
fs/ocfs2/refcounttree.c | 627 +++++++++++++++++++++++++++++++++++++++++++++++
fs/ocfs2/refcounttree.h | 7 +
4 files changed, 696 insertions(+), 3 deletions(-)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index d261f3a..71aad0e 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1667,9 +1667,9 @@ static void ocfs2_calc_trunc_pos(struct inode *inode,
*done = ret;
}
-static int ocfs2_remove_inode_range(struct inode *inode,
- struct buffer_head *di_bh, u64 byte_start,
- u64 byte_len)
+int ocfs2_remove_inode_range(struct inode *inode,
+ struct buffer_head *di_bh, u64 byte_start,
+ u64 byte_len)
{
int ret = 0, flags = 0, done = 0, i;
u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
@@ -2439,6 +2439,56 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
return offset;
}
+static ssize_t ocfs2_file_copy_range(struct file *file_in,
+ loff_t pos_in,
+ struct file *file_out,
+ loff_t pos_out,
+ size_t len,
+ unsigned int flags)
+{
+ int error;
+
+ error = ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
+ len, false);
+ if (error)
+ return error;
+ return len;
+}
+
+static int ocfs2_file_clone_range(struct file *file_in,
+ loff_t pos_in,
+ struct file *file_out,
+ loff_t pos_out,
+ u64 len)
+{
+ return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
+ len, false);
+}
+
+#define OCFS2_MAX_DEDUPE_LEN (16 * 1024 * 1024)
+static ssize_t ocfs2_file_dedupe_range(struct file *src_file,
+ u64 loff,
+ u64 len,
+ struct file *dst_file,
+ u64 dst_loff)
+{
+ int error;
+
+ /*
+ * Limit the total length we will dedupe for each operation.
+ * This is intended to bound the total time spent in this
+ * ioctl to something sane.
+ */
+ if (len > OCFS2_MAX_DEDUPE_LEN)
+ len = OCFS2_MAX_DEDUPE_LEN;
+
+ error = ocfs2_reflink_remap_range(src_file, loff, dst_file, dst_loff,
+ len, true);
+ if (error)
+ return error;
+ return len;
+}
+
const struct inode_operations ocfs2_file_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
@@ -2478,6 +2528,9 @@ const struct file_operations ocfs2_fops = {
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = ocfs2_fallocate,
+ .copy_file_range = ocfs2_file_copy_range,
+ .clone_file_range = ocfs2_file_clone_range,
+ .dedupe_file_range = ocfs2_file_dedupe_range,
};
const struct file_operations ocfs2_dops = {
@@ -2523,6 +2576,9 @@ const struct file_operations ocfs2_fops_no_plocks = {
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = ocfs2_fallocate,
+ .copy_file_range = ocfs2_file_copy_range,
+ .clone_file_range = ocfs2_file_clone_range,
+ .dedupe_file_range = ocfs2_file_dedupe_range,
};
const struct file_operations ocfs2_dops_no_plocks = {
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index e8c62f2..897fd9a 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -82,4 +82,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
size_t count);
+int ocfs2_remove_inode_range(struct inode *inode,
+ struct buffer_head *di_bh, u64 byte_start,
+ u64 byte_len);
#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 6c98d56..be51540 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -34,6 +34,7 @@
#include "xattr.h"
#include "namei.h"
#include "ocfs2_trace.h"
+#include "file.h"
#include <linux/bio.h>
#include <linux/blkdev.h>
@@ -4441,3 +4442,629 @@ int ocfs2_reflink_ioctl(struct inode *inode,
return error;
}
+
+/* Update destination inode size, if necessary. */
+static int ocfs2_reflink_update_dest(struct inode *dest,
+ struct buffer_head *d_bh,
+ loff_t newlen)
+{
+ handle_t *handle;
+ int ret;
+
+ dest->i_blocks = ocfs2_inode_sector_count(dest);
+
+ if (newlen <= i_size_read(dest))
+ return 0;
+
+ handle = ocfs2_start_trans(OCFS2_SB(dest->i_sb),
+ OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ return ret;
+ }
+
+ /* Extend i_size if needed. */
+ spin_lock(&OCFS2_I(dest)->ip_lock);
+ if (newlen > i_size_read(dest))
+ i_size_write(dest, newlen);
+ spin_unlock(&OCFS2_I(dest)->ip_lock);
+ dest->i_ctime = dest->i_mtime = current_time(dest);
+
+ ret = ocfs2_mark_inode_dirty(handle, dest, d_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+out_commit:
+ ocfs2_commit_trans(OCFS2_SB(dest->i_sb), handle);
+ return ret;
+}
+
+/* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */
+static int ocfs2_reflink_remap_extent(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ loff_t pos_in,
+ struct inode *t_inode,
+ struct buffer_head *t_bh,
+ loff_t pos_out,
+ loff_t len,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ struct ocfs2_extent_tree s_et;
+ struct ocfs2_extent_tree t_et;
+ struct ocfs2_dinode *dis;
+ struct buffer_head *ref_root_bh = NULL;
+ struct ocfs2_refcount_tree *ref_tree;
+ struct ocfs2_super *osb;
+ loff_t pstart, plen;
+ u32 p_cluster, num_clusters, slast, spos, tpos;
+ unsigned int ext_flags;
+ int ret = 0;
+
+ osb = OCFS2_SB(s_inode->i_sb);
+ dis = (struct ocfs2_dinode *)s_bh->b_data;
+ ocfs2_init_dinode_extent_tree(&s_et, INODE_CACHE(s_inode), s_bh);
+ ocfs2_init_dinode_extent_tree(&t_et, INODE_CACHE(t_inode), t_bh);
+
+ spos = ocfs2_bytes_to_clusters(s_inode->i_sb, pos_in);
+ tpos = ocfs2_bytes_to_clusters(t_inode->i_sb, pos_out);
+ slast = ocfs2_clusters_for_bytes(s_inode->i_sb, pos_in + len);
+
+ while (spos < slast) {
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ goto out;
+ }
+
+ /* Look up the extent. */
+ ret = ocfs2_get_clusters(s_inode, spos, &p_cluster,
+ &num_clusters, &ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ num_clusters = min_t(u32, num_clusters, slast - spos);
+
+ /* Punch out the dest range. */
+ pstart = ocfs2_clusters_to_bytes(t_inode->i_sb, tpos);
+ plen = ocfs2_clusters_to_bytes(t_inode->i_sb, num_clusters);
+ ret = ocfs2_remove_inode_range(t_inode, t_bh, pstart, plen);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (p_cluster == 0)
+ goto next_loop;
+
+ /* Lock the refcount btree... */
+ ret = ocfs2_lock_refcount_tree(osb,
+ le64_to_cpu(dis->i_refcount_loc),
+ 1, &ref_tree, &ref_root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* Mark s_inode's extent as refcounted. */
+ if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) {
+ ret = ocfs2_add_refcount_flag(s_inode, &s_et,
+ &ref_tree->rf_ci,
+ ref_root_bh, spos,
+ p_cluster, num_clusters,
+ dealloc, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock_refcount;
+ }
+ }
+
+ /* Map in the new extent. */
+ ext_flags |= OCFS2_EXT_REFCOUNTED;
+ ret = ocfs2_add_refcounted_extent(t_inode, &t_et,
+ &ref_tree->rf_ci,
+ ref_root_bh,
+ tpos, p_cluster,
+ num_clusters,
+ ext_flags,
+ dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock_refcount;
+ }
+
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+ brelse(ref_root_bh);
+next_loop:
+ spos += num_clusters;
+ tpos += num_clusters;
+ }
+
+out:
+ return ret;
+out_unlock_refcount:
+ ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+ brelse(ref_root_bh);
+ return ret;
+}
+
+/* Set up refcount tree and remap s_inode to t_inode. */
+static int ocfs2_reflink_remap_blocks(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ loff_t pos_in,
+ struct inode *t_inode,
+ struct buffer_head *t_bh,
+ loff_t pos_out,
+ loff_t len)
+{
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+ struct ocfs2_super *osb;
+ struct ocfs2_dinode *dis;
+ struct ocfs2_dinode *dit;
+ int ret;
+
+ osb = OCFS2_SB(s_inode->i_sb);
+ dis = (struct ocfs2_dinode *)s_bh->b_data;
+ dit = (struct ocfs2_dinode *)t_bh->b_data;
+ ocfs2_init_dealloc_ctxt(&dealloc);
+
+ /*
+ * If we're reflinking the entire file and the source is inline
+ * data, just copy the contents.
+ */
+ if (pos_in == pos_out && pos_in == 0 && len == i_size_read(s_inode) &&
+ i_size_read(t_inode) <= len &&
+ (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) {
+ ret = ocfs2_duplicate_inline_data(s_inode, s_bh, t_inode, t_bh);
+ if (ret)
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * If both inodes belong to two different refcount groups then
+ * forget it because we don't know how (or want) to go merging
+ * refcount trees.
+ */
+ ret = -EOPNOTSUPP;
+ if (ocfs2_is_refcount_inode(s_inode) &&
+ ocfs2_is_refcount_inode(t_inode) &&
+ le64_to_cpu(dis->i_refcount_loc) !=
+ le64_to_cpu(dit->i_refcount_loc))
+ goto out;
+
+ /* Neither inode has a refcount tree. Add one to s_inode. */
+ if (!ocfs2_is_refcount_inode(s_inode) &&
+ !ocfs2_is_refcount_inode(t_inode)) {
+ ret = ocfs2_create_refcount_tree(s_inode, s_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ /* Ensure that both inodes end up with the same refcount tree. */
+ if (!ocfs2_is_refcount_inode(s_inode)) {
+ ret = ocfs2_set_refcount_tree(s_inode, s_bh,
+ le64_to_cpu(dit->i_refcount_loc));
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+ if (!ocfs2_is_refcount_inode(t_inode)) {
+ ret = ocfs2_set_refcount_tree(t_inode, t_bh,
+ le64_to_cpu(dis->i_refcount_loc));
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ /* Turn off inline data in the dest file. */
+ if (OCFS2_I(t_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ ret = ocfs2_convert_inline_data_to_extents(t_inode, t_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ /* Actually remap extents now. */
+ ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh,
+ pos_out, len, &dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+out:
+ if (ocfs2_dealloc_has_cluster(&dealloc)) {
+ ocfs2_schedule_truncate_log_flush(osb, 1);
+ ocfs2_run_deallocs(osb, &dealloc);
+ }
+
+ return ret;
+}
+
+/* Lock an inode and grab a bh pointing to the inode. */
+static int ocfs2_reflink_inodes_lock(struct inode *s_inode,
+ struct buffer_head **bh1,
+ struct inode *t_inode,
+ struct buffer_head **bh2)
+{
+ struct inode *inode1;
+ struct inode *inode2;
+ struct ocfs2_inode_info *oi1;
+ struct ocfs2_inode_info *oi2;
+ bool same_inode = (s_inode == t_inode);
+ int status;
+
+ /* First grab the VFS and rw locks. */
+ inode1 = s_inode;
+ inode2 = t_inode;
+ if (inode1->i_ino > inode2->i_ino)
+ swap(inode1, inode2);
+
+ inode_lock(inode1);
+ status = ocfs2_rw_lock(inode1, 1);
+ if (status) {
+ mlog_errno(status);
+ goto out_i1;
+ }
+ if (!same_inode) {
+ inode_lock_nested(inode2, I_MUTEX_CHILD);
+ status = ocfs2_rw_lock(inode2, 1);
+ if (status) {
+ mlog_errno(status);
+ goto out_i2;
+ }
+ }
+
+ /* Now go for the cluster locks */
+ oi1 = OCFS2_I(inode1);
+ oi2 = OCFS2_I(inode2);
+
+ trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
+ (unsigned long long)oi2->ip_blkno);
+
+ if (*bh1)
+ *bh1 = NULL;
+ if (*bh2)
+ *bh2 = NULL;
+
+ /* We always want to lock the one with the lower lockid first. */
+ if (oi1->ip_blkno > oi2->ip_blkno)
+ mlog_errno(-ENOLCK);
+
+ /* lock id1 */
+ status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_REFLINK_TARGET);
+ if (status < 0) {
+ if (status != -ENOENT)
+ mlog_errno(status);
+ goto out_rw2;
+ }
+
+ /* lock id2 */
+ if (!same_inode) {
+ status = ocfs2_inode_lock_nested(inode2, bh2, 1,
+ OI_LS_REFLINK_TARGET);
+ if (status < 0) {
+ if (status != -ENOENT)
+ mlog_errno(status);
+ goto out_cl1;
+ }
+ } else
+ *bh2 = *bh1;
+
+ trace_ocfs2_double_lock_end(
+ (unsigned long long)OCFS2_I(inode1)->ip_blkno,
+ (unsigned long long)OCFS2_I(inode2)->ip_blkno);
+
+ return 0;
+
+out_cl1:
+ ocfs2_inode_unlock(inode1, 1);
+ brelse(*bh1);
+ *bh1 = NULL;
+out_rw2:
+ ocfs2_rw_unlock(inode2, 1);
+out_i2:
+ inode_unlock(inode2);
+ ocfs2_rw_unlock(inode1, 1);
+out_i1:
+ inode_unlock(inode1);
+ return status;
+}
+
+/* Unlock both inodes and release buffers. */
+static void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
+ struct buffer_head *s_bh,
+ struct inode *t_inode,
+ struct buffer_head *t_bh)
+{
+ ocfs2_inode_unlock(s_inode, 1);
+ ocfs2_rw_unlock(s_inode, 1);
+ inode_unlock(s_inode);
+ brelse(s_bh);
+
+ if (s_inode == t_inode)
+ return;
+
+ ocfs2_inode_unlock(t_inode, 1);
+ ocfs2_rw_unlock(t_inode, 1);
+ inode_unlock(t_inode);
+ brelse(t_bh);
+}
+
+/*
+ * Read a page's worth of file data into the page cache. Return the page
+ * locked.
+ */
+static struct page *ocfs2_reflink_get_page(struct inode *inode,
+ loff_t offset)
+{
+ struct address_space *mapping;
+ struct page *page;
+ pgoff_t n;
+
+ n = offset >> PAGE_SHIFT;
+ mapping = inode->i_mapping;
+ page = read_mapping_page(mapping, n, NULL);
+ if (IS_ERR(page))
+ return page;
+ if (!PageUptodate(page)) {
+ put_page(page);
+ return ERR_PTR(-EIO);
+ }
+ lock_page(page);
+ return page;
+}
+
+/*
+ * Compare extents of two files to see if they are the same.
+ */
+static int ocfs2_reflink_compare_extents(struct inode *src,
+ loff_t srcoff,
+ struct inode *dest,
+ loff_t destoff,
+ loff_t len,
+ bool *is_same)
+{
+ loff_t src_poff;
+ loff_t dest_poff;
+ void *src_addr;
+ void *dest_addr;
+ struct page *src_page;
+ struct page *dest_page;
+ loff_t cmp_len;
+ bool same;
+ int error;
+
+ error = -EINVAL;
+ same = true;
+ while (len) {
+ src_poff = srcoff & (PAGE_SIZE - 1);
+ dest_poff = destoff & (PAGE_SIZE - 1);
+ cmp_len = min(PAGE_SIZE - src_poff,
+ PAGE_SIZE - dest_poff);
+ cmp_len = min(cmp_len, len);
+ if (cmp_len <= 0) {
+ mlog_errno(-EUCLEAN);
+ goto out_error;
+ }
+
+ src_page = ocfs2_reflink_get_page(src, srcoff);
+ if (IS_ERR(src_page)) {
+ error = PTR_ERR(src_page);
+ goto out_error;
+ }
+ dest_page = ocfs2_reflink_get_page(dest, destoff);
+ if (IS_ERR(dest_page)) {
+ error = PTR_ERR(dest_page);
+ unlock_page(src_page);
+ put_page(src_page);
+ goto out_error;
+ }
+ src_addr = kmap_atomic(src_page);
+ dest_addr = kmap_atomic(dest_page);
+
+ flush_dcache_page(src_page);
+ flush_dcache_page(dest_page);
+
+ if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
+ same = false;
+
+ kunmap_atomic(dest_addr);
+ kunmap_atomic(src_addr);
+ unlock_page(dest_page);
+ unlock_page(src_page);
+ put_page(dest_page);
+ put_page(src_page);
+
+ if (!same)
+ break;
+
+ srcoff += cmp_len;
+ destoff += cmp_len;
+ len -= cmp_len;
+ }
+
+ *is_same = same;
+ return 0;
+
+out_error:
+ return error;
+}
+
+/* Link a range of blocks from one file to another. */
+int ocfs2_reflink_remap_range(struct file *file_in,
+ loff_t pos_in,
+ struct file *file_out,
+ loff_t pos_out,
+ u64 len,
+ bool is_dedupe)
+{
+ struct inode *inode_in = file_inode(file_in);
+ struct inode *inode_out = file_inode(file_out);
+ struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
+ struct buffer_head *in_bh = NULL, *out_bh = NULL;
+ loff_t bs = 1 << OCFS2_SB(inode_in->i_sb)->s_clustersize_bits;
+ bool same_inode = (inode_in == inode_out);
+ bool is_same = false;
+ loff_t isize;
+ ssize_t ret;
+ loff_t blen;
+
+ if (!ocfs2_refcount_tree(osb))
+ return -EOPNOTSUPP;
+ if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+ return -EROFS;
+
+ /* Lock both files against IO */
+ ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh);
+ if (ret)
+ return ret;
+
+ ret = -EINVAL;
+ if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
+ (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
+ goto out_unlock;
+
+ /* Don't touch certain kinds of inodes */
+ ret = -EPERM;
+ if (IS_IMMUTABLE(inode_out))
+ goto out_unlock;
+
+ ret = -ETXTBSY;
+ if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
+ goto out_unlock;
+
+ /* Don't reflink dirs, pipes, sockets... */
+ ret = -EISDIR;
+ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
+ goto out_unlock;
+ ret = -EINVAL;
+ if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode))
+ goto out_unlock;
+ if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
+ goto out_unlock;
+
+ /* Are we going all the way to the end? */
+ isize = i_size_read(inode_in);
+ if (isize == 0) {
+ ret = 0;
+ goto out_unlock;
+ }
+
+ if (len == 0)
+ len = isize - pos_in;
+
+ /* Ensure offsets don't wrap and the input is inside i_size */
+ if (pos_in + len < pos_in || pos_out + len < pos_out ||
+ pos_in + len > isize)
+ goto out_unlock;
+
+ /* Don't allow dedupe past EOF in the dest file */
+ if (is_dedupe) {
+ loff_t disize;
+
+ disize = i_size_read(inode_out);
+ if (pos_out >= disize || pos_out + len > disize)
+ goto out_unlock;
+ }
+
+ /* If we're linking to EOF, continue to the block boundary. */
+ if (pos_in + len == isize)
+ blen = ALIGN(isize, bs) - pos_in;
+ else
+ blen = len;
+
+ /* Only reflink if we're aligned to block boundaries */
+ if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
+ !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
+ goto out_unlock;
+
+ /* Don't allow overlapped reflink within the same file */
+ if (same_inode) {
+ if (pos_out + blen > pos_in && pos_out < pos_in + blen)
+ goto out_unlock;
+ }
+
+ /* Wait for the completion of any pending IOs on both files */
+ inode_dio_wait(inode_in);
+ if (!same_inode)
+ inode_dio_wait(inode_out);
+
+ ret = filemap_write_and_wait_range(inode_in->i_mapping,
+ pos_in, pos_in + len - 1);
+ if (ret)
+ goto out_unlock;
+
+ ret = filemap_write_and_wait_range(inode_out->i_mapping,
+ pos_out, pos_out + len - 1);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * Check that the extents are the same.
+ */
+ if (is_dedupe) {
+ ret = ocfs2_reflink_compare_extents(inode_in, pos_in,
+ inode_out, pos_out,
+ len, &is_same);
+ if (ret)
+ goto out_unlock;
+ if (!is_same) {
+ ret = -EBADE;
+ goto out_unlock;
+ }
+ }
+
+ /* Lock out changes to the allocation maps */
+ down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
+ if (!same_inode)
+ down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
+ SINGLE_DEPTH_NESTING);
+
+ /*
+ * Invalidate the page cache so that we can clear any CoW mappings
+ * in the destination file.
+ */
+ truncate_inode_pages_range(&inode_out->i_data, pos_out,
+ PAGE_ALIGN(pos_out + len) - 1);
+
+ ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out,
+ out_bh, pos_out, len);
+
+ up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
+ if (!same_inode)
+ up_write(&OCFS2_I(inode_out)->ip_alloc_sem);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ /*
+ * Empty the extent map so that we may get the right extent
+ * record from the disk.
+ */
+ ocfs2_extent_map_trunc(inode_in, 0);
+ ocfs2_extent_map_trunc(inode_out, 0);
+
+ ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
+ return 0;
+
+out_unlock:
+ ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
+ return ret;
+}
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 6422bbc..4af55bf 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -115,4 +115,11 @@ int ocfs2_reflink_ioctl(struct inode *inode,
const char __user *oldname,
const char __user *newname,
bool preserve);
+int ocfs2_reflink_remap_range(struct file *file_in,
+ loff_t pos_in,
+ struct file *file_out,
+ loff_t pos_out,
+ u64 len,
+ bool is_dedupe);
+
#endif /* OCFS2_REFCOUNTTREE_H */
^ permalink raw reply related [flat|nested] 42+ messages in thread