All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] Btrfs: fix fsync data loss after a ranged fsync
@ 2014-08-30 17:00 Filipe Manana
  2014-09-02 12:50 ` [PATCH v2] " Filipe Manana
                   ` (3 more replies)
  0 siblings, 4 replies; 5+ messages in thread
From: Filipe Manana @ 2014-08-30 17:00 UTC (permalink / raw)
  To: linux-btrfs; +Cc: Filipe Manana

While we're doing a full fsync (when the inode has the flag
BTRFS_INODE_NEEDS_FULL_SYNC set) that is ranged too (covers only a
portion of the file), we might have ordered operations that are started
while we're logging the inode and that fall outside the fsync range.

This means we can get extent maps outside our range added to the
inode's extent map tree's modified list for which the corresponding
ordered operation wasn't captured by our call to btrfs_get_logged_extents() -
the fill delalloc callbacks, inode.c:cow_file_range() and
inode.c:submit_compressed_extents() add an extent map to the modified
list before creating the respective ordered operation - and they do this
without holding the inode's mutex nor the inode's log mutex.

Therefore when a full ranged fsync finishes don't remove every extent
map from the modified list of extent maps - as for some of them, that fall
outside our fsync range, we might have not waited for their respective
ordered operation to finish (meaning the corresponding file extent item
wasn't inserted into the fs/subvol tree yet), and we must let the next
fsync (very likely a fast one that checks only the modified list) see this
extent map and log a matching file extent item to the log btree and wait for
its ordered operation to finish (if it's still ongoing).

Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/file.c     |  2 +-
 fs/btrfs/tree-log.c | 50 ++++++++++++++++++++++++++++++++++++++++----------
 fs/btrfs/tree-log.h |  2 ++
 3 files changed, 43 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 66c4076..e5534c1 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1979,7 +1979,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 
 	btrfs_init_log_ctx(&ctx);
 
-	ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
+	ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
 	if (ret < 0) {
 		/* Fallthrough and commit/free transaction. */
 		ret = 1;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 5a917a6..8b18a2d 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -94,8 +94,10 @@
 #define LOG_WALK_REPLAY_ALL 3
 
 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root, struct inode *inode,
-			     int inode_only);
+			   struct btrfs_root *root, struct inode *inode,
+			   int inode_only,
+			   const loff_t start,
+			   const loff_t end);
 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid);
@@ -3856,8 +3858,10 @@ process:
  * This handles both files and directories.
  */
 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root, struct inode *inode,
-			     int inode_only)
+			   struct btrfs_root *root, struct inode *inode,
+			   int inode_only,
+			   const loff_t start,
+			   const loff_t end)
 {
 	struct btrfs_path *path;
 	struct btrfs_path *dst_path;
@@ -4050,8 +4054,27 @@ log_extents:
 		struct extent_map *em, *n;
 
 		write_lock(&tree->lock);
-		list_for_each_entry_safe(em, n, &tree->modified_extents, list)
+		/*
+		 * We can't just remove every em if we're called for a ranged
+		 * fsync - that is, one that doesn't cover the whole possible
+		 * file range (0 to LLONG_MAX). This is because we can have
+		 * em's that fall outside the range and therefore their ordered
+		 * operations haven't completed yet (btrfs_finish_ordered_io()
+		 * not invoked yet). Their ordered operations might have started
+		 * after we called btrfs_get_logged_extents() too, so we don't
+		 * end up waiting for them to complete when syncing the log.
+		 * Removing every em outside the range would make a subsequent
+		 * fsync that does a "fast search" (BTRFS_INODE_NEEDS_FULL_SYNC
+		 * flag not set) not log the extent represented by an em,
+		 * therefore making us lose data after a log replay.
+		 */
+		list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
+			if (em->mod_start > end)
+				continue;
+			if (em->mod_start + em->mod_len <= start)
+				continue;
 			list_del_init(&em->list);
+		}
 		write_unlock(&tree->lock);
 	}
 
@@ -4158,7 +4181,10 @@ out:
  */
 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 			    	  struct btrfs_root *root, struct inode *inode,
-			    	  struct dentry *parent, int exists_only,
+				  struct dentry *parent,
+				  const loff_t start,
+				  const loff_t end,
+				  int exists_only,
 				  struct btrfs_log_ctx *ctx)
 {
 	int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
@@ -4204,7 +4230,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 	if (ret)
 		goto end_no_trans;
 
-	ret = btrfs_log_inode(trans, root, inode, inode_only);
+	ret = btrfs_log_inode(trans, root, inode, inode_only, start, end);
 	if (ret)
 		goto end_trans;
 
@@ -4232,7 +4258,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 
 		if (BTRFS_I(inode)->generation >
 		    root->fs_info->last_trans_committed) {
-			ret = btrfs_log_inode(trans, root, inode, inode_only);
+			ret = btrfs_log_inode(trans, root, inode, inode_only,
+					      0, LLONG_MAX);
 			if (ret)
 				goto end_trans;
 		}
@@ -4266,13 +4293,15 @@ end_no_trans:
  */
 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, struct dentry *dentry,
+			  const loff_t start,
+			  const loff_t end,
 			  struct btrfs_log_ctx *ctx)
 {
 	struct dentry *parent = dget_parent(dentry);
 	int ret;
 
 	ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent,
-				     0, ctx);
+				     start, end, 0, ctx);
 	dput(parent);
 
 	return ret;
@@ -4509,6 +4538,7 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
 		    root->fs_info->last_trans_committed))
 		return 0;
 
-	return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL);
+	return btrfs_log_inode_parent(trans, root, inode, parent, 0,
+				      LLONG_MAX, 1, NULL);
 }
 
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 7f5b41b..e2e798a 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -59,6 +59,8 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
 int btrfs_recover_log_trees(struct btrfs_root *tree_root);
 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, struct dentry *dentry,
+			  const loff_t start,
+			  const loff_t end,
 			  struct btrfs_log_ctx *ctx);
 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
-- 
1.9.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2014-09-08 20:57 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-08-30 17:00 [PATCH] Btrfs: fix fsync data loss after a ranged fsync Filipe Manana
2014-09-02 12:50 ` [PATCH v2] " Filipe Manana
2014-09-05 18:14 ` [PATCH v3] " Filipe Manana
2014-09-06 21:34 ` [PATCH v4] " Filipe Manana
2014-09-08 21:56 ` [PATCH v5] " Filipe Manana

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.