All of lore.kernel.org
 help / color / mirror / Atom feed
From: Maxim Patlasov <MPatlasov@parallels.com>
To: miklos@szeredi.hu
Cc: riel@redhat.com, dev@parallels.com, xemul@parallels.com,
	fuse-devel@lists.sourceforge.net, bfoster@redhat.com,
	linux-kernel@vger.kernel.org, jbottomley@parallels.com,
	linux-mm@kvack.org, viro@zeniv.linux.org.uk,
	linux-fsdevel@vger.kernel.org, akpm@linux-foundation.org,
	fengguang.wu@intel.com, devel@openvz.org, mgorman@suse.de
Subject: [PATCH 14/16] fuse: Fix O_DIRECT operations vs cached writeback misorder - v2
Date: Sat, 29 Jun 2013 21:46:49 +0400	[thread overview]
Message-ID: <20130629174633.20175.16915.stgit@maximpc.sw.ru> (raw)
In-Reply-To: <20130629172211.20175.70154.stgit@maximpc.sw.ru>

From: Pavel Emelyanov <xemul@openvz.org>

The problem is:

1. write cached data to a file
2. read directly from the same file (via another fd)

The 2nd operation may read stale data, i.e. the one that was in a file
before the 1st op. Problem is in how fuse manages writeback.

When direct op occurs the core kernel code calls filemap_write_and_wait
to flush all the cached ops in flight. But fuse acks the writeback right
after the ->writepages callback exits w/o waiting for the real write to
happen. Thus the subsequent direct op proceeds while the real writeback
is still in flight. This is a problem for backends that reorder operation.

Fix this by making the fuse direct IO callback explicitly wait on the
in-flight writeback to finish.

Changed in v2:
 - do not wait on writeback if fuse_direct_io() call came from
   CUSE (because it doesn't use fuse inodes)

Signed-off-by: Maxim Patlasov <MPatlasov@parallels.com>
---
 fs/fuse/cuse.c   |    5 +++--
 fs/fuse/file.c   |   49 +++++++++++++++++++++++++++++++++++++++++++++++--
 fs/fuse/fuse_i.h |   13 ++++++++++++-
 3 files changed, 62 insertions(+), 5 deletions(-)

diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index aef34b1..b57b5ee 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -95,7 +95,7 @@ static ssize_t cuse_read(struct file *file, char __user *buf, size_t count,
 	struct iovec iov = { .iov_base = buf, .iov_len = count };
 	struct fuse_io_priv io = { .async = 0, .file = file };
 
-	return fuse_direct_io(&io, &iov, 1, count, &pos, 0);
+	return fuse_direct_io(&io, &iov, 1, count, &pos, FUSE_DIO_CUSE);
 }
 
 static ssize_t cuse_write(struct file *file, const char __user *buf,
@@ -109,7 +109,8 @@ static ssize_t cuse_write(struct file *file, const char __user *buf,
 	 * No locking or generic_write_checks(), the server is
 	 * responsible for locking and sanity checks.
 	 */
-	return fuse_direct_io(&io, &iov, 1, count, &pos, 1);
+	return fuse_direct_io(&io, &iov, 1, count, &pos,
+			      FUSE_DIO_WRITE | FUSE_DIO_CUSE);
 }
 
 static int cuse_open(struct inode *inode, struct file *file)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 86ef60f..ea45cf3 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -342,6 +342,31 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
 	return (u64) v0 + ((u64) v1 << 32);
 }
 
+static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
+				    pgoff_t idx_to)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_req *req;
+	bool found = false;
+
+	spin_lock(&fc->lock);
+	list_for_each_entry(req, &fi->writepages, writepages_entry) {
+		pgoff_t curr_index;
+
+		BUG_ON(req->inode != inode);
+		curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
+		if (!(idx_from >= curr_index + req->num_pages ||
+		      idx_to < curr_index)) {
+			found = true;
+			break;
+		}
+	}
+	spin_unlock(&fc->lock);
+
+	return found;
+}
+
 /*
  * Check if page is under writeback
  *
@@ -386,6 +411,19 @@ static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
 	return 0;
 }
 
+static void fuse_wait_on_writeback(struct inode *inode, pgoff_t start,
+				   size_t bytes)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	pgoff_t idx_from, idx_to;
+
+	idx_from = start >> PAGE_CACHE_SHIFT;
+	idx_to = (start + bytes - 1) >> PAGE_CACHE_SHIFT;
+
+	wait_event(fi->page_waitq,
+		   !fuse_range_is_writeback(inode, idx_from, idx_to));
+}
+
 static int fuse_flush(struct file *file, fl_owner_t id)
 {
 	struct inode *inode = file_inode(file);
@@ -1360,8 +1398,10 @@ static inline int fuse_iter_npages(const struct iov_iter *ii_p)
 
 ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
 		       unsigned long nr_segs, size_t count, loff_t *ppos,
-		       int write)
+		       int flags)
 {
+	int write = flags & FUSE_DIO_WRITE;
+	int cuse = flags & FUSE_DIO_CUSE;
 	struct file *file = io->file;
 	struct fuse_file *ff = file->private_data;
 	struct fuse_conn *fc = ff->fc;
@@ -1390,6 +1430,10 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
 			break;
 		}
 
+		if (!cuse)
+			fuse_wait_on_writeback(file->f_mapping->host, pos,
+					       nbytes);
+
 		if (write)
 			nres = fuse_send_write(req, io, pos, nbytes, owner);
 		else
@@ -1468,7 +1512,8 @@ static ssize_t __fuse_direct_write(struct fuse_io_priv *io,
 
 	res = generic_write_checks(file, ppos, &count, 0);
 	if (!res)
-		res = fuse_direct_io(io, iov, nr_segs, count, ppos, 1);
+		res = fuse_direct_io(io, iov, nr_segs, count, ppos,
+				     FUSE_DIO_WRITE);
 
 	fuse_invalidate_attr(inode);
 
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 82abb82..f969b22 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -859,9 +859,20 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
 
 int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
 		 bool isdir);
+
+/**
+ * fuse_direct_io() flags
+ */
+
+/** If set, it is WRITE; otherwise - READ */
+#define FUSE_DIO_WRITE (1 << 0)
+
+/** CUSE pass fuse_direct_io() a file which f_mapping->host is not from FUSE */
+#define FUSE_DIO_CUSE  (1 << 1)
+
 ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
 		       unsigned long nr_segs, size_t count, loff_t *ppos,
-		       int write);
+		       int flags);
 long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
 		   unsigned int flags);
 long fuse_ioctl_common(struct file *file, unsigned int cmd,


WARNING: multiple messages have this Message-ID (diff)
From: Maxim Patlasov <MPatlasov@parallels.com>
To: miklos@szeredi.hu
Cc: riel@redhat.com, dev@parallels.com, xemul@parallels.com,
	fuse-devel@lists.sourceforge.net, bfoster@redhat.com,
	linux-kernel@vger.kernel.org, jbottomley@parallels.com,
	linux-mm@kvack.org, viro@zeniv.linux.org.uk,
	linux-fsdevel@vger.kernel.org, akpm@linux-foundation.org,
	fengguang.wu@intel.com, devel@openvz.org, mgorman@suse.de
Subject: [PATCH 14/16] fuse: Fix O_DIRECT operations vs cached writeback misorder - v2
Date: Sat, 29 Jun 2013 21:46:49 +0400	[thread overview]
Message-ID: <20130629174633.20175.16915.stgit@maximpc.sw.ru> (raw)
In-Reply-To: <20130629172211.20175.70154.stgit@maximpc.sw.ru>

From: Pavel Emelyanov <xemul@openvz.org>

The problem is:

1. write cached data to a file
2. read directly from the same file (via another fd)

The 2nd operation may read stale data, i.e. the one that was in a file
before the 1st op. Problem is in how fuse manages writeback.

When direct op occurs the core kernel code calls filemap_write_and_wait
to flush all the cached ops in flight. But fuse acks the writeback right
after the ->writepages callback exits w/o waiting for the real write to
happen. Thus the subsequent direct op proceeds while the real writeback
is still in flight. This is a problem for backends that reorder operation.

Fix this by making the fuse direct IO callback explicitly wait on the
in-flight writeback to finish.

Changed in v2:
 - do not wait on writeback if fuse_direct_io() call came from
   CUSE (because it doesn't use fuse inodes)

Signed-off-by: Maxim Patlasov <MPatlasov@parallels.com>
---
 fs/fuse/cuse.c   |    5 +++--
 fs/fuse/file.c   |   49 +++++++++++++++++++++++++++++++++++++++++++++++--
 fs/fuse/fuse_i.h |   13 ++++++++++++-
 3 files changed, 62 insertions(+), 5 deletions(-)

diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index aef34b1..b57b5ee 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -95,7 +95,7 @@ static ssize_t cuse_read(struct file *file, char __user *buf, size_t count,
 	struct iovec iov = { .iov_base = buf, .iov_len = count };
 	struct fuse_io_priv io = { .async = 0, .file = file };
 
-	return fuse_direct_io(&io, &iov, 1, count, &pos, 0);
+	return fuse_direct_io(&io, &iov, 1, count, &pos, FUSE_DIO_CUSE);
 }
 
 static ssize_t cuse_write(struct file *file, const char __user *buf,
@@ -109,7 +109,8 @@ static ssize_t cuse_write(struct file *file, const char __user *buf,
 	 * No locking or generic_write_checks(), the server is
 	 * responsible for locking and sanity checks.
 	 */
-	return fuse_direct_io(&io, &iov, 1, count, &pos, 1);
+	return fuse_direct_io(&io, &iov, 1, count, &pos,
+			      FUSE_DIO_WRITE | FUSE_DIO_CUSE);
 }
 
 static int cuse_open(struct inode *inode, struct file *file)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 86ef60f..ea45cf3 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -342,6 +342,31 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
 	return (u64) v0 + ((u64) v1 << 32);
 }
 
+static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
+				    pgoff_t idx_to)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_req *req;
+	bool found = false;
+
+	spin_lock(&fc->lock);
+	list_for_each_entry(req, &fi->writepages, writepages_entry) {
+		pgoff_t curr_index;
+
+		BUG_ON(req->inode != inode);
+		curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
+		if (!(idx_from >= curr_index + req->num_pages ||
+		      idx_to < curr_index)) {
+			found = true;
+			break;
+		}
+	}
+	spin_unlock(&fc->lock);
+
+	return found;
+}
+
 /*
  * Check if page is under writeback
  *
@@ -386,6 +411,19 @@ static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
 	return 0;
 }
 
+static void fuse_wait_on_writeback(struct inode *inode, pgoff_t start,
+				   size_t bytes)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	pgoff_t idx_from, idx_to;
+
+	idx_from = start >> PAGE_CACHE_SHIFT;
+	idx_to = (start + bytes - 1) >> PAGE_CACHE_SHIFT;
+
+	wait_event(fi->page_waitq,
+		   !fuse_range_is_writeback(inode, idx_from, idx_to));
+}
+
 static int fuse_flush(struct file *file, fl_owner_t id)
 {
 	struct inode *inode = file_inode(file);
@@ -1360,8 +1398,10 @@ static inline int fuse_iter_npages(const struct iov_iter *ii_p)
 
 ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
 		       unsigned long nr_segs, size_t count, loff_t *ppos,
-		       int write)
+		       int flags)
 {
+	int write = flags & FUSE_DIO_WRITE;
+	int cuse = flags & FUSE_DIO_CUSE;
 	struct file *file = io->file;
 	struct fuse_file *ff = file->private_data;
 	struct fuse_conn *fc = ff->fc;
@@ -1390,6 +1430,10 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
 			break;
 		}
 
+		if (!cuse)
+			fuse_wait_on_writeback(file->f_mapping->host, pos,
+					       nbytes);
+
 		if (write)
 			nres = fuse_send_write(req, io, pos, nbytes, owner);
 		else
@@ -1468,7 +1512,8 @@ static ssize_t __fuse_direct_write(struct fuse_io_priv *io,
 
 	res = generic_write_checks(file, ppos, &count, 0);
 	if (!res)
-		res = fuse_direct_io(io, iov, nr_segs, count, ppos, 1);
+		res = fuse_direct_io(io, iov, nr_segs, count, ppos,
+				     FUSE_DIO_WRITE);
 
 	fuse_invalidate_attr(inode);
 
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 82abb82..f969b22 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -859,9 +859,20 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
 
 int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
 		 bool isdir);
+
+/**
+ * fuse_direct_io() flags
+ */
+
+/** If set, it is WRITE; otherwise - READ */
+#define FUSE_DIO_WRITE (1 << 0)
+
+/** CUSE pass fuse_direct_io() a file which f_mapping->host is not from FUSE */
+#define FUSE_DIO_CUSE  (1 << 1)
+
 ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
 		       unsigned long nr_segs, size_t count, loff_t *ppos,
-		       int write);
+		       int flags);
 long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
 		   unsigned int flags);
 long fuse_ioctl_common(struct file *file, unsigned int cmd,

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  parent reply	other threads:[~2013-06-29 17:47 UTC|newest]

Thread overview: 79+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-06-29 17:41 [PATCH v5 00/16] fuse: An attempt to implement a write-back cache policy Maxim Patlasov
2013-06-29 17:41 ` Maxim Patlasov
2013-06-29 17:42 ` [PATCH 01/16] fuse: Linking file to inode helper Maxim Patlasov
2013-06-29 17:42   ` Maxim Patlasov
2013-06-29 17:42 ` [PATCH 02/16] fuse: Getting file for writeback helper Maxim Patlasov
2013-06-29 17:42   ` Maxim Patlasov
2013-06-29 17:42 ` [PATCH 03/16] fuse: Prepare to handle short reads Maxim Patlasov
2013-06-29 17:42   ` Maxim Patlasov
2013-06-29 17:42   ` Maxim Patlasov
2013-06-29 17:42 ` [PATCH 04/16] fuse: Prepare to handle multiple pages in writeback Maxim Patlasov
2013-06-29 17:42   ` Maxim Patlasov
2013-06-29 17:42 ` [PATCH 05/16] fuse: Connection bit for enabling writeback Maxim Patlasov
2013-06-29 17:42   ` Maxim Patlasov
2013-06-29 17:44 ` [PATCH 06/16] fuse: Trust kernel i_size only - v4 Maxim Patlasov
2013-06-29 17:44   ` Maxim Patlasov
2013-06-29 17:44 ` [PATCH 07/16] fuse: Trust kernel i_mtime only Maxim Patlasov
2013-06-29 17:44   ` Maxim Patlasov
2013-07-11 16:14   ` [PATCH 07/16] fuse: Trust kernel i_mtime only -v2 Maxim Patlasov
2013-06-29 17:45 ` [PATCH 08/16] fuse: Flush files on wb close Maxim Patlasov
2013-06-29 17:45   ` Maxim Patlasov
2013-07-11 16:18   ` [PATCH 08/16] fuse: Flush files on wb close -v2 Maxim Patlasov
2013-06-29 17:45 ` [PATCH 09/16] fuse: restructure fuse_readpage() Maxim Patlasov
2013-06-29 17:45   ` Maxim Patlasov
2013-06-29 17:45   ` Maxim Patlasov
2013-06-29 17:45 ` [PATCH 10/16] fuse: Implement writepages callback Maxim Patlasov
2013-06-29 17:45   ` Maxim Patlasov
2013-07-19 16:50   ` Miklos Szeredi
2013-07-19 16:50     ` Miklos Szeredi
2013-08-02 15:40     ` Maxim Patlasov
2013-08-02 15:40       ` Maxim Patlasov
2013-08-02 15:40       ` Maxim Patlasov
2013-08-06 16:25       ` Miklos Szeredi
2013-08-06 16:25         ` Miklos Szeredi
2013-08-06 16:26         ` Eric Boxer
2013-08-09 15:02         ` Maxim Patlasov
2013-08-09 15:02           ` Maxim Patlasov
2013-08-09 15:02           ` Maxim Patlasov
2013-08-30 10:12           ` Miklos Szeredi
2013-08-30 10:12             ` Miklos Szeredi
2013-08-30 10:12             ` Miklos Szeredi
2013-08-30 14:50             ` Maxim Patlasov
2013-08-30 14:50               ` Maxim Patlasov
2013-08-30 14:50               ` Maxim Patlasov
2013-09-03 10:31               ` Miklos Szeredi
2013-09-03 10:31                 ` Miklos Szeredi
2013-09-03 10:31                 ` Miklos Szeredi
2013-09-03 16:02                 ` Maxim Patlasov
2013-09-03 16:02                   ` Maxim Patlasov
2013-09-03 16:02                   ` Maxim Patlasov
2013-06-29 17:45 ` [PATCH 11/16] fuse: Implement write_begin/write_end callbacks Maxim Patlasov
2013-06-29 17:45   ` Maxim Patlasov
2013-06-29 17:46 ` [PATCH 12/16] fuse: fuse_writepage_locked() should wait on writeback Maxim Patlasov
2013-06-29 17:46   ` Maxim Patlasov
2013-06-29 17:46 ` [PATCH 13/16] fuse: fuse_flush() " Maxim Patlasov
2013-06-29 17:46   ` Maxim Patlasov
2013-06-29 17:46 ` Maxim Patlasov [this message]
2013-06-29 17:46   ` [PATCH 14/16] fuse: Fix O_DIRECT operations vs cached writeback misorder - v2 Maxim Patlasov
2013-06-29 17:47 ` [PATCH 15/16] fuse: Turn writeback cache on Maxim Patlasov
2013-06-29 17:47   ` Maxim Patlasov
2013-06-29 17:48 ` [PATCH 16/16] mm: strictlimit feature Maxim Patlasov
2013-06-29 17:48   ` Maxim Patlasov
2013-07-01 21:16   ` Andrew Morton
2013-07-01 21:16     ` Andrew Morton
2013-07-02  8:33     ` Maxim Patlasov
2013-07-02  8:33       ` Maxim Patlasov
2013-07-02  8:33       ` Maxim Patlasov
2013-07-02 17:44   ` [PATCH] mm: strictlimit feature -v2 Maxim Patlasov
2013-07-02 17:44     ` Maxim Patlasov
2013-07-02 19:38     ` Andrew Morton
2013-07-02 19:38       ` Andrew Morton
2013-07-03 11:01       ` Maxim Patlasov
2013-07-03 11:01         ` Maxim Patlasov
2013-07-03 11:01         ` Maxim Patlasov
2013-07-03 23:16         ` Jan Kara
2013-07-03 23:16           ` Jan Kara
2013-07-03 23:16           ` Jan Kara
2013-07-05 13:14           ` Maxim Patlasov
2013-07-05 13:14             ` Maxim Patlasov
2013-07-05 13:14             ` Maxim Patlasov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20130629174633.20175.16915.stgit@maximpc.sw.ru \
    --to=mpatlasov@parallels.com \
    --cc=akpm@linux-foundation.org \
    --cc=bfoster@redhat.com \
    --cc=dev@parallels.com \
    --cc=devel@openvz.org \
    --cc=fengguang.wu@intel.com \
    --cc=fuse-devel@lists.sourceforge.net \
    --cc=jbottomley@parallels.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mgorman@suse.de \
    --cc=miklos@szeredi.hu \
    --cc=riel@redhat.com \
    --cc=viro@zeniv.linux.org.uk \
    --cc=xemul@parallels.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.