* [PATCH v3] vfs: fix a bug when we do some dio reads with append dio writes
@ 2013-12-07 10:55 Zheng Liu
  2013-12-16  9:37 ` Steven Whitehouse
  0 siblings, 1 reply; 21+ messages in thread
From: Zheng Liu @ 2013-12-07 10:55 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Jan Kara, Christoph Hellwig, Dmitry Monakhov, Dave Chinner,
	Alexander Viro, Zheng Liu

From: Zheng Liu <wenqing.lz@taobao.com>

Currently we check i_size in the buffered read path after we know the page
is uptodate.  But this can return zero-filled pages to userspace when the
read races with append dio writes.  We can use the following code snippet
to reproduce the problem on an ext2/3/4 filesystem.

code snippet:
  #define _GNU_SOURCE

  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include <memory.h>

  #include <unistd.h>
  #include <fcntl.h>
  #include <sys/types.h>
  #include <sys/stat.h>
  #include <errno.h>

  #include <pthread.h>

  #define BUF_ALIGN	1024

  struct writer_data {
  	int fd;
  	size_t blksize;
  	char *buf;
  };

  static void *writer(void *arg)
  {
  	struct writer_data *data = (struct writer_data *)arg;
  	int ret;

  	ret = write(data->fd, data->buf, data->blksize);
  	if (ret < 0)
  		fprintf(stderr, "write file failed: %s\n", strerror(errno));

  	return NULL;
  }

  int main(int argc, char *argv[])
  {
  	pthread_t tid;
  	struct writer_data wdata;
  	size_t max_blocks = 10 * 1024;
  	size_t blksize = 1 * 1024 * 1024;
  	char *rbuf, *wbuf;
  	int readfd, writefd;
  	int i, j;

  	if (argc < 2) {
  		fprintf(stderr, "usage: %s [filename]\n", argv[0]);
  		exit(1);
  	}

  	writefd = open(argv[1], O_CREAT|O_DIRECT|O_WRONLY|O_APPEND|O_TRUNC, S_IRWXU);
  	if (writefd < 0) {
  		fprintf(stderr, "failed to open wfile: %s\n", strerror(errno));
  		exit(1);
  	}
  	readfd = open(argv[1], O_DIRECT|O_RDONLY, S_IRWXU);
  	if (readfd < 0) {
  		fprintf(stderr, "failed to open rfile: %s\n", strerror(errno));
  		exit(1);
  	}

  	if (posix_memalign((void **)&wbuf, BUF_ALIGN, blksize)) {
  		fprintf(stderr, "failed to alloc memory: %s\n", strerror(errno));
  		exit(1);
  	}

  	if (posix_memalign((void **)&rbuf, 4096, blksize)) {
  		fprintf(stderr, "failed to alloc memory: %s\n", strerror(errno));
  		exit(1);
  	}

  	memset(wbuf, 'a', blksize);

  	wdata.fd = writefd;
  	wdata.blksize = blksize;
  	wdata.buf = wbuf;

  	for (i = 0; i < max_blocks; i++) {
  		void *retval;
  		int ret;

  		ret = pthread_create(&tid, NULL, writer, &wdata);
  		if (ret) {
  			fprintf(stderr, "create thread failed: %s\n", strerror(errno));
  			exit(1);
  		}

  		memset(rbuf, 'b', blksize);
  		/* poll until the writer has extended the file past this block */
  		do {
  			ret = pread(readfd, rbuf, blksize, i * blksize);
  		} while (ret <= 0);

  		if (ret < 0) {
  			fprintf(stderr, "read file failed: %s\n", strerror(errno));
  			exit(1);
  		}

  		if (pthread_join(tid, &retval)) {
  			fprintf(stderr, "pthread join failed: %s\n", strerror(errno));
  			exit(1);
  		}

  		if (ret >= 0) {
  			for (j = 0; j < ret; j++) {
  				if (rbuf[j] != 'a') {
  				fprintf(stderr, "encounter an error: offset %d\n",
  					i);
  					goto err;
  				}
  			}
  		}
  	}

  err:
  	free(wbuf);
  	free(rbuf);

  	return 0;
  }

build & run:
  $ gcc code.c -o code -lpthread # build
  $ ./code ${filename} # run

As expected, we should read either nothing or data filled with 'a'.  But
instead we read zero-filled data.  I took a closer look at the code and it
seems that there is a bug in the vfs.  Let me describe my findings here.

  reader					writer
                                                generic_file_aio_write()
                                                ->__generic_file_aio_write()
                                                  ->generic_file_direct_write()
  generic_file_aio_read()
  ->do_generic_file_read()
    [fallback to buffered read]

    ->find_get_page()
    ->page_cache_sync_readahead()
    ->find_get_page()
    [in the find_page label, we can't find the
     page either before or after calling
     page_cache_sync_readahead(), so we go to
     the no_cached_page label]

    ->page_cache_alloc_cold()
    ->add_to_page_cache_lru()
    [in the no_cached_page label, we allocate a
     page and go to the readpage label]

    ->aops->readpage()
    [in the readpage label, the readpage()
     callback is called and mpage_readpage()
     leaves the page zero-filled (e.g. on
     ext3/4), and we go to the page_ok label]

                                                  ->a_ops->direct_IO()
                                                  ->i_size_write()
                                                  [we enlarge the i_size]

    Here we check i_size
    [in the page_ok label, we check i_size,
     but it has already been enlarged.  Thus
     we pass the check and return a
     zero-filled page]

This commit first trims desc->count to avoid reading too much data when
(*ppos + desc->count) >= i_size, and then returns directly if desc->count
== 0.  This fixes the problem above.  It also covers the case where the
reader does buffered reads while another task does append dio writes.

Cc: Christoph Hellwig <hch@infradead.org>
Cc: Dmitry Monakhov <dmonakhov@openvz.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
---
changelog:
v3: 
 * Rebase against 3.13-rc3
 * Add 'Reviewed-by' tag because Jan has reviewed this patch

						- Zheng
 mm/filemap.c |    9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index b7749a92021c..622d49ac2a24 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1108,18 +1108,25 @@ static void do_generic_file_read(struct file *filp, loff_t *ppos,
 	pgoff_t prev_index;
 	unsigned long offset;      /* offset into pagecache page */
 	unsigned int prev_offset;
+	loff_t isize;
 	int error;
 
+	/* trim desc->count to avoid exposing stale data to userspace */
+	isize = i_size_read(inode);
+	if (*ppos + desc->count >= isize)
+		desc->count = isize - *ppos;
 	index = *ppos >> PAGE_CACHE_SHIFT;
 	prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
 	prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
 	last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
 	offset = *ppos & ~PAGE_CACHE_MASK;
 
+	if (desc->count == 0)
+		goto out;
+
 	for (;;) {
 		struct page *page;
 		pgoff_t end_index;
-		loff_t isize;
 		unsigned long nr, ret;
 
 		cond_resched();
-- 
1.7.9.7


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] vfs: fix a bug when we do some dio reads with append dio writes
  2013-12-07 10:55 [PATCH v3] vfs: fix a bug when we do some dio reads with append dio writes Zheng Liu
@ 2013-12-16  9:37 ` Steven Whitehouse
  2013-12-16 10:01   ` Jan Kara
  0 siblings, 1 reply; 21+ messages in thread
From: Steven Whitehouse @ 2013-12-16  9:37 UTC (permalink / raw)
  To: Zheng Liu
  Cc: linux-fsdevel, Jan Kara, Christoph Hellwig, Dmitry Monakhov,
	Dave Chinner, Alexander Viro, Zheng Liu

Hi,

On Sat, 2013-12-07 at 18:55 +0800, Zheng Liu wrote:
> From: Zheng Liu <wenqing.lz@taobao.com>
> 
> Currently we check i_size in buffered read path after we know the page
> is update.  But it could return some zero-filled pages to the userspace
> when we do some append dio writes.  We could use the following code
> snippet to reproduce this problem in a ext2/3/4 filesystem.
> 
If the page is not uptodate, then neither (potentially) is i_size, since
the underlying fs has not had a chance to get its own locks and update
the inode size.

I suspect that the correct fix would be to implement ->launder_page to
avoid the race that you've identified here, if I've understood it
correctly,

Steve.



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] vfs: fix a bug when we do some dio reads with append dio writes
  2013-12-16  9:37 ` Steven Whitehouse
@ 2013-12-16 10:01   ` Jan Kara
  2013-12-17  9:43     ` Steven Whitehouse
  0 siblings, 1 reply; 21+ messages in thread
From: Jan Kara @ 2013-12-16 10:01 UTC (permalink / raw)
  To: Steven Whitehouse
  Cc: Zheng Liu, linux-fsdevel, Jan Kara, Christoph Hellwig,
	Dmitry Monakhov, Dave Chinner, Alexander Viro, Zheng Liu

On Mon 16-12-13 09:37:34, Steven Whitehouse wrote:
> Hi,
> 
> On Sat, 2013-12-07 at 18:55 +0800, Zheng Liu wrote:
> > From: Zheng Liu <wenqing.lz@taobao.com>
> > 
> > Currently we check i_size in buffered read path after we know the page
> > is update.  But it could return some zero-filled pages to the userspace
> > when we do some append dio writes.  We could use the following code
> > snippet to reproduce this problem in a ext2/3/4 filesystem.
> > 
> If the page is not uptodate, then neither (potentially) is i_size, since
> the underlying fs has not had a chance to get its own locks and update
> the inode size.
  Hum, this isn't really about page being uptodate or not, is it? It is
more about the fact that gfs2 updates i_size only from gfs2_readpage()
function when obtaining inode lock. Or do you know about other filesystem
having the same problem as gfs2? E.g. ocfs2 updates i_size from
ocfs2_file_aio_read() before calling generic_file_aio_read(), cannot gfs2
do something similar?

> I suspect that the correct fix would be to implement ->launder_page to
> avoid the race that you've identified here, if I've understood it
> correctly,
  I don't understand how that would work. Can you elaborate a bit? Here the
problem is i_size gets extended by DIO write during buffered read (which is
a fallback from DIO read) so we return zero-filled page instead of either
data filled page or short read. I don't see where launder_page() would come
into the picture...

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] vfs: fix a bug when we do some dio reads with append dio writes
  2013-12-16 10:01   ` Jan Kara
@ 2013-12-17  9:43     ` Steven Whitehouse
  2013-12-17 11:16       ` Zheng Liu
  2013-12-17 14:01       ` [PATCH v3] vfs: fix a bug when we do some dio reads with append dio writes Jan Kara
  0 siblings, 2 replies; 21+ messages in thread
From: Steven Whitehouse @ 2013-12-17  9:43 UTC (permalink / raw)
  To: Jan Kara
  Cc: Zheng Liu, linux-fsdevel, Christoph Hellwig, Dmitry Monakhov,
	Dave Chinner, Alexander Viro, Zheng Liu

Hi,

On Mon, 2013-12-16 at 11:01 +0100, Jan Kara wrote:
> On Mon 16-12-13 09:37:34, Steven Whitehouse wrote:
> > Hi,
> > 
> > On Sat, 2013-12-07 at 18:55 +0800, Zheng Liu wrote:
> > > From: Zheng Liu <wenqing.lz@taobao.com>
> > > 
> > > Currently we check i_size in buffered read path after we know the page
> > > is update.  But it could return some zero-filled pages to the userspace
> > > when we do some append dio writes.  We could use the following code
> > > snippet to reproduce this problem in a ext2/3/4 filesystem.
> > > 
> > If the page is not uptodate, then neither (potentially) is i_size, since
> > the underlying fs has not had a chance to get its own locks and update
> > the inode size.
>   Hum, this isn't really about page being uptodate or not, is it? It is
> more about the fact that gfs2 updates i_size only from gfs2_readpage()
> function when obtaining inode lock. Or do you know about other filesystem
> having the same problem as gfs2? E.g. ocfs2 updates i_size from
> ocfs2_file_aio_read() before calling generic_file_aio_read(), cannot gfs2
> do something similar?
> 
Well we could do that, but I don't think it makes sense really. The
point of having the page cache is to put it "in front" of the expensive
operations where possible (such as getting cluster locks). I don't think
it makes sense to grab and drop a cluster lock on every call to
generic_file_aio_read() when, for cached pages, we would not even need
to call ->readpage.

The problem appears to be that somehow we are getting a page returned
when it should not be. So I suspect that the correct place to catch that
is in ->readpage(s)

> > I suspect that the correct fix would be to implement ->launder_page to
> > avoid the race that you've identified here, if I've understood it
> > correctly,
>   I don't understand how that would work. Can you elaborate a bit? Here the
> problem is i_size gets extended by DIO write during buffered read (which is
> a fallback from DIO read) so we return zero-filled page instead of either
> data filled page or short read. I don't see where launder_page() would come
> into the picture...
> 
> 								Honza

Ah, sorry. I was thinking the other way around wrt to
read/write :( However, I still don't think that generic_file_aio_read()
is the right place to fix this. I note that XFS seems to pass the test
and it also uses mpage_readpage and mpage_readpages as well as
generic_file_aio_read() so maybe that is a clue as to where the answer
lies. GFS2 seems to fail the test in the same way as ext3 at the moment.

The other question that we've not really answered is why it is useful to
mix buffered and direct i/o to the same file at the same time anyway.
While I agree that it ought to work, I'm not convinced that its
enormously useful, which is maybe why it has not been spotted before,

Steve.



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] vfs: fix a bug when we do some dio reads with append dio writes
  2013-12-17  9:43     ` Steven Whitehouse
@ 2013-12-17 11:16       ` Zheng Liu
  2013-12-17 12:17         ` Steven Whitehouse
  2013-12-17 14:01       ` [PATCH v3] vfs: fix a bug when we do some dio reads with append dio writes Jan Kara
  1 sibling, 1 reply; 21+ messages in thread
From: Zheng Liu @ 2013-12-17 11:16 UTC (permalink / raw)
  To: Steven Whitehouse
  Cc: Jan Kara, linux-fsdevel, Christoph Hellwig, Dmitry Monakhov,
	Dave Chinner, Alexander Viro, Zheng Liu

Hi Steven,

On Tue, Dec 17, 2013 at 09:43:42AM +0000, Steven Whitehouse wrote:
> Hi,
> 
> On Mon, 2013-12-16 at 11:01 +0100, Jan Kara wrote:
> > On Mon 16-12-13 09:37:34, Steven Whitehouse wrote:
> > > Hi,
> > > 
> > > On Sat, 2013-12-07 at 18:55 +0800, Zheng Liu wrote:
> > > > From: Zheng Liu <wenqing.lz@taobao.com>
> > > > 
> > > > Currently we check i_size in buffered read path after we know the page
> > > > is update.  But it could return some zero-filled pages to the userspace
> > > > when we do some append dio writes.  We could use the following code
> > > > snippet to reproduce this problem in a ext2/3/4 filesystem.
> > > > 
> > > If the page is not uptodate, then neither (potentially) is i_size, since
> > > the underlying fs has not had a chance to get its own locks and update
> > > the inode size.
> >   Hum, this isn't really about page being uptodate or not, is it? It is
> > more about the fact that gfs2 updates i_size only from gfs2_readpage()
> > function when obtaining inode lock. Or do you know about other filesystem
> > having the same problem as gfs2? E.g. ocfs2 updates i_size from
> > ocfs2_file_aio_read() before calling generic_file_aio_read(), cannot gfs2
> > do something similar?
> > 
> Well we could do that, but I don't think it makes sense really. The
> point of having the page cache is to put it "in front" of the expensive
> operations where possible (such as getting cluster locks). I don't think
> it makes sense to grab and drop a cluster lock on every call to
> generic_file_aio_read() when, for cached pages, we would not even need
> to call ->readpage.
> 
> The problem appears to be that somehow we are getting a page returned
> when it should not be. So I suspect that the correct place to catch that
> is in ->readpage(s)
> 
> > > I suspect that the correct fix would be to implement ->launder_page to
> > > avoid the race that you've identified here, if I've understood it
> > > correctly,
> >   I don't understand how that would work. Can you elaborate a bit? Here the
> > problem is i_size gets extended by DIO write during buffered read (which is
> > a fallback from DIO read) so we return zero-filled page instead of either
> > data filled page or short read. I don't see where launder_page() would come
> > into the picture...
> > 
> > 								Honza
> 
> Ah, sorry. I was thinking the other way around wrt to
> read/write :( However, I still don't think that generic_file_aio_read()
> is the right place to fix this. I note that XFS seems to pass the test
> and it also uses mpage_readpage and mpage_readpages as well as
> generic_file_aio_read() so maybe that is a clue as to where the answer
> lies. GFS2 seems to fail the test in the same way as ext3 at the moment.

Yes, xfs can pass the test under direct IO.  I suspect that the reason
is that xfs uses its ilock/iolock.  But it doesn't mean that all file
systems need to fix this issue by themselves, because the root cause is
in the vfs layer.

> 
> The other question that we've not really answered is why it is useful to
> mix buffered and direct i/o to the same file at the same time anyway.
> While I agree that it ought to work, I'm not convinced that its
> enormously useful, which is maybe why it has not been spotted before,

Yes, mixing buffered reads and direct writes seems to be useless.  But in
our production system, the issue appears under direct read/write.  The
application has a reader and a writer.  The reader should read either
nothing or the content that has been written by the writer.  But now the
reader gets a zero-filled page.  Hence the application needs to work
around this issue using a mutex or some other flag, which doesn't make
sense to the userspace programmer.

TBH, we found this issue two years ago but didn't take care of it at the
time.  So currently the application just works around it as I said above,
and it also issues fsync(2) to flush the dirty pages in order to make sure
the data has reached the disk.  Obviously, the performance of this
application is impacted by this.  Now we know that we don't need to flush
the dirty pages, but I think we'd better fix the underlying problem.
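
(For illustration only, a minimal sketch of that kind of userspace
workaround, assuming a pthread-based reader/writer similar to the
reproducer above; the publish/wait helpers are hypothetical, not the
actual application code:)

  #include <pthread.h>
  #include <unistd.h>

  static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
  static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
  static long blocks_done;	/* blocks known to be safely on disk */

  /* writer side: publish a block only after write() + fsync() */
  static void publish_block(int fd)
  {
  	/* ... write() one block with O_DIRECT|O_APPEND here ... */
  	fsync(fd);		/* the costly flush mentioned above */
  	pthread_mutex_lock(&lock);
  	blocks_done++;
  	pthread_cond_broadcast(&cond);
  	pthread_mutex_unlock(&lock);
  }

  /* reader side: wait until a block is published before pread()ing it */
  static void wait_for_block(long blkno)
  {
  	pthread_mutex_lock(&lock);
  	while (blocks_done <= blkno)
  		pthread_cond_wait(&cond, &lock);
  	pthread_mutex_unlock(&lock);
  }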

Regards,
                                                - Zheng

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] vfs: fix a bug when we do some dio reads with append dio writes
  2013-12-17 11:16       ` Zheng Liu
@ 2013-12-17 12:17         ` Steven Whitehouse
  2013-12-17 16:41           ` Zheng Liu
  0 siblings, 1 reply; 21+ messages in thread
From: Steven Whitehouse @ 2013-12-17 12:17 UTC (permalink / raw)
  To: Zheng Liu
  Cc: Jan Kara, linux-fsdevel, Christoph Hellwig, Dmitry Monakhov,
	Dave Chinner, Alexander Viro, Zheng Liu

Hi,

On Tue, 2013-12-17 at 19:16 +0800, Zheng Liu wrote:
> Hi Steven,
> 
> On Tue, Dec 17, 2013 at 09:43:42AM +0000, Steven Whitehouse wrote:
> > Hi,
> > 
> > On Mon, 2013-12-16 at 11:01 +0100, Jan Kara wrote:
> > > On Mon 16-12-13 09:37:34, Steven Whitehouse wrote:
> > > > Hi,
> > > > 
> > > > On Sat, 2013-12-07 at 18:55 +0800, Zheng Liu wrote:
> > > > > From: Zheng Liu <wenqing.lz@taobao.com>
> > > > > 
> > > > > Currently we check i_size in buffered read path after we know the page
> > > > > is update.  But it could return some zero-filled pages to the userspace
> > > > > when we do some append dio writes.  We could use the following code
> > > > > snippet to reproduce this problem in a ext2/3/4 filesystem.
> > > > > 
> > > > If the page is not uptodate, then neither (potentially) is i_size, since
> > > > the underlying fs has not had a chance to get its own locks and update
> > > > the inode size.
> > >   Hum, this isn't really about page being uptodate or not, is it? It is
> > > more about the fact that gfs2 updates i_size only from gfs2_readpage()
> > > function when obtaining inode lock. Or do you know about other filesystem
> > > having the same problem as gfs2? E.g. ocfs2 updates i_size from
> > > ocfs2_file_aio_read() before calling generic_file_aio_read(), cannot gfs2
> > > do something similar?
> > > 
> > Well we could do that, but I don't think it makes sense really. The
> > point of having the page cache is to put it "in front" of the expensive
> > operations where possible (such as getting cluster locks). I don't think
> > it makes sense to grab and drop a cluster lock on every call to
> > generic_file_aio_read() when, for cached pages, we would not even need
> > to call ->readpage.
> > 
> > The problem appears to be that somehow we are getting a page returned
> > when it should not be. So I suspect that the correct place to catch that
> > is in ->readpage(s)
> > 
> > > > I suspect that the correct fix would be to implement ->launder_page to
> > > > avoid the race that you've identified here, if I've understood it
> > > > correctly,
> > >   I don't understand how that would work. Can you elaborate a bit? Here the
> > > problem is i_size gets extended by DIO write during buffered read (which is
> > > a fallback from DIO read) so we return zero-filled page instead of either
> > > data filled page or short read. I don't see where launder_page() would come
> > > into the picture...
> > > 
> > > 								Honza
> > 
> > Ah, sorry. I was thinking the other way around wrt to
> > read/write :( However, I still don't think that generic_file_aio_read()
> > is the right place to fix this. I note that XFS seems to pass the test
> > and it also uses mpage_readpage and mpage_readpages as well as
> > generic_file_aio_read() so maybe that is a clue as to where the answer
> > lies. GFS2 seems to fail the test in the same way as ext3 at the moment.
> 
> Yes, xfs can pass the test under direct IO.  I suspect that the reason
> is that xfs uses ilock/iolock.  But it doesn't means that all file
> systems need to fix this issue by themselves because the root cause is
> in vfs layer.
>
I'm not convinced that this is the case though. The VFS is asking the fs
to provide a page via ->readpage(s) and therefore the fs should not be
providing a page (presumably marked uptodate in this case) where one
does not already exist, and if it does exist, then the content of the
page should be correct.

I don't see how this test can be done at a point before the fs has a
chance to get its own locks, and thus ensuring that the inode size is
actually correct.

However, I'd like to understand what is going on at ->readpage level,
and thus why we get this page generated incorrectly, and what actual i/o
gets generated (or not) in the problematic case.

> > 
> > The other question that we've not really answered is why it is useful to
> > mix buffered and direct i/o to the same file at the same time anyway.
> > While I agree that it ought to work, I'm not convinced that its
> > enormously useful, which is maybe why it has not been spotted before,
> 
> Yes, mixed buffered read and direct write seems to be useless.  But in
> our product system, the issue will appear under direct read/write.  The
> application has a reader and a writer.  The reader should read nothing
> or the content that has been written by writer.  But now the reader gets
> the zero-filled page.  Hence the application needs to prevent this issue
> using a mutex or other flag.  It doesn't make sense to the userspace
> programmer.
> 
I'm not sure I fully understand what you are saying here... if you see
the same issue using only O_DIRECT reads and writes, how are you getting
that?

> TBH, we have found this issue two year ago but we don't take care of it.
> So currently the application just solves it as I said above.  But it will
> issue a flush calling fsync(2) to flush the dirty page in order to make
> sure the data has been flushed into the disk.  Obviously, the performance
> of this application is impacted by this issue.  Now we have known that we
> don't need to flush the dirty page but I think we'd better fix it.
> 
> Regards,
>                                                 - Zheng

Calling fsync is a separate issue though, since that is required anyway
if you want to be sure that the data has actually reached the disk,

Steve.




^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] vfs: fix a bug when we do some dio reads with append dio writes
  2013-12-17  9:43     ` Steven Whitehouse
  2013-12-17 11:16       ` Zheng Liu
@ 2013-12-17 14:01       ` Jan Kara
  2013-12-17 15:32         ` Steven Whitehouse
  1 sibling, 1 reply; 21+ messages in thread
From: Jan Kara @ 2013-12-17 14:01 UTC (permalink / raw)
  To: Steven Whitehouse
  Cc: Jan Kara, Zheng Liu, linux-fsdevel, Christoph Hellwig,
	Dmitry Monakhov, Dave Chinner, Alexander Viro, Zheng Liu

  Hi,

On Tue 17-12-13 09:43:42, Steven Whitehouse wrote:
> On Mon, 2013-12-16 at 11:01 +0100, Jan Kara wrote:
> > On Mon 16-12-13 09:37:34, Steven Whitehouse wrote:
> > > Hi,
> > > 
> > > On Sat, 2013-12-07 at 18:55 +0800, Zheng Liu wrote:
> > > > From: Zheng Liu <wenqing.lz@taobao.com>
> > > > 
> > > > Currently we check i_size in buffered read path after we know the page
> > > > is update.  But it could return some zero-filled pages to the userspace
> > > > when we do some append dio writes.  We could use the following code
> > > > snippet to reproduce this problem in a ext2/3/4 filesystem.
> > > > 
> > > If the page is not uptodate, then neither (potentially) is i_size, since
> > > the underlying fs has not had a chance to get its own locks and update
> > > the inode size.
> >   Hum, this isn't really about page being uptodate or not, is it? It is
> > more about the fact that gfs2 updates i_size only from gfs2_readpage()
> > function when obtaining inode lock. Or do you know about other filesystem
> > having the same problem as gfs2? E.g. ocfs2 updates i_size from
> > ocfs2_file_aio_read() before calling generic_file_aio_read(), cannot gfs2
> > do something similar?
> > 
> Well we could do that, but I don't think it makes sense really. The
> point of having the page cache is to put it "in front" of the expensive
> operations where possible (such as getting cluster locks). I don't think
> it makes sense to grab and drop a cluster lock on every call to
> generic_file_aio_read() when, for cached pages, we would not even need
> to call ->readpage.
  Well, but in a cluster filesystem I fail to see how you want to
synchronize a read from the pagecache on one node with a truncate / write
on another node. Either the modification has to be broadcast to all nodes
having the file cached, or the read has to check whether the data cached in
the pagecache is still valid. How is gfs2 solving this?

> The problem appears to be that somehow we are getting a page returned
> when it should not be. So I suspect that the correct place to catch that
> is in ->readpage(s)
  There are two things which aren't ideal:
1) DIO writes and buffered reads can race resulting in reads returning
zero-filled pages. Fixing this corner case would slow down the common case
so we decided to put this race in the category of "don't do it when it
hurts".
2) DIO read falls back to buffered read on failure - this is there to
simplify life of filesystems which don't support DIO at all or do not
handle some special cases like DIO from holes, not block-aligned DIO etc.

Combination of these two leads to the problem that DIO read can return
zero-filled page when racing with DIO write. With enough luck this can
even lead to exposure of uninitialized data:
  DIO WRITE           BUFFERED READ
- allocates blocks
                      - sees blocks mapped so it reads data
- submits data write
- waits for IO completion
- extends i_size
                      - sees large i_size so hands over data to userspace

  Now without proper serialization of reads & writes this is hard to avoid
(we plan range locks for that but that's still blocked by some mm changes).
But checking i_size before reading data at least makes the race *much*
less serious (you have to have two threads modifying i_size in
parallel to read running and since these threads are synchronized on
i_mutex, some races become impossible, the ones returning 0s are still
possible but harder to trigger).

Now we could check i_size in ->readpage / ->readpages but conceptually that
seems like the wrong thing to do since readpage only cares about buffers
being mapped. OTOH generic_file_aio_read() et al already handle i_size
checks. And GFS2 seems to be the only fs that has problems with stale
i_size (OCFS2 and NFS both revalidate it in their ->aio_read callbacks).
So I'd be happier if we could handle this inside GFS2...
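
(For illustration, a hypothetical sketch of that revalidate-then-delegate
pattern against the 3.13-era API; the myfs_* names are made up and this is
not the actual ocfs2 or NFS code:)

  /* Hypothetical: refresh i_size under the filesystem's own lock before
   * handing the read off to the generic path. */
  static ssize_t myfs_file_aio_read(struct kiocb *iocb,
  				  const struct iovec *iov,
  				  unsigned long nr_segs, loff_t pos)
  {
  	struct inode *inode = file_inode(iocb->ki_filp);
  	int err;

  	/* hypothetical helper: take the cluster/inode lock, pull the
  	 * authoritative size into the VFS inode, drop the lock again */
  	err = myfs_revalidate_size(inode);
  	if (err)
  		return err;

  	return generic_file_aio_read(iocb, iov, nr_segs, pos);
  }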

> > > I suspect that the correct fix would be to implement ->launder_page to
> > > avoid the race that you've identified here, if I've understood it
> > > correctly,
> >   I don't understand how that would work. Can you elaborate a bit? Here the
> > problem is i_size gets extended by DIO write during buffered read (which is
> > a fallback from DIO read) so we return zero-filled page instead of either
> > data filled page or short read. I don't see where launder_page() would come
> > into the picture...
> > 
> > 								Honza
> 
> Ah, sorry. I was thinking the other way around wrt to
> read/write :( However, I still don't think that generic_file_aio_read()
> is the right place to fix this. I note that XFS seems to pass the test
> and it also uses mpage_readpage and mpage_readpages as well as
> generic_file_aio_read() so maybe that is a clue as to where the answer
> lies. GFS2 seems to fail the test in the same way as ext3 at the moment.
  It is a good question why XFS doesn't have the problem. Looking into
the code, XFS doesn't use i_mutex but rather its private rw-sem for
serialization (XFS_IOLOCK). Even reads hold the semaphore in shared mode,
and as the test program is written, DIO reads will hold it in exclusive
mode for a while as well. I guess that provides enough serialization that
the test program doesn't hit the race (although the race still seems
possible).

> The other question that we've not really answered is why it is useful to
> mix buffered and direct i/o to the same file at the same time anyway.
> While I agree that it ought to work, I'm not convinced that its
> enormously useful, which is maybe why it has not been spotted before,
  As I wrote above, mixing buffered and direct IO isn't really supported
(although we should fix the data exposure race). But DIO read vs DIO write
sounds sensible enough to support. And when DIO read falls back to buffered
read, we have problems...

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] vfs: fix a bug when we do some dio reads with append dio writes
  2013-12-17 14:01       ` [PATCH v3] vfs: fix a bug when we do some dio reads with append dio writes Jan Kara
@ 2013-12-17 15:32         ` Steven Whitehouse
  2013-12-17 16:39           ` Jan Kara
  0 siblings, 1 reply; 21+ messages in thread
From: Steven Whitehouse @ 2013-12-17 15:32 UTC (permalink / raw)
  To: Jan Kara
  Cc: Zheng Liu, linux-fsdevel, Christoph Hellwig, Dmitry Monakhov,
	Dave Chinner, Alexander Viro, Zheng Liu

Hi,

On Tue, 2013-12-17 at 15:01 +0100, Jan Kara wrote:
> Hi,
> 
> On Tue 17-12-13 09:43:42, Steven Whitehouse wrote:
> > On Mon, 2013-12-16 at 11:01 +0100, Jan Kara wrote:
> > > On Mon 16-12-13 09:37:34, Steven Whitehouse wrote:
> > > > Hi,
> > > > 
> > > > On Sat, 2013-12-07 at 18:55 +0800, Zheng Liu wrote:
> > > > > From: Zheng Liu <wenqing.lz@taobao.com>
> > > > > 
> > > > > Currently we check i_size in buffered read path after we know the page
> > > > > is update.  But it could return some zero-filled pages to the userspace
> > > > > when we do some append dio writes.  We could use the following code
> > > > > snippet to reproduce this problem in a ext2/3/4 filesystem.
> > > > > 
> > > > If the page is not uptodate, then neither (potentially) is i_size, since
> > > > the underlying fs has not had a chance to get its own locks and update
> > > > the inode size.
> > >   Hum, this isn't really about page being uptodate or not, is it? It is
> > > more about the fact that gfs2 updates i_size only from gfs2_readpage()
> > > function when obtaining inode lock. Or do you know about other filesystem
> > > having the same problem as gfs2? E.g. ocfs2 updates i_size from
> > > ocfs2_file_aio_read() before calling generic_file_aio_read(), cannot gfs2
> > > do something similar?
> > > 
> > Well we could do that, but I don't think it makes sense really. The
> > point of having the page cache is to put it "in front" of the expensive
> > operations where possible (such as getting cluster locks). I don't think
> > it makes sense to grab and drop a cluster lock on every call to
> > generic_file_aio_read() when, for cached pages, we would not even need
> > to call ->readpage.
>   Well, but in a cluster filesystem I fail to see how do you want to
> synchronize read from pagecache on one node with truncate / write on another
> node. Either the modification has to broadcast to all nodes having the file
> cached or read has to check whether the data cached in pagecache is still
> valid. How is gfs2 solving this?
> 

The same way that every other page cache based filesystem does it. If
there is a cached page that is uptodate, then it can be copied to the
user, and the page lock is used to ensure that the state doesn't change
under us. If there is no cached page, then the vfs has to ask the
filesystem to provide one, and then the cluster locks (glocks) are
taken. Those locks are then cached until another node, or memory
pressure results in them being dropped.

GFS2's internal locking via the glock layer will invalidate all cached
pages before it drops the glock to a mode other than shared or
exclusive. It will also write back all dirty data when dropping an
exclusive lock. For dio we use the "deferred" mode which is basically a
shared mode that is incompatible with the normal shared mode - it allows
shared caching of metadata, but no caching of data. GFS2 reverts to
buffered I/O in order to deal with cases where we need to allocate
blocks, including file extension.

> > The problem appears to be that somehow we are getting a page returned
> > when it should not be. So I suspect that the correct place to catch that
> > is in ->readpage(s)
>   There are two things which aren't ideal:
> 1) DIO writes and buffered reads can race resulting in reads returning
> zero-filled pages. Fixing this corner case would slow down the common case
> so we decided to put this race in the cathegory of "don't do it when it
> hurts".

Well if I alter the test program to use buffered reads, then it takes
much longer to hit the race. With the original O_DIRECT reads, then it
is not a great surprise that not all the write i/o has actually hit disk
at the time of the read, since as you say there is no synchronization
between the reads and writes.

So that makes sense to me.
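
(For reference, the alteration described above amounts to something like
the following change in the reproducer; I'm assuming only the read-side
open changes and the writer keeps O_DIRECT|O_APPEND:)

  	/* buffered-read variant: drop O_DIRECT from the read-side open */
  	readfd = open(argv[1], O_RDONLY);
  	if (readfd < 0) {
  		fprintf(stderr, "failed to open rfile: %s\n", strerror(errno));
  		exit(1);
  	}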

> 2) DIO read falls back to buffered read on failure - this is there to
> simplify life of filesystems which don't support DIO at all or do not
> handle some special cases like DIO from holes, not block-aligned DIO etc.
> 
> Combination of these two leads to the problem that DIO read can return
> zero-filled page when racing with DIO write. With enough luck this can
> even lead to exposure of uninitialized data:
>   DIO WRITE           BUFFERED READ
> - allocates blocks
>                       - sees blocks mapped so it reads data
> - submits data write
> - waits for IO completion
> - extends i_size
>                       - sees large i_size so hands over data to userspace
> 
>   Now without proper serialization of reads & writes this is hard to avoid
> (we plan range locks for that but that's still blocked by some mm changes).
> But checking i_size before reading data at least makes the race *much*
> less serious (you have to have two threads modifying i_size in
> parallel to read running and since these threads are synchronized on
> i_mutex, some races become impossible, the ones returning 0s are still
> possible but harder to trigger).
> 
Well I agree with pretty much all of that except that part about adding
in the extra i_size test. It may make the race harder to hit, but it
doesn't eliminate it. Instead it means that we'll have to add a bunch of
extra code into the fast path just to ensure that it is uptodate.

The reason that the i_size check that is already in
do_generic_file_read() is ok is that there is a locked page at the time
of the i_size_read call, as per the comment:

                /*
                 * i_size must be checked after we know the page is Uptodate.
                 *
                 * Checking i_size after the check allows us to calculate
                 * the correct value for "nr", which means the zero-filled
                 * part of the page is not copied back to userspace (unless
                 * another truncate extends the file - this is desired though).
                 */


> Now we could check i_size in ->readpage / ->readpages but conceptually that
> seems as a wrong thing to do since readpage only cares about buffers being
> mapped. OTOH generic_file_aio_read() et al already handles i_size checks.
> And GFS2 seems to be the only fs that has problems with stale i_size (OCFS2
> and NFS both revalidate it in their \.aio_read callbacks). So I'd be
> happier if we could handle this inside GFS2...
> 
I would argue that GFS2 is doing this the right way in this case though
- it makes no sense to put the i_size check into the fast path, when it
is only required in the case that there is no cached page (i.e the slow
path). In addition, putting it in the slow path (where the fs can take
its own locks) also fixes the race, rather than just making it more
difficult to hit.

So far as I can tell, the VFS code was originally intended to work in
exactly this way; however, it is not that obvious that this is the case,
so maybe better documentation is required here.

If we did add in some GFS2 code to update i_size ahead of
generic_file_aio_read() then we can only do that by grabbing and
dropping the glock. That is a relatively expensive operation, and still
doesn't fix the race as we must drop the glock before calling
generic_file_aio_read() since we must not hold glocks over the copy to
user stage.

Incidentally, for write calls, we do exactly that to update the i_size,
but only in the case that O_APPEND has been used, since we don't need to
do it until we get to write_begin/end otherwise.

I'm going to do a bit more investigation though, as I've not yet managed
to convince myself of exactly where the problem lies.

> > > > I suspect that the correct fix would be to implement ->launder_page to
> > > > avoid the race that you've identified here, if I've understood it
> > > > correctly,
> > >   I don't understand how that would work. Can you elaborate a bit? Here the
> > > problem is i_size gets extended by DIO write during buffered read (which is
> > > a fallback from DIO read) so we return zero-filled page instead of either
> > > data filled page or short read. I don't see where launder_page() would come
> > > into the picture...
> > > 
> > > 								Honza
> > 
> > Ah, sorry. I was thinking the other way around wrt to
> > read/write :( However, I still don't think that generic_file_aio_read()
> > is the right place to fix this. I note that XFS seems to pass the test
> > and it also uses mpage_readpage and mpage_readpages as well as
> > generic_file_aio_read() so maybe that is a clue as to where the answer
> > lies. GFS2 seems to fail the test in the same way as ext3 at the moment.
>   It is a good question why XFS doesn't have the problem. Looking into
> the code, XFS doesn't use i_mutex but rather it's private rw-sem for
> serialization (XFS_IOLOCK). Even reads hold the semaphore in shared mode
> and as the test program is written, DIO reads will hold it in exclusive
> mode for a while as well. A guess that provides enough serialization that
> the test program doesn't hit the race (although the race still seems
> possible).
> 
Yes, that sounds quite likely.

> > The other question that we've not really answered is why it is useful to
> > mix buffered and direct i/o to the same file at the same time anyway.
> > While I agree that it ought to work, I'm not convinced that its
> > enormously useful, which is maybe why it has not been spotted before,
>   As I wrote above, mixing buffered and direct IO isn't really supported
> (although we should fix the data exposure race). But DIO read vs DIO write
> sounds sensible enough to support. And when DIO read falls back to buffered
> read, we have problems...
> 
> 								Honza

Yes, agreed. I was really asking just to see if there was a use case
where this mixed use does make sense, as I couldn't immediately see why
someone would want to do that in the first place,

Steve.




^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] vfs: fix a bug when we do some dio reads with append dio writes
  2013-12-17 15:32         ` Steven Whitehouse
@ 2013-12-17 16:39           ` Jan Kara
  0 siblings, 0 replies; 21+ messages in thread
From: Jan Kara @ 2013-12-17 16:39 UTC (permalink / raw)
  To: Steven Whitehouse
  Cc: Jan Kara, Zheng Liu, linux-fsdevel, Christoph Hellwig,
	Dmitry Monakhov, Dave Chinner, Alexander Viro, Zheng Liu

  Hi,

On Tue 17-12-13 15:32:38, Steven Whitehouse wrote:
> On Tue, 2013-12-17 at 15:01 +0100, Jan Kara wrote:
> > On Tue 17-12-13 09:43:42, Steven Whitehouse wrote:
> > > On Mon, 2013-12-16 at 11:01 +0100, Jan Kara wrote:
> > > > On Mon 16-12-13 09:37:34, Steven Whitehouse wrote:
> > > > > Hi,
> > > > > 
> > > > > On Sat, 2013-12-07 at 18:55 +0800, Zheng Liu wrote:
> > > > > > From: Zheng Liu <wenqing.lz@taobao.com>
> > > > > > 
> > > > > > Currently we check i_size in buffered read path after we know the page
> > > > > > is update.  But it could return some zero-filled pages to the userspace
> > > > > > when we do some append dio writes.  We could use the following code
> > > > > > snippet to reproduce this problem in a ext2/3/4 filesystem.
> > > > > > 
> > > > > If the page is not uptodate, then neither (potentially) is i_size, since
> > > > > the underlying fs has not had a chance to get its own locks and update
> > > > > the inode size.
> > > >   Hum, this isn't really about page being uptodate or not, is it? It is
> > > > more about the fact that gfs2 updates i_size only from gfs2_readpage()
> > > > function when obtaining inode lock. Or do you know about other filesystem
> > > > having the same problem as gfs2? E.g. ocfs2 updates i_size from
> > > > ocfs2_file_aio_read() before calling generic_file_aio_read(), cannot gfs2
> > > > do something similar?
> > > > 
> > > Well we could do that, but I don't think it makes sense really. The
> > > point of having the page cache is to put it "in front" of the expensive
> > > operations where possible (such as getting cluster locks). I don't think
> > > it makes sense to grab and drop a cluster lock on every call to
> > > generic_file_aio_read() when, for cached pages, we would not even need
> > > to call ->readpage.
> >   Well, but in a cluster filesystem I fail to see how do you want to
> > synchronize read from pagecache on one node with truncate / write on another
> > node. Either the modification has to broadcast to all nodes having the file
> > cached or read has to check whether the data cached in pagecache is still
> > valid. How is gfs2 solving this?
> > 
> 
> The same way that every other page cache based filesystem does it. If
> there is a cached page that is uptodate, then it can be copied to the
> user, and the page lock is used to ensure that the state doesn't change
> under us.
  If a page is uptodate, we don't ever take the page lock (see
do_generic_file_read())...

> If there is no cached page, then the vfs has to ask the
> filesystem to provide one, and then the cluster locks (glocks) are
> taken. Those locks are then cached until another node, or memory
> pressure results in them being dropped.
> 
> GFS2's internal locking via the glock layer will invalidate all cached
> pages before it drops the glock to a mode other than shared or
> exclusive. It will also write back all dirty data when dropping an
> exclusive lock. For dio we use the "deferred" mode which is basically a
> shared mode that is incompatible with the normal shared mode - it allows
> shared caching of metadata, but no caching of data. GFS2 reverts to
> buffered I/O in order to deal with cases where we need to allocate
> blocks, including file extension.
  OK, thanks for explanation. Now I see how you avoid the races I was
thinking about.

> > > The problem appears to be that somehow we are getting a page returned
> > > when it should not be. So I suspect that the correct place to catch that
> > > is in ->readpage(s)
> >   There are two things which aren't ideal:
> > 1) DIO writes and buffered reads can race resulting in reads returning
> > zero-filled pages. Fixing this corner case would slow down the common case
> > so we decided to put this race in the cathegory of "don't do it when it
> > hurts".
> 
> Well if I alter the test program to use buffered reads, then it takes
> much longer to hit the race. With the original O_DIRECT reads, then it
> is not a great surprise that not all the write i/o has actually hit disk
> at the time of the read, since as you say there is no synchronization
> between the reads and writes.
> 
> So that makes sense to me.
> 
> > 2) DIO read falls back to buffered read on failure - this is there to
> > simplify life of filesystems which don't support DIO at all or do not
> > handle some special cases like DIO from holes, not block-aligned DIO etc.
> > 
> > Combination of these two leads to the problem that DIO read can return
> > zero-filled page when racing with DIO write. With enough luck this can
> > even lead to exposure of uninitialized data:
> >   DIO WRITE           BUFFERED READ
> > - allocates blocks
> >                       - sees blocks mapped so it reads data
> > - submits data write
> > - waits for IO completion
> > - extends i_size
> >                       - sees large i_size so hands over data to userspace
> > 
> >   Now without proper serialization of reads & writes this is hard to avoid
> > (we plan range locks for that but that's still blocked by some mm changes).
> > But checking i_size before reading data at least makes the race *much*
> > less serious (you have to have two threads modifying i_size in
> > parallel to read running and since these threads are synchronized on
> > i_mutex, some races become impossible, the ones returning 0s are still
> > possible but harder to trigger).
> > 
> Well I agree with pretty much all of that except that part about adding
> in the extra i_size test. It may make the race harder to hit, but it
> doesn't eliminate it. Instead in means that we'll have to add a bunch of
> extra code into the fast path just to ensure that it is uptodate.
> 
> The reason that the i_size check that is already in
> do_generic_file_read() is ok is that there is a locked page at the time
> of the i_size_read call, as per the comment:
> 
>                 /*
>                  * i_size must be checked after we know the page is Uptodate.
>                  *
>                  * Checking i_size after the check allows us to calculate
>                  * the correct value for "nr", which means the zero-filled
>                  * part of the page is not copied back to userspace (unless
>                  * another truncate extends the file - this is desired though).
>                  */
  Note that the page is *not* locked at the time of the i_size check. And
I'm aware of the comment - however that seems to talk about a case where
truncate() makes the file smaller. If we checked i_size only before having
the page uptodate, we could see the i_size value from before the truncate
and thus return to userspace data that is already zeroed out by
block_truncate_page() or its equivalent. Actually, the race still seems to
be there, as we could read i_size before the truncate begins, and if we
take a long enough nap just after that, file_read_actor() could copy data
already zeroed out by block_truncate_page(). That just doesn't seem likely
to happen in practice.

So you are right that we might be better off holding the page lock during
the i_size check and the copy to userspace. However that isn't really
possible, as it could deadlock - consider an evil program that does

buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
read(fd, buf, 4096);

In that case file_read_actor() will hit a page fault when copying data to
userspace, and to satisfy the page fault we would need the very same page
lock that we would be holding over the i_size check. This is the reason
why the write path goes through the hoops of ->write_begin, ->write_end,
and prefaulting pages.
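
(For illustration, a minimal, self-contained version of the kind of "evil
program" described above; hypothetical, with a 4096-byte page size
assumed:)

  #include <fcntl.h>
  #include <stdio.h>
  #include <sys/mman.h>
  #include <unistd.h>

  int main(int argc, char *argv[])
  {
  	char *buf;
  	int fd;

  	if (argc < 2) {
  		fprintf(stderr, "usage: %s <file>\n", argv[0]);
  		return 1;
  	}
  	fd = open(argv[1], O_RDWR);
  	if (fd < 0) {
  		perror("open");
  		return 1;
  	}
  	/* map the first page of the very file we are about to read */
  	buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  	if (buf == MAP_FAILED) {
  		perror("mmap");
  		return 1;
  	}
  	/* copying into buf faults in the same page that the read targets;
  	 * if the read path held that page's lock across the copy to
  	 * userspace, this would deadlock */
  	if (read(fd, buf, 4096) < 0)
  		perror("read");
  	return 0;
  }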

> > Now we could check i_size in ->readpage / ->readpages but conceptually that
> > seems as a wrong thing to do since readpage only cares about buffers being
> > mapped. OTOH generic_file_aio_read() et al already handles i_size checks.
> > And GFS2 seems to be the only fs that has problems with stale i_size (OCFS2
> > and NFS both revalidate it in their \.aio_read callbacks). So I'd be
> > happier if we could handle this inside GFS2...
> > 
> I would argue that GFS2 is doing this the right way in this case though
> - it makes no sense to put the i_size check into the fast path, when it
> is only required in the case that there is no cached page (i.e the slow
> path). In addition, putting it in the slow path (where the fs can take
> its own locks) also fixes the race, rather than just making it more
> difficult to hit.
  I disagree here: the i_size check is necessary regardless of whether the
page is uptodate or not. For local filesystems truncate can happen
completely independently of the read...

> So far as I can tell the VFS code was originally intended to work in
> exactly this way, however it is not that obvious that this is the case,
> maybe better documentation is required in this case.
  Originally, the VFS never thought about clustered filesystems, so i_size
was always uptodate. We only had to take care of truncate changing i_size
while a read was running. And to mitigate problems with that we needed to
check i_size after the page is uptodate.

> If we did add in some GFS2 code to update i_size ahead of
> generic_file_aio_read() then we can only do that by grabbing and
> dropping the glock. That is a relatively expensive operation, and still
> doesn't fix the race as we must drop the glock before calling
> generic_file_aio_read() since we must not hold glocks over the copy to
> user stage.
  Is it that expensive even if you already have the glock cached? Because
if you have a cached page, you have to have a cached glock as well, if I
understand the GFS2 design right.

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] vfs: fix a bug when we do some dio reads with append dio writes
  2013-12-17 12:17         ` Steven Whitehouse
@ 2013-12-17 16:41           ` Zheng Liu
  2013-12-19 12:27             ` Steven Whitehouse
  0 siblings, 1 reply; 21+ messages in thread
From: Zheng Liu @ 2013-12-17 16:41 UTC (permalink / raw)
  To: Steven Whitehouse
  Cc: Jan Kara, linux-fsdevel, Christoph Hellwig, Dmitry Monakhov,
	Dave Chinner, Alexander Viro, Zheng Liu

On Tue, Dec 17, 2013 at 12:17:44PM +0000, Steven Whitehouse wrote:
> Hi,
> 
> On Tue, 2013-12-17 at 19:16 +0800, Zheng Liu wrote:
> > Hi Steven,
> > 
> > On Tue, Dec 17, 2013 at 09:43:42AM +0000, Steven Whitehouse wrote:
> > > Hi,
> > > 
> > > On Mon, 2013-12-16 at 11:01 +0100, Jan Kara wrote:
> > > > On Mon 16-12-13 09:37:34, Steven Whitehouse wrote:
> > > > > Hi,
> > > > > 
> > > > > On Sat, 2013-12-07 at 18:55 +0800, Zheng Liu wrote:
> > > > > > From: Zheng Liu <wenqing.lz@taobao.com>
> > > > > > 
> > > > > > Currently we check i_size in buffered read path after we know the page
> > > > > > is update.  But it could return some zero-filled pages to the userspace
> > > > > > when we do some append dio writes.  We could use the following code
> > > > > > snippet to reproduce this problem in a ext2/3/4 filesystem.
> > > > > > 
> > > > > If the page is not uptodate, then neither (potentially) is i_size, since
> > > > > the underlying fs has not had a chance to get its own locks and update
> > > > > the inode size.
> > > >   Hum, this isn't really about page being uptodate or not, is it? It is
> > > > more about the fact that gfs2 updates i_size only from gfs2_readpage()
> > > > function when obtaining inode lock. Or do you know about other filesystem
> > > > having the same problem as gfs2? E.g. ocfs2 updates i_size from
> > > > ocfs2_file_aio_read() before calling generic_file_aio_read(), cannot gfs2
> > > > do something similar?
> > > > 
> > > Well we could do that, but I don't think it makes sense really. The
> > > point of having the page cache is to put it "in front" of the expensive
> > > operations where possible (such as getting cluster locks). I don't think
> > > it makes sense to grab and drop a cluster lock on every call to
> > > generic_file_aio_read() when, for cached pages, we would not even need
> > > to call ->readpage.
> > > 
> > > The problem appears to be that somehow we are getting a page returned
> > > when it should not be. So I suspect that the correct place to catch that
> > > is in ->readpage(s)
> > > 
> > > > > I suspect that the correct fix would be to implement ->launder_page to
> > > > > avoid the race that you've identified here, if I've understood it
> > > > > correctly,
> > > >   I don't understand how that would work. Can you elaborate a bit? Here the
> > > > problem is i_size gets extended by DIO write during buffered read (which is
> > > > a fallback from DIO read) so we return zero-filled page instead of either
> > > > data filled page or short read. I don't see where launder_page() would come
> > > > into the picture...
> > > > 
> > > > 								Honza
> > > 
> > > Ah, sorry. I was thinking the other way around wrt to
> > > read/write :( However, I still don't think that generic_file_aio_read()
> > > is the right place to fix this. I note that XFS seems to pass the test
> > > and it also uses mpage_readpage and mpage_readpages as well as
> > > generic_file_aio_read() so maybe that is a clue as to where the answer
> > > lies. GFS2 seems to fail the test in the same way as ext3 at the moment.
> > 
> > Yes, xfs can pass the test under direct IO.  I suspect that the reason
> > is that xfs uses ilock/iolock.  But it doesn't means that all file
> > systems need to fix this issue by themselves because the root cause is
> > in vfs layer.
> >
> I'm not convinced that this is the case though. The VFS is asking the fs
> to provide a page via ->readpage(s) and therefore the fs should not be
> providing a page (presumably marked uptodate in this case) where one
> does not already exist, and if it does exist, then the content of the
> page should be correct.

As far as I understand, ->readpage(s) should return zero-filled pages if
the page doesn't have a block mapping, because the content should be '0'.

> 
> I don't see how this test can be done at a point before the fs has a
> chance to get its own locks, and thus ensuring that the inode size is
> actually correct.
> 
> However, I'd like to understand what is going on at ->readpage level,
> and thus why we get this page generated incorrectly, and what actual i/o
> gets generated (or not) in the problematic case.
> 
> > > 
> > > The other question that we've not really answered is why it is useful to
> > > mix buffered and direct i/o to the same file at the same time anyway.
> > > While I agree that it ought to work, I'm not convinced that its
> > > enormously useful, which is maybe why it has not been spotted before,
> > 
> > Yes, mixed buffered read and direct write seems to be useless.  But in
> > our product system, the issue will appear under direct read/write.  The
> > application has a reader and a writer.  The reader should read nothing
> > or the content that has been written by writer.  But now the reader gets
> > the zero-filled page.  Hence the application needs to prevent this issue
> > using a mutex or other flag.  It doesn't make sense to the userspace
> > programmer.
> > 
> I'm not sure I fully understand what you are saying here... if you see
> the same issue using only O_DIRECT reads and writes, how are you getting
> that?

Sorry, maybe I didn't describe the test program clearly.  In the commit
log the test program does direct reads/writes.  I found this problem
because the app developer asked me why his application doesn't work well
under direct read/write.

> 
> > TBH, we found this issue two years ago but we didn't take care of it.
> > So currently the application just solves it as I said above.  But it has
> > to issue a flush via fsync(2) to flush the dirty pages in order to make
> > sure the data has been flushed to disk.  Obviously, the performance
> > of this application is impacted by this issue.  Now we know that we
> > don't need to flush the dirty pages, but I think we'd better fix it.
> > 
> > Regards,
> >                                                 - Zheng
> 
> Calling fsync is a separate issue though, since that is required anyway
> if you want to be sure that the data has actually reached the disk,

Yes, calling fsync is another issue.  I just want to say that this issue
has impacted the application, at least in our production system.

Thanks,
                                                - Zheng

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] vfs: fix a bug when we do some dio reads with append dio writes
  2013-12-17 16:41           ` Zheng Liu
@ 2013-12-19 12:27             ` Steven Whitehouse
  2013-12-19 22:44               ` Dave Chinner
  0 siblings, 1 reply; 21+ messages in thread
From: Steven Whitehouse @ 2013-12-19 12:27 UTC (permalink / raw)
  To: Zheng Liu
  Cc: Jan Kara, linux-fsdevel, Christoph Hellwig, Dmitry Monakhov,
	Dave Chinner, Alexander Viro, Zheng Liu

[-- Attachment #1: Type: text/plain, Size: 2177 bytes --]

Hi,

Sorry for the delay... this has turned out to be a somewhat more
complicated investigation than I'd first expected. In fact there are
still a few things I don't yet understand, however I thought I'd let you
know how I'm getting on in the mean time.

So I started looking at GFS2, since that's what I'm most familiar with.
I've found a couple of bugs which I'm about to post patches for. Even
with those patches GFS2 doesn't pass the test all the time, although it
does get through it some of the time, and it does last longer than ext4.

Since I wondered whether I was just lucky running the test against XFS,
I've run it again several times, and I still have not seen a single
failure on XFS.

In order to gain a bit more information about the problem, I added a few
more things to the printf, and in particular I note that under GFS2 I
see ret (the amount of data read) at various different sizes. On ext4,
ret is always the full 1M buffer size. So I assume that is the
difference which your patch was intended to cure.

However, I also printed out j, the offset where the first error occurs,
and in both the ext4 and gfs2 cases, that offset is 0, and after exactly
4096 bytes, there is an 'a'. I'd have expected to see a number of pages
of 'a' followed by zero pages, but instead, I'm seeing a single zero
page followed by at least one 'a'. I've not extended my instrumentation
to print out a list of which pages are zero and which 'a', but that is
an unexpected result.

Some tracing shows that with the additional GFS2 patches, the data does
get written to disk correctly, ahead of the read which is issued. Also,
since the test does allocating writes, GFS2 will fall back to buffered
I/O for that, so only the read is direct I/O. Since we see similar
results in the GFS2 and ext4 cases, this missing first page, which is
common to both, looks like it might be related to the read side of
things.

I'm attaching my updated version of your test program, which I'd like to
add to our test suite in due course, if you have no objections.

So far I'm not sure what causes the missing first page issue though...
I'll keep looking in the mean time,

Steve.


[-- Attachment #2: dio-append.c --]
[-- Type: text/x-csrc, Size: 2558 bytes --]

#define _GNU_SOURCE

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <memory.h>

#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <errno.h>

#include <pthread.h>

#define BUF_ALIGN	1024

struct writer_data {
	int fd;
	size_t blksize;
	char *buf;
};

static void *writer(void *arg)
{
	struct writer_data *data = (struct writer_data *)arg;
	int ret;

	ret = write(data->fd, data->buf, data->blksize);
	if (ret < 0)
		fprintf(stderr, "write file failed: %s\n", strerror(errno));

	return NULL;
}

int main(int argc, char *argv[])
{
	pthread_t tid;
	struct writer_data wdata;
	size_t max_blocks = 10 * 1024;
	size_t blksize = 1 * 1024 * 1024;
	char *rbuf, *wbuf;
	int readfd, writefd;
	int i, j;
	int k;

	if (argc < 2) {
		fprintf(stderr, "usage: %s [filename]\n", argv[0]);
		exit(1);
	}

	writefd = open(argv[1], O_CREAT|O_DIRECT|O_WRONLY|O_APPEND|O_TRUNC, S_IRWXU);
	if (writefd < 0) {
		fprintf(stderr, "failed to open wfile: %s\n", strerror(errno));
		exit(1);
	}
	readfd = open(argv[1], O_DIRECT|O_RDONLY, S_IRWXU);
	if (readfd < 0) {
		fprintf(stderr, "failed to open rfile: %s\n", strerror(errno));
		exit(1);
	}

	if (posix_memalign((void **)&wbuf, BUF_ALIGN, blksize)) {
		fprintf(stderr, "failed to alloc memory: %s\n", strerror(errno));
		exit(1);
	}

	if (posix_memalign((void **)&rbuf, 4096, blksize)) {
		fprintf(stderr, "failed to alloc memory: %s\n", strerror(errno));
		exit(1);
	}

	memset(wbuf, 'a', blksize);

	wdata.fd = writefd;
	wdata.blksize = blksize;
	wdata.buf = wbuf;

	for (i = 0; i < max_blocks; i++) {
		void *retval;
		int ret;

		ret = pthread_create(&tid, NULL, writer, &wdata);
		if (ret) {
			fprintf(stderr, "create thread failed: %s\n", strerror(errno));
			exit(1);
		}

		memset(rbuf, 'b', blksize);
		do {
			ret = pread(readfd, rbuf, blksize, i * blksize);
		} while (ret <= 0);

		if (ret < 0) {
			fprintf(stderr, "read file failed: %s\n", strerror(errno));
			exit(1);
		}

		if (pthread_join(tid, &retval)) {
			fprintf(stderr, "pthread join failed: %s\n", strerror(errno));
			exit(1);
		}

		if (ret >= 0) {
			for (j = 0; j < ret; j++) {
				if (rbuf[j] != 'a') {
					char c = rbuf[j];
					fprintf(stderr, "encounter an error: offset %d, j=%d, ret=%d rbuf[j]=0x%02x\n",
						i, j, ret, (unsigned char)c);
					for (k = j; k < ret; k++) {
						if (rbuf[k] != c)
							break;
					}
					/* guard against reading past the bytes returned by pread */
					fprintf(stderr, "0x%02x continues for %d bytes, next=0x%02x\n",
						(unsigned char)c, k - j,
						k < ret ? (unsigned char)rbuf[k] : 0);
					goto err;
				}
			}
		}
	}

err:
	free(wbuf);
	free(rbuf);

	return 0;
}

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] vfs: fix a bug when we do some dio reads with append dio writes
  2013-12-19 12:27             ` Steven Whitehouse
@ 2013-12-19 22:44               ` Dave Chinner
  2013-12-20  9:28                 ` Steven Whitehouse
  0 siblings, 1 reply; 21+ messages in thread
From: Dave Chinner @ 2013-12-19 22:44 UTC (permalink / raw)
  To: Steven Whitehouse
  Cc: Zheng Liu, Jan Kara, linux-fsdevel, Christoph Hellwig,
	Dmitry Monakhov, Alexander Viro, Zheng Liu

On Thu, Dec 19, 2013 at 12:27:53PM +0000, Steven Whitehouse wrote:
> Hi,
> 
> Sorry for the delay... this has turned out to be a somewhat more
> complicated investigation than I'd first expected. In fact there are
> still a few things I don't yet understand, however I thought I'd let you
> know how I'm getting on in the mean time.
> 
> So I started looking at GFS2, since thats what I'm most familiar with.
> I've found a couple of bugs which I'm about to post patches for,
> although even with those patches GFS2 doesn't pass the test all the
> time, although it does get through the test some of the time, and it
> does last longer than ext4.
> 
> Since I wondered whether I was just lucky running the test against XFS,
> I've run it again several times, and I still have not seen a single
> failure on XFS.

IF this is a failure due to a buffered IO fallback from the direct
IO path, then XFS will never fail because it doesn't ever fall back
to buffered IO. i.e. XFS *always* does direct IO.

Looking at the test code, the appending direct IO write is
effectively a single 1MB IO (i.e. one atomic i_size update after it
completes). Hence if the filesystem doesn't fall back to buffered IO
for appending writes, then the direct IO reads should never read
data between the old EOF and the new EOF until after the new EOF is
reflected in i_size.

> In order to gain a bit more information about the problem, I added a few
> more things to the printf, and in particular I note that under GFS2 I
> see ret (the amount of data read) at various different sizes.

That implies multiple i_size updates during the write, which implies
buffered IO for the writes.

> On ext4,
> ret is always the full 1M buffer size. So I assume that is the
> difference which your patch was intended to cure.
>
> However, I also printed out j, the offset where the first error occurs,
> and in both the ext4 and gfs2 cases, that offset is 0, and after exactly
> 4096 bytes, there is an 'a'. I'd have expected to see a number of pages
> of 'a' followed by zero pages, but instead, I'm seeing a single zero
> page followed by at least one 'a'. I've not extended my instrumentation
> to print out a list of which pages are zero and which 'a', but that is
> an unexpected result.

i.e. the write is being done page by page rather than in chunks
limited by the size of a bio. Again, that implies buffered
writes, not direct IO writes.

> Some tracing shows that with the additional GFS2 patches, the data does
> get written to disk correctly, ahead of the read which is issued. Also
> since the test does allocating writes, GFS2 will fall back to buffered
> I/O for that, and only the read is direct I/O, so since we see similar
> results in the GFS2 and ext4 cases, this missing first page which is
> common to both looks like it might be related to the read side of
> things.

Ok, buffered writes explain all those symptoms, and that means what
you are seeing is a buffered write/direct IO read race condition.
If the first page is missing data from the direct IO read, that
implies that the inode size has been updated by the buffered write
path, but a direct IO read of zeroes means the data in the page
cache has not been flushed to disk. Given that the direct IO read
path does a filemap_write_and_wait_range() call before issuing the
direct IO, that implies the first page was not flushed to disk by
the cache flush.

I'm not sure why that may be the case, but that's where I'd
start looking.....

> I'm attaching my updated version of your test program, which I'd like to
> add to our test suite in due course, if you have no objections.

IIRC, there's a patch to add it to xfstests.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] vfs: fix a bug when we do some dio reads with append dio writes
  2013-12-19 22:44               ` Dave Chinner
@ 2013-12-20  9:28                 ` Steven Whitehouse
  2013-12-23  3:00                   ` Dave Chinner
  0 siblings, 1 reply; 21+ messages in thread
From: Steven Whitehouse @ 2013-12-20  9:28 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Zheng Liu, Jan Kara, linux-fsdevel, Christoph Hellwig,
	Dmitry Monakhov, Alexander Viro, Zheng Liu, Lukas Czerner

Hi,

On Fri, 2013-12-20 at 09:44 +1100, Dave Chinner wrote:
> On Thu, Dec 19, 2013 at 12:27:53PM +0000, Steven Whitehouse wrote:
> > Hi,
> > 
> > Sorry for the delay... this has turned out to be a somewhat more
> > complicated investigation than I'd first expected. In fact there are
> > still a few things I don't yet understand, however I thought I'd let you
> > know how I'm getting on in the mean time.
> > 
> > So I started looking at GFS2, since thats what I'm most familiar with.
> > I've found a couple of bugs which I'm about to post patches for,
> > although even with those patches GFS2 doesn't pass the test all the
> > time, although it does get through the test some of the time, and it
> > does last longer than ext4.
> > 
> > Since I wondered whether I was just lucky running the test against XFS,
> > I've run it again several times, and I still have not seen a single
> > failure on XFS.
> 
> IF this is a failure due to a buffered IO fallback from the direct
> IO path, then XFS will never fail because it doesn't ever fall back
> to buffered IO. i.e. XFS *always* does direct IO.
> 
> Looking at the test code, the appending direct IO write is
> effectively a single 1MB IO (i.e. one atomic i_size update after it
> completes). Hence if the filesystem doesn't fall back to buffered IO
> for appending writes, then the direct IO reads should never read
> data between the old EOF and the new EOF until after the new EOF is
> reflected in i_size.
> 
Well GFS2 does fall back, not so sure about ext4.

> > In order to gain a bit more information about the problem, I added a few
> > more things to the printf, and in particular I note that under GFS2 I
> > see ret (the amount of data read) at various different sizes.
> 
> That implies multiple i_size updates during the write, which implies
> buffered IO for the writes.
> 
Yes, that's exactly what I'd expect to see. I've been tracing what's going
on using a mixture of trace_printk and existing trace points.

> > On ext4,
> > ret is always the full 1M buffer size. So I assume that is the
> > difference which your patch was intended to cure.
> >
> > However, I also printed out j, the offset where the first error occurs,
> > and in both the ext4 and gfs2 cases, that offset is 0, and after exactly
> > 4096 bytes, there is an 'a'. I'd have expected to see a number of pages
> > of 'a' followed by zero pages, but instead, I'm seeing a single zero
> > page followed by at least one 'a'. I've not extended my instrumentation
> > to print out a list of which pages are zero and which 'a', but that is
> > an unexpected result.
> 
> i.e. the write is being done page by page rather than in chunks
> limited by the size of a bio. Again, that implies that buffered
> writes, not direct IO writes.
> 
> > Some tracing shows that with the additional GFS2 patches, the data does
> > get written to disk correctly, ahead of the read which is issued. Also
> > since the test does allocating writes, GFS2 will fall back to buffered
> > I/O for that, and only the read is direct I/O, so since we see similar
> > results in the GFS2 and ext4 cases, this missing first page which is
> > common to both looks like it might be related to the read side of
> > things.
> 
> Ok, buffered writes explain all those symptoms, and that means what
> you are seeing is a buffered write/direct IO read race condition.
> If the first page is missing data from the direct IO read, that
> implies that the inode size has been updated by the buffered write
> path, but a direct Io read of zeroes means the data in the page
> cache has not been flushed to disk. Given that the direct IO read
> path does a filemap_write_and_wait_range() call before issuing the
> direct IO, that implies the first page was not flushed to disk by
> the cache flush.
> 
> I'm not sure why that may be the case, but that's where I'd
> start looking.....
> 
Indeed - I've just spent the last couple of days looking at exactly
this :-) I'm shortly going to lose access to the machine I've been doing
tests on until the New Year, so progress may slow down for a little
while on my side.

However there is nothing obvious in the trace. It looks like the write
I/O gets pushed out in two chunks, the earlier one containing the "first
page" missing block along with other blocks, and the second one is
pushed out by the filemap_write_and_wait_range added in the patch I
posted yesterday. Both have completed before we send out the read
(O_DIRECT) which results in a single I/O to disk - again exactly what
I'd expect to see. There are two calls to bmap for the O_DIRECT read,
the first one returns short (correctly since we hit EOF) and the second
one returns unmapped since it is a request to map the remainder of the
file, which is beyond EOF. That all looks correct to me, and I can't see
a difference between the working and broken cases.

I did also try the obvious experiment of changing the
filemap_write_and_wait_range into a plain filemap_write_and_wait and
that made no difference.

I also checked that after the error occurs, the file does end up
correctly written, with all bytes set to 'a'. So it is definitely a race
somewhere, rather than a permanent condition.

I think I may also have interested Lukas Czerner in taking a look from
the ext4 perspective.

> > I'm attaching my updated version of your test program, which I'd like to
> > add to our test suite in due course, if you have no objections.
> 
> IIRC, there's a patch to add it to xfstests.
> 
> Cheers,
> 
> Dave.

Ok, in which case we can easily use it in xfstests. Thanks for the heads
up,

Steve.




^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] vfs: fix a bug when we do some dio reads with append dio writes
  2013-12-20  9:28                 ` Steven Whitehouse
@ 2013-12-23  3:00                   ` Dave Chinner
  2014-01-14 15:22                     ` Steven Whitehouse
  0 siblings, 1 reply; 21+ messages in thread
From: Dave Chinner @ 2013-12-23  3:00 UTC (permalink / raw)
  To: Steven Whitehouse
  Cc: Zheng Liu, Jan Kara, linux-fsdevel, Christoph Hellwig,
	Dmitry Monakhov, Alexander Viro, Zheng Liu, Lukas Czerner

On Fri, Dec 20, 2013 at 09:28:44AM +0000, Steven Whitehouse wrote:
> On Fri, 2013-12-20 at 09:44 +1100, Dave Chinner wrote:

[snip]

> > Ok, buffered writes explain all those symptoms, and that means what
> > you are seeing is a buffered write/direct IO read race condition.
> > If the first page is missing data from the direct IO read, that
> > implies that the inode size has been updated by the buffered write
> > path, but a direct Io read of zeroes means the data in the page
> > cache has not been flushed to disk. Given that the direct IO read
> > path does a filemap_write_and_wait_range() call before issuing the
> > direct IO, that implies the first page was not flushed to disk by
> > the cache flush.
> > 
> > I'm not sure why that may be the case, but that's where I'd
> > start looking.....
> > 
> Indeed - I've just spent the last couple of days looking at exactly
> this :-) I'm shortly going to lose access to the machine I've been doing
> tests on until the New Year, so progress may slow down for a little
> while on my side.
> 
> However there is nothing obvious in the trace. It looks like the write
> I/O gets pushed out in two chunks, the earlier one containing the "first
> page" missing block along with other blocks, and the second one is
> pushed out by the filemap_write_and_wait_range added in the patch I
> posted yesterday. Both have completed before we send out the read
> (O_DIRECT) which results in a single I/O to disk - again exactly what
> I'd expect to see. There are two calls to bmap for the O_DIRECT read,
> the first one returns short (correctly since we hit EOF) and the second
> one returns unmapped since it is a request to map the remainder of the
> file, which is beyond EOF.

Oh, interesting that there are two bmap calls for the read. Have a
look at do_direct_IO(), specifically the case where we hit the
buffer_unmapped/do_holes code:

do_holes:
			/* Handle holes */
			if (!buffer_mapped(map_bh)) {
				loff_t i_size_aligned;

				/* AKPM: eargh, -ENOTBLK is a hack */
				if (dio->rw & WRITE) {
					page_cache_release(page);
					return -ENOTBLK;
				}

				/*
				 * Be sure to account for a partial block as the
				 * last block in the file
				 */
				i_size_aligned = ALIGN(i_size_read(dio->inode),
							1 << blkbits);
				if (sdio->block_in_file >=
						i_size_aligned >> blkbits) {
					/* We hit eof */
					page_cache_release(page);
					goto out;
				}
				zero_user(page, block_in_page << blkbits,
						1 << blkbits);
				sdio->block_in_file++;
				block_in_page++;
				goto next_block;
			}

So, if we hit a hole, we read the inode size to determine if we are
beyond EOF. If we are beyond EOF, we're done. If not, the block gets
zeroed, and we move on to the next block.

Now, GFS2 does not set DIO_LOCKING, so the direct IO code is not
holding the i_mutex across the read, which means it is probably
racing with the buffered write (I'm not sure what locking gfs2 does
here). Hence if the buffered write completes a page and updates the
inode size between the point in time that the direct IO read mapped
a hole and then checked the EOF, it will zero the page rather than
returning a short read, and then move on to reading the next
block. If the next block is still in the hole but is beyond EOF it
will then abort, returning a short read of a zeroed page....
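
Spelling that window out as a timeline (just a rough sketch of what I'm
describing above, assuming the appending write has fallen back to a
buffered write as it does on GFS2):

/*
 * direct IO read (do_direct_IO)       appending write (buffered fallback)
 *
 * get_block() maps a hole
 *                                     page completed, i_size extended
 * i_size_read(): block now < EOF
 * zero_user() zeroes the block
 * next block: still a hole, but
 * beyond EOF -> goto out
 * => short read of a zeroed page
 */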

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] vfs: fix a bug when we do some dio reads with append dio writes
  2013-12-23  3:00                   ` Dave Chinner
@ 2014-01-14 15:22                     ` Steven Whitehouse
  2014-01-14 19:19                       ` Jan Kara
  0 siblings, 1 reply; 21+ messages in thread
From: Steven Whitehouse @ 2014-01-14 15:22 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Zheng Liu, Jan Kara, linux-fsdevel, Christoph Hellwig,
	Dmitry Monakhov, Alexander Viro, Zheng Liu, Lukas Czerner

Hi,

On Mon, 2013-12-23 at 14:00 +1100, Dave Chinner wrote:
> On Fri, Dec 20, 2013 at 09:28:44AM +0000, Steven Whitehouse wrote:
> > On Fri, 2013-12-20 at 09:44 +1100, Dave Chinner wrote:
> 
> [snip]
> 
> > > Ok, buffered writes explain all those symptoms, and that means what
> > > you are seeing is a buffered write/direct IO read race condition.
> > > If the first page is missing data from the direct IO read, that
> > > implies that the inode size has been updated by the buffered write
> > > path, but a direct Io read of zeroes means the data in the page
> > > cache has not been flushed to disk. Given that the direct IO read
> > > path does a filemap_write_and_wait_range() call before issuing the
> > > direct IO, that implies the first page was not flushed to disk by
> > > the cache flush.
> > > 
> > > I'm not sure why that may be the case, but that's where I'd
> > > start looking.....
> > > 
> > Indeed - I've just spent the last couple of days looking at exactly
> > this :-) I'm shortly going to lose access to the machine I've been doing
> > tests on until the New Year, so progress may slow down for a little
> > while on my side.
> > 
> > However there is nothing obvious in the trace. It looks like the write
> > I/O gets pushed out in two chunks, the earlier one containing the "first
> > page" missing block along with other blocks, and the second one is
> > pushed out by the filemap_write_and_wait_range added in the patch I
> > posted yesterday. Both have completed before we send out the read
> > (O_DIRECT) which results in a single I/O to disk - again exactly what
> > I'd expect to see. There are two calls to bmap for the O_DIRECT read,
> > the first one returns short (correctly since we hit EOF) and the second
> > one returns unmapped since it is a request to map the remainder of the
> > file, which is beyond EOF.
> 
> Oh, interesting that there are two bmap calls for the read. Have a
> look at do_direct_IO(), specifically the case where we hit the
> buffer_unmapped/do_holes code:
> 
[snip]
> So, if we hit a hole, we read the inode size to determine if we are
> beyond EOF. If we are beyond EOF, we're done. If not, the block gets
> zeroed, and we move on to the nect block.
> 
> Now, GFS2 does not set DIO_LOCKING, so the direct IO code is not
> holding the i_mutex across the read, which means it is probably
> racing with the buffered write (I'm not sure what locking gfs2 does
> here). Hence if the buffered write completes a page and updates the
> inode size between the point in time that the direct IO read mapped
> a hole and then checked the EOF, it will zero the page rather than
> returning a short read, and then move on to reading the next
> block. If the next block is still in the hole but is beyond EOF it
> will then abort, returning a short read of a zeroed page....
> 
> Cheers,
> 
> Dave.

Sorry for the delay - I have been working on this and I've now figured
out what's really going on here. I'll describe the GFS2 case first, but
it seems that it is something that will affect other fs too, most likely
including local filesystems.

So we have one thread doing writes to a file, 1M at a time, using
O_DIRECT. Now with GFS2 that results in more or less a fallback to a
buffered write - there is one important difference however, and that is
that the direct i/o page invalidations still occur, just as if it were a
direct i/o write.

Thread two is doing direct i/o reads. When this results in calling
->direct_IO, all is well. GFS2 does all the correct locking to ensure
that this is coherent with the buffered writes. There is a problem
though - in generic_file_aio_read() we have:

       /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
        if (filp->f_flags & O_DIRECT) {
                loff_t size;
                struct address_space *mapping;
                struct inode *inode;

                mapping = filp->f_mapping;
                inode = mapping->host;
                if (!count)
                        goto out; /* skip atime */
                size = i_size_read(inode);
                if (pos < size) {
                        retval = filemap_write_and_wait_range(mapping, pos,
                                        pos + iov_length(iov, nr_segs) - 1);
                        if (!retval) {
                                retval = mapping->a_ops->direct_IO(READ, iocb,
                                                        iov, pos, nr_segs);
                        }

The important thing here is that it is checking the i_size with no
locking! This means that it can check i_size, find it's the same as the
requested read position (if the read gets in first, before the next block
is written to the file) and then fall back to a buffered read.

That buffered read then gets a page which is invalidated by the page
invalidation after the (DIO, but fallen back to buffered) write has
occurred. I added some trace_printk() calls to the read path and it is
fairly clear that is the path that's being taken when the failure occurs.
Also, it explains why, despite the disk having been used and reused many
times, the page that is returned from the failure is always 0, so it's
not a page that's been read from disk. It also explains why I've only
ever seen this for the first page of the read, and the rest of the read
is ok.
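
To spell out the ordering (a rough sketch only - the exact page cache
state transitions are glossed over here):

/*
 * O_DIRECT read                        O_DIRECT write (buffered fallback)
 *
 * size = i_size_read(): pos >= size,
 * so ->direct_IO is never called
 *                                      data copied into the page cache,
 *                                      i_size extended past pos
 * fall through to buffered read
 *                                      direct-i/o style page invalidation
 * the page the read picked up is
 * invalidated under it; what comes
 * back to userspace is all zeroes
 */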

Changing the test "if (pos < size) {" to "if (1) {" results in zero
failures from the test program, since it takes the ->direct_IO path each
time which then checks the i_size under the glock, in a race free
manner.

So far as I can tell, this can be just as much a problem for local
filesystems, since they may update i_size at any time too. I note that
XFS appears to wrap this bit of code under its iolock, so that probably
explains why XFS doesn't suffer from this issue.

So what is the correct fix... we have several options I think:

 a) Ignore it on the basis that it's not important for normal workloads
 b) Simply remove the "if (pos < size) {" test and leave the fs to get
on with it (I know that leaves the btrfs test which also uses i_size
later on, but that doesn't appear to be an issue since it's after we've
called ->direct_IO). Is it an issue that this may call an extra
filemap_write_and_wait_range() in some cases where it didn't before? I
suspect not since the extra calls would be for cases where the pages
were beyond the end of file anyway (according to i_size), so a no-op in
most cases.
 c) Try to prevent race between page invalidation in dio write path and
buffered read somehow (how?)
 d) any other solutions?

I'm leaning towards (b) being the best option at the moment, since it
seems simple to do and looks to have no obvious unwanted side effects,

Steve.




^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] vfs: fix a bug when we do some dio reads with append dio writes
  2014-01-14 15:22                     ` Steven Whitehouse
@ 2014-01-14 19:19                       ` Jan Kara
  2014-01-15  7:19                         ` Zheng Liu
  0 siblings, 1 reply; 21+ messages in thread
From: Jan Kara @ 2014-01-14 19:19 UTC (permalink / raw)
  To: Steven Whitehouse
  Cc: Dave Chinner, Zheng Liu, Jan Kara, linux-fsdevel,
	Christoph Hellwig, Dmitry Monakhov, Alexander Viro, Zheng Liu,
	Lukas Czerner

  Hello,

On Tue 14-01-14 15:22:13, Steven Whitehouse wrote:
> On Mon, 2013-12-23 at 14:00 +1100, Dave Chinner wrote:
> > On Fri, Dec 20, 2013 at 09:28:44AM +0000, Steven Whitehouse wrote:
> > > On Fri, 2013-12-20 at 09:44 +1100, Dave Chinner wrote:
> > 
> > [snip]
> > 
> > > > Ok, buffered writes explain all those symptoms, and that means what
> > > > you are seeing is a buffered write/direct IO read race condition.
> > > > If the first page is missing data from the direct IO read, that
> > > > implies that the inode size has been updated by the buffered write
> > > > path, but a direct Io read of zeroes means the data in the page
> > > > cache has not been flushed to disk. Given that the direct IO read
> > > > path does a filemap_write_and_wait_range() call before issuing the
> > > > direct IO, that implies the first page was not flushed to disk by
> > > > the cache flush.
> > > > 
> > > > I'm not sure why that may be the case, but that's where I'd
> > > > start looking.....
> > > > 
> > > Indeed - I've just spent the last couple of days looking at exactly
> > > this :-) I'm shortly going to lose access to the machine I've been doing
> > > tests on until the New Year, so progress may slow down for a little
> > > while on my side.
> > > 
> > > However there is nothing obvious in the trace. It looks like the write
> > > I/O gets pushed out in two chunks, the earlier one containing the "first
> > > page" missing block along with other blocks, and the second one is
> > > pushed out by the filemap_write_and_wait_range added in the patch I
> > > posted yesterday. Both have completed before we send out the read
> > > (O_DIRECT) which results in a single I/O to disk - again exactly what
> > > I'd expect to see. There are two calls to bmap for the O_DIRECT read,
> > > the first one returns short (correctly since we hit EOF) and the second
> > > one returns unmapped since it is a request to map the remainder of the
> > > file, which is beyond EOF.
> > 
> > Oh, interesting that there are two bmap calls for the read. Have a
> > look at do_direct_IO(), specifically the case where we hit the
> > buffer_unmapped/do_holes code:
> > 
> [snip]
> > So, if we hit a hole, we read the inode size to determine if we are
> > beyond EOF. If we are beyond EOF, we're done. If not, the block gets
> > zeroed, and we move on to the nect block.
> > 
> > Now, GFS2 does not set DIO_LOCKING, so the direct IO code is not
> > holding the i_mutex across the read, which means it is probably
> > racing with the buffered write (I'm not sure what locking gfs2 does
> > here). Hence if the buffered write completes a page and updates the
> > inode size between the point in time that the direct IO read mapped
> > a hole and then checked the EOF, it will zero the page rather than
> > returning a short read, and then move on to reading the next
> > block. If the next block is still in the hole but is beyond EOF it
> > will then abort, returning a short read of a zeroed page....
> > 
> > Cheers,
> > 
> > Dave.
> 
> Sorry for the delay - I have been working on this and I've now figured
> out whats really going on here. I'll describe the GFS2 case first, but
> it seems that it is something that will affect other fs too, including
> local fs most likely.
> 
> So we have one thread doing writes to a file, 1M at a time, using
> O_DIRECT. Now with GFS2 that results in more or less a fallback to a
> buffered write - there is one important difference however, and that is
> that the direct i/o page invalidations still occur, just as if it were a
> direct i/o write.
> 
> Thread two is doing direct i/o reads. When this results in calling
> ->direct_IO, all is well. GFS2 does all the correct locking to ensure
> that this is coherent with the buffered writes. There is a problem
> though - in generic_file_aio_read() we have:
> 
>        /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
>         if (filp->f_flags & O_DIRECT) {
>                 loff_t size;
>                 struct address_space *mapping;
>                 struct inode *inode;
> 
>                 mapping = filp->f_mapping;
>                 inode = mapping->host;
>                 if (!count)
>                         goto out; /* skip atime */
>                 size = i_size_read(inode);
>                 if (pos < size) {
>                         retval = filemap_write_and_wait_range(mapping, pos,
>                                         pos + iov_length(iov, nr_segs) - 1);
>                         if (!retval) {
>                                 retval = mapping->a_ops->direct_IO(READ, iocb,
>                                                         iov, pos, nr_segs);
>                         }
> 
> The important thing here is that it is checking the i_size with no
> locking! This means that it can check i_size, find its the same as the
> request read position (if the read gets in first before the next block
> is written to the file) and then fall back to buffered read.
> 
> That buffered read then gets a page which is invalidated by the page
> invalidation after the (DIO, but fallen back to buffered) write has
> occurred. I added some trace_printk() calls to the read path and it is
> fairly clear that is the path thats being taken when the failure occurs.
> Also, it explains why, despite the disk having been used and reused many
> times, the page that is returned from the failure is always 0, so its
> not a page thats been read from disk. It also explains why I've only
> ever seen this for the first page of the read, and the rest of the read
> is ok.
  Yes, a race like this is something Zheng and I have been talking about from
the beginning. Thanks for tracking down the details.

> Changing the test "if (pos < size) {" to "if (1) {" results in zero
> failures from the test program, since it takes the ->direct_IO path each
> time which then checks the i_size under the glock, in a race free
> manner.
> 
> So far as I can tell, this can be just as much a problem for local
> filesystems, since they may update i_size at any time too. I note that
> XFS appears to wrap this bit of code under its iolock, so that probably
> explains why XFS doesn't suffer from this issue.
> 
> So what is the correct fix... we have several options I think:
> 
>  a) Ignore it on the basis that its not important for normal workloads
  Zheng has a real-world use case which hits this race and it does look
rather normal. So I don't like a) very much.

>  b) Simply remove the "if (pos < size) {" test and leave the fs to get
> on with it (I know that leaves the btrfs test which also uses i_size
> later on, but that doesn't appear to be an issue since its after we've
> called ->direct_IO). Is it an issue that this may call an extra
> filemap_write_and_wait_range() in some cases where it didn't before? I
> suspect not since the extra calls would be for cases where the pages
> were beyond the end of file anyway (according to i_size), so a no-op in
> most cases.
>  c) Try to prevent race between page invalidation in dio write path and
> buffered read somehow (how?)
>  d) any other solutions?
  Well, there's Zheng's patch to make buffered write exit if pos >= i_size.
That fixes the race for anything but GFS2 as well. But b) will fix it as
well and if it works for GFS2 then I'm fine with it. Actually, it's not
that much different from Zheng's patch after all. We won't reach the
buffered read path at all because of
                        if (retval < 0 || !count || *ppos >= size) {
                                file_accessed(filp);
                                goto out;
                        }
check. Hum, but for that check to reliably prevent a buffered read from
returning a zero-filled page, we need to sample 'size' before calling
->direct_IO (as we currently do), which will still cause GFS2 the same
trouble Zheng's patch did, won't it?

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v3] vfs: fix a bug when we do some dio reads with append dio writes
  2014-01-14 19:19                       ` Jan Kara
@ 2014-01-15  7:19                         ` Zheng Liu
  2014-01-16 15:35                           ` [RFC] [PATCH] Fix race when checking i_size on direct i/o read Steven Whitehouse
  0 siblings, 1 reply; 21+ messages in thread
From: Zheng Liu @ 2014-01-15  7:19 UTC (permalink / raw)
  To: Jan Kara, Dave Chinner
  Cc: Steven Whitehouse, linux-fsdevel, Christoph Hellwig,
	Dmitry Monakhov, Alexander Viro, Zheng Liu, Lukas Czerner

Hi all,

Sorry for the delay because of the holiday and other issues.

On Tue, Jan 14, 2014 at 08:19:01PM +0100, Jan Kara wrote:
[...]
> > Sorry for the delay - I have been working on this and I've now figured
> > out whats really going on here. I'll describe the GFS2 case first, but
> > it seems that it is something that will affect other fs too, including
> > local fs most likely.
> > 
> > So we have one thread doing writes to a file, 1M at a time, using
> > O_DIRECT. Now with GFS2 that results in more or less a fallback to a
> > buffered write - there is one important difference however, and that is
> > that the direct i/o page invalidations still occur, just as if it were a
> > direct i/o write.
> > 
> > Thread two is doing direct i/o reads. When this results in calling
> > ->direct_IO, all is well. GFS2 does all the correct locking to ensure
> > that this is coherent with the buffered writes. There is a problem
> > though - in generic_file_aio_read() we have:
> > 
> >        /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
> >         if (filp->f_flags & O_DIRECT) {
> >                 loff_t size;
> >                 struct address_space *mapping;
> >                 struct inode *inode;
> > 
> >                 mapping = filp->f_mapping;
> >                 inode = mapping->host;
> >                 if (!count)
> >                         goto out; /* skip atime */
> >                 size = i_size_read(inode);
> >                 if (pos < size) {
> >                         retval = filemap_write_and_wait_range(mapping, pos,
> >                                         pos + iov_length(iov, nr_segs) - 1);
> >                         if (!retval) {
> >                                 retval = mapping->a_ops->direct_IO(READ, iocb,
> >                                                         iov, pos, nr_segs);
> >                         }
> > 
> > The important thing here is that it is checking the i_size with no
> > locking! This means that it can check i_size, find its the same as the
> > request read position (if the read gets in first before the next block
> > is written to the file) and then fall back to buffered read.
> > 
> > That buffered read then gets a page which is invalidated by the page
> > invalidation after the (DIO, but fallen back to buffered) write has
> > occurred. I added some trace_printk() calls to the read path and it is
> > fairly clear that is the path thats being taken when the failure occurs.
> > Also, it explains why, despite the disk having been used and reused many
> > times, the page that is returned from the failure is always 0, so its
> > not a page thats been read from disk. It also explains why I've only
> > ever seen this for the first page of the read, and the rest of the read
> > is ok.
>   Yes, race like this is something Zheng and I have been talking about from
> the beginning. Thanks for tracking down the details.
> 
> > Changing the test "if (pos < size) {" to "if (1) {" results in zero
> > failures from the test program, since it takes the ->direct_IO path each
> > time which then checks the i_size under the glock, in a race free
> > manner.
> > 
> > So far as I can tell, this can be just as much a problem for local
> > filesystems, since they may update i_size at any time too. I note that
> > XFS appears to wrap this bit of code under its iolock, so that probably
> > explains why XFS doesn't suffer from this issue.
> > 
> > So what is the correct fix... we have several options I think:
> > 
> >  a) Ignore it on the basis that its not important for normal workloads
>   Zheng has a real world usecase which hits this race and it does look
> rather normal. So don't like a) very much.

Yes, as far as I know, there are at least two applications that have hit
this problem in our production systems at Taobao.  That is why I dug into
this bug and hope it can be fixed.

> 
> >  b) Simply remove the "if (pos < size) {" test and leave the fs to get
> > on with it (I know that leaves the btrfs test which also uses i_size
> > later on, but that doesn't appear to be an issue since its after we've
> > called ->direct_IO). Is it an issue that this may call an extra
> > filemap_write_and_wait_range() in some cases where it didn't before? I
> > suspect not since the extra calls would be for cases where the pages
> > were beyond the end of file anyway (according to i_size), so a no-op in
> > most cases.

I tried this approach and I can confirm that it is OK for ext4.

Dave, I have sent out a patch for xfstests to add a new test case
for this bug but I haven't gotten any response [1].  Could you
please review it?  Thanks.

1. http://comments.gmane.org/gmane.comp.file-systems.xfs.general/58428

Regards,
                                                - Zheng

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [RFC] [PATCH] Fix race when checking i_size on direct i/o read
  2014-01-15  7:19                         ` Zheng Liu
@ 2014-01-16 15:35                           ` Steven Whitehouse
  2014-01-17 10:20                             ` Miklos Szeredi
  0 siblings, 1 reply; 21+ messages in thread
From: Steven Whitehouse @ 2014-01-16 15:35 UTC (permalink / raw)
  To: Zheng Liu
  Cc: Jan Kara, Dave Chinner, linux-fsdevel, Christoph Hellwig,
	Dmitry Monakhov, Alexander Viro, Zheng Liu, Lukas Czerner,
	Miklos Szeredi, linux-ext4, Chris Mason, Josef Bacik


Following on from the "Re: [PATCH v3] vfs: fix a bug when we do some dio
reads with append dio writes" thread on linux-fsdevel, this patch is my
current version of the fix proposed as option (b) in that thread.

Removing the i_size test from the direct i/o read path at VFS level
means that filesystems now have to deal with requests which are beyond
i_size themselves. These I've divided into three sets:

 a) Those with "no op" ->direct_IO (9p, cifs, ceph)
These are obviously not going to be an issue

 b) Those with "home brew" ->direct_IO (nfs, fuse)
I've been told that NFS should not have any problem with the larger
i_size, however I've added an extra test to FUSE to duplicate the
original behaviour just to be on the safe side. Someone who knows fuse
better may be able to confirm whether this is actually required or not.

 c) Those using __blockdev_direct_IO()
These call through to ->get_block() which should deal with the EOF
condition correctly. I've verified that with GFS2 and I believe that
Zheng has verified it for ext4. I've also run the test on XFS and it
passes both before and after this change.

The part of the patch in filemap.c looks a lot larger than it really is
- there are only two lines of real change. The rest is just indentation
of the contained code.

There remains a test of i_size though, which was added for btrfs. It
doesn't cause the other filesystems a problem as the test is performed
after ->direct_IO has been called. It is possible that there is a race
that does matter to btrfs, however this patch doesn't change that, so
its still an overall improvement.

So please have a look at this and let me know what you think. I guess
that when the time comes to submit it, it should probably be via the vfs
tree.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Reported-by: Zheng Liu <gnehzuil.liu@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: Chris Mason <clm@fb.com>
Cc: Josef Bacik <jbacik@fb.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 7e70506..89fdfd1 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2710,6 +2710,9 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	inode = file->f_mapping->host;
 	i_size = i_size_read(inode);
 
+	if ((rw == READ) && (offset > i_size))
+		return 0;
+
 	/* optimization for short read */
 	if (async_dio && rw != WRITE && offset + count > i_size) {
 		if (offset >= i_size)
diff --git a/mm/filemap.c b/mm/filemap.c
index b7749a9..0184286 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1428,30 +1428,28 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 		if (!count)
 			goto out; /* skip atime */
 		size = i_size_read(inode);
-		if (pos < size) {
-			retval = filemap_write_and_wait_range(mapping, pos,
+		retval = filemap_write_and_wait_range(mapping, pos,
 					pos + iov_length(iov, nr_segs) - 1);
-			if (!retval) {
-				retval = mapping->a_ops->direct_IO(READ, iocb,
-							iov, pos, nr_segs);
-			}
-			if (retval > 0) {
-				*ppos = pos + retval;
-				count -= retval;
-			}
+		if (!retval) {
+			retval = mapping->a_ops->direct_IO(READ, iocb,
+							   iov, pos, nr_segs);
+		}
+		if (retval > 0) {
+			*ppos = pos + retval;
+			count -= retval;
+		}
 
-			/*
-			 * Btrfs can have a short DIO read if we encounter
-			 * compressed extents, so if there was an error, or if
-			 * we've already read everything we wanted to, or if
-			 * there was a short read because we hit EOF, go ahead
-			 * and return.  Otherwise fallthrough to buffered io for
-			 * the rest of the read.
-			 */
-			if (retval < 0 || !count || *ppos >= size) {
-				file_accessed(filp);
-				goto out;
-			}
+		/*
+		 * Btrfs can have a short DIO read if we encounter
+		 * compressed extents, so if there was an error, or if
+		 * we've already read everything we wanted to, or if
+		 * there was a short read because we hit EOF, go ahead
+		 * and return.  Otherwise fallthrough to buffered io for
+		 * the rest of the read.
+		 */
+		if (retval < 0 || !count || *ppos >= size) {
+			file_accessed(filp);
+			goto out;
 		}
 	}
 



^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [RFC] [PATCH] Fix race when checking i_size on direct i/o read
  2014-01-16 15:35                           ` [RFC] [PATCH] Fix race when checking i_size on direct i/o read Steven Whitehouse
@ 2014-01-17 10:20                             ` Miklos Szeredi
  2014-01-24 14:42                               ` Steven Whitehouse
  0 siblings, 1 reply; 21+ messages in thread
From: Miklos Szeredi @ 2014-01-17 10:20 UTC (permalink / raw)
  To: Steven Whitehouse
  Cc: Zheng Liu, Jan Kara, Dave Chinner, Linux-Fsdevel,
	Christoph Hellwig, Dmitry Monakhov, Alexander Viro, Zheng Liu,
	Lukas Czerner, linux-ext4, Chris Mason, Josef Bacik

On Thu, Jan 16, 2014 at 4:35 PM, Steven Whitehouse <swhiteho@redhat.com> wrote:
>
> Following on from the "Re: [PATCH v3] vfs: fix a bug when we do some dio
> reads with append dio writes" thread on linux-fsdevel, this patch is my
> current version of the fix proposed as option (b) in that thread.
>
> Removing the i_size test from the direct i/o read path at VFS level
> means that filesystems now have to deal with requests which are beyond
> i_size themselves. These I've divided into three sets:
>
>  a) Those with "no op" ->direct_IO (9p, cifs, ceph)
> These are obviously not going to be an issue
>
>  b) Those with "home brew" ->direct_IO (nfs, fuse)
> I've been told that NFS should not have any problem with the larger
> i_size, however I've added an extra test to FUSE to duplicate the
> original behaviour just to be on the safe side. Someone who knows fuse
> better maybe able to confirm whether this is actually required or not.
>
>  c) Those using __blockdev_direct_IO()
> These call through to ->get_block() which should deal with the EOF
> condition correctly. I've verified that with GFS2 and I believe that
> Zheng has verified it for ext4. I've also run the test on XFS and it
> passes both before and after this change.
>
> The part of the patch in filemap.c looks a lot larger than it really is
> - there are only two lines of real change. The rest is just indentation
> of the contained code.
>
> There remains a test of i_size though, which was added for btrfs. It
> doesn't cause the other filesystems a problem as the test is performed
> after ->direct_IO has been called. It is possible that there is a race
> that does matter to btrfs, however this patch doesn't change that, so
> its still an overall improvement.
>
> So please have a look at this and let me know what you think. I guess
> that when time comes to submit it, it should probably be via the vfs
> tree.
>
> Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
> Reported-by: Zheng Liu <gnehzuil.liu@gmail.com>
> Cc: Jan Kara <jack@suse.cz>
> Cc: Dave Chinner <david@fromorbit.com>
> Cc: Miklos Szeredi <miklos@szeredi.hu>
> Cc: Chris Mason <clm@fb.com>
> Cc: Josef Bacik <jbacik@fb.com>
> Cc: Christoph Hellwig <hch@infradead.org>
> Cc: Alexander Viro <viro@zeniv.linux.org.uk>
>
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index 7e70506..89fdfd1 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -2710,6 +2710,9 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
>         inode = file->f_mapping->host;
>         i_size = i_size_read(inode);
>
> +       if ((rw == READ) && (offset > i_size))
> +               return 0;
> +

Hmm, OK.   It's not strictly needed, but a valid optimization.  So ACK.

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH] Fix race when checking i_size on direct i/o read
  2014-01-17 10:20                             ` Miklos Szeredi
@ 2014-01-24 14:42                               ` Steven Whitehouse
  2014-01-27 10:13                                 ` Jan Kara
  0 siblings, 1 reply; 21+ messages in thread
From: Steven Whitehouse @ 2014-01-24 14:42 UTC (permalink / raw)
  To: Alexander Viro
  Cc: Zheng Liu, Jan Kara, Dave Chinner, Linux-Fsdevel, Miklos Szeredi,
	Christoph Hellwig, Dmitry Monakhov, Alexander Viro, Zheng Liu,
	Lukas Czerner, linux-ext4, Chris Mason, Josef Bacik,
	linux-kernel, Andrew Morton


So far I've had one ACK for this, and no other comments. So I think it
is probably time to send this via some suitable tree. I'm guessing that
the vfs tree would be the most appropriate route, but I'm not sure that
there is one at the moment (I don't see anything recent at kernel.org),
so in that case I think -mm is the "back up plan". Al, please let me
know if you will take this?

Steve.

---------------------

Following on from the "Re: [PATCH v3] vfs: fix a bug when we do some dio
reads with append dio writes" thread on linux-fsdevel, this patch is my
current version of the fix proposed as option (b) in that thread.

Removing the i_size test from the direct i/o read path at vfs level
means that filesystems now have to deal with requests which are beyond
i_size themselves. These I've divided into three sets:

 a) Those with "no op" ->direct_IO (9p, cifs, ceph)
These are obviously not going to be an issue

 b) Those with "home brew" ->direct_IO (nfs, fuse)
I've been told that NFS should not have any problem with the larger
i_size, however I've added an extra test to FUSE to duplicate the
original behaviour just to be on the safe side.

 c) Those using __blockdev_direct_IO()
These call through to ->get_block() which should deal with the EOF
condition correctly. I've verified that with GFS2 and I believe that
Zheng has verified it for ext4. I've also run the test on XFS and it
passes both before and after this change.

The part of the patch in filemap.c looks a lot larger than it really is
- there are only two lines of real change. The rest is just indentation
of the contained code.

There remains a test of i_size though, which was added for btrfs. It
doesn't cause the other filesystems a problem as the test is performed
after ->direct_IO has been called. It is possible that there is a race
that does matter to btrfs, however this patch doesn't change that, so
its still an overall improvement.


Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Reported-by: Zheng Liu <gnehzuil.liu@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Dave Chinner <david@fromorbit.com>
Acked-by: Miklos Szeredi <miklos@szeredi.hu>
Cc: Chris Mason <clm@fb.com>
Cc: Josef Bacik <jbacik@fb.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 74f6ca5..77bcc30 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2727,6 +2727,9 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	inode = file->f_mapping->host;
 	i_size = i_size_read(inode);
 
+	if ((rw == READ) && (offset > i_size))
+		return 0;
+
 	/* optimization for short read */
 	if (async_dio && rw != WRITE && offset + count > i_size) {
 		if (offset >= i_size)
diff --git a/mm/filemap.c b/mm/filemap.c
index 7a7f3e0..d56d3c1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1428,30 +1428,28 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 		if (!count)
 			goto out; /* skip atime */
 		size = i_size_read(inode);
-		if (pos < size) {
-			retval = filemap_write_and_wait_range(mapping, pos,
+		retval = filemap_write_and_wait_range(mapping, pos,
 					pos + iov_length(iov, nr_segs) - 1);
-			if (!retval) {
-				retval = mapping->a_ops->direct_IO(READ, iocb,
-							iov, pos, nr_segs);
-			}
-			if (retval > 0) {
-				*ppos = pos + retval;
-				count -= retval;
-			}
+		if (!retval) {
+			retval = mapping->a_ops->direct_IO(READ, iocb,
+							   iov, pos, nr_segs);
+		}
+		if (retval > 0) {
+			*ppos = pos + retval;
+			count -= retval;
+		}
 
-			/*
-			 * Btrfs can have a short DIO read if we encounter
-			 * compressed extents, so if there was an error, or if
-			 * we've already read everything we wanted to, or if
-			 * there was a short read because we hit EOF, go ahead
-			 * and return.  Otherwise fallthrough to buffered io for
-			 * the rest of the read.
-			 */
-			if (retval < 0 || !count || *ppos >= size) {
-				file_accessed(filp);
-				goto out;
-			}
+		/*
+		 * Btrfs can have a short DIO read if we encounter
+		 * compressed extents, so if there was an error, or if
+		 * we've already read everything we wanted to, or if
+		 * there was a short read because we hit EOF, go ahead
+		 * and return.  Otherwise fallthrough to buffered io for
+		 * the rest of the read.
+		 */
+		if (retval < 0 || !count || *ppos >= size) {
+			file_accessed(filp);
+			goto out;
 		}
 	}
 



^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH] Fix race when checking i_size on direct i/o read
  2014-01-24 14:42                               ` Steven Whitehouse
@ 2014-01-27 10:13                                 ` Jan Kara
  0 siblings, 0 replies; 21+ messages in thread
From: Jan Kara @ 2014-01-27 10:13 UTC (permalink / raw)
  To: Steven Whitehouse
  Cc: Alexander Viro, Zheng Liu, Jan Kara, Dave Chinner, Linux-Fsdevel,
	Miklos Szeredi, Christoph Hellwig, Dmitry Monakhov, Zheng Liu,
	Lukas Czerner, linux-ext4, Chris Mason, Josef Bacik,
	linux-kernel, Andrew Morton

On Fri 24-01-14 14:42:22, Steven Whitehouse wrote:
> 
> So far I've had one ACK for this, and no other comments. So I think it
> is probably time to send this via some suitable tree. I'm guessing that
> the vfs tree would be the most appropriate route, but not sure that
> there is one at the moment (don't see anything recent at kernel.org)
> so in that case I think -mm is the "back up plan". Al, please let me
> know if you will take this?
> 
> Steve.
> 
> ---------------------
> 
> Following on from the "Re: [PATCH v3] vfs: fix a bug when we do some dio
> reads with append dio writes" thread on linux-fsdevel, this patch is my
> current version of the fix proposed as option (b) in that thread.
  I think it would be good to have full information about the race we are
trying to solve, and why your patch fixes it, in the changelog. We can still
defer to the list archives for the discussion of why you've decided to fix it
the way you've fixed it.

Other than that I agree with the change so feel free to add:
Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> Removing the i_size test from the direct i/o read path at vfs level
> means that filesystems now have to deal with requests which are beyond
> i_size themselves. These I've divided into three sets:
> 
>  a) Those with "no op" ->direct_IO (9p, cifs, ceph)
> These are obviously not going to be an issue
> 
>  b) Those with "home brew" ->direct_IO (nfs, fuse)
> I've been told that NFS should not have any problem with the larger
> i_size; however, I've added an extra test to FUSE to duplicate the
> original behaviour, just to be on the safe side.
> 
>  c) Those using __blockdev_direct_IO()
> These call through to ->get_block(), which should deal with the EOF
> condition correctly; a rough sketch of that pattern follows below. I've
> verified that with GFS2, and I believe that Zheng has verified it for
> ext4. I've also run the test on XFS and it passes both before and after
> this change.
> 
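> As an illustration of category (c) only (a hypothetical helper, not code
> taken from any of the filesystems above), a ->get_block() of this kind can
> simply leave the buffer unmapped for blocks at or beyond EOF; the generic
> dio code then ends the read early rather than returning data past i_size:
> 
>   /* Hypothetical example: refuse to map blocks at or beyond EOF. */
>   static int example_get_block(struct inode *inode, sector_t iblock,
>                                struct buffer_head *bh_result, int create)
>   {
>   	loff_t isize = i_size_read(inode);
>   	sector_t last_block = (isize + (1 << inode->i_blkbits) - 1)
>   					>> inode->i_blkbits;
> 
>   	if (!create && iblock >= last_block)
>   		return 0;	/* past EOF: bh_result stays unmapped */
> 
>   	/* ... normal lookup and map_bh(bh_result, ...) would go here ... */
>   	return 0;
>   }
> 
> Real filesystems typically get this behaviour for free from their block or
> extent lookup, since nothing is mapped beyond EOF; the sketch just makes
> the EOF handling explicit.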
> The part of the patch in filemap.c looks a lot larger than it really is
> - there are only two lines of real change. The rest is just indentation
> of the contained code.
> 
> There remains a test of i_size, though, which was added for btrfs. It
> doesn't cause the other filesystems a problem, as the test is performed
> after ->direct_IO has been called. It is possible that there is a race
> that does matter to btrfs; however, this patch doesn't change that, so
> it's still an overall improvement.
> 
> 
> Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
> Reported-by: Zheng Liu <gnehzuil.liu@gmail.com>
> Cc: Jan Kara <jack@suse.cz>
> Cc: Dave Chinner <david@fromorbit.com>
> Acked-by: Miklos Szeredi <miklos@szeredi.hu>
> Cc: Chris Mason <clm@fb.com>
> Cc: Josef Bacik <jbacik@fb.com>
> Cc: Christoph Hellwig <hch@infradead.org>
> Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> 
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index 74f6ca5..77bcc30 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -2727,6 +2727,9 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
>  	inode = file->f_mapping->host;
>  	i_size = i_size_read(inode);
>  
> +	if ((rw == READ) && (offset > i_size))
> +		return 0;
> +
>  	/* optimization for short read */
>  	if (async_dio && rw != WRITE && offset + count > i_size) {
>  		if (offset >= i_size)
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 7a7f3e0..d56d3c1 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -1428,30 +1428,28 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
>  		if (!count)
>  			goto out; /* skip atime */
>  		size = i_size_read(inode);
> -		if (pos < size) {
> -			retval = filemap_write_and_wait_range(mapping, pos,
> +		retval = filemap_write_and_wait_range(mapping, pos,
>  					pos + iov_length(iov, nr_segs) - 1);
> -			if (!retval) {
> -				retval = mapping->a_ops->direct_IO(READ, iocb,
> -							iov, pos, nr_segs);
> -			}
> -			if (retval > 0) {
> -				*ppos = pos + retval;
> -				count -= retval;
> -			}
> +		if (!retval) {
> +			retval = mapping->a_ops->direct_IO(READ, iocb,
> +							   iov, pos, nr_segs);
> +		}
> +		if (retval > 0) {
> +			*ppos = pos + retval;
> +			count -= retval;
> +		}
>  
> -			/*
> -			 * Btrfs can have a short DIO read if we encounter
> -			 * compressed extents, so if there was an error, or if
> -			 * we've already read everything we wanted to, or if
> -			 * there was a short read because we hit EOF, go ahead
> -			 * and return.  Otherwise fallthrough to buffered io for
> -			 * the rest of the read.
> -			 */
> -			if (retval < 0 || !count || *ppos >= size) {
> -				file_accessed(filp);
> -				goto out;
> -			}
> +		/*
> +		 * Btrfs can have a short DIO read if we encounter
> +		 * compressed extents, so if there was an error, or if
> +		 * we've already read everything we wanted to, or if
> +		 * there was a short read because we hit EOF, go ahead
> +		 * and return.  Otherwise fallthrough to buffered io for
> +		 * the rest of the read.
> +		 */
> +		if (retval < 0 || !count || *ppos >= size) {
> +			file_accessed(filp);
> +			goto out;
>  		}
>  	}
>  
> 
> 
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR


end of thread, other threads:[~2014-01-27 10:13 UTC | newest]

Thread overview: 21+ messages
2013-12-07 10:55 [PATCH v3] vfs: fix a bug when we do some dio reads with append dio writes Zheng Liu
2013-12-16  9:37 ` Steven Whitehouse
2013-12-16 10:01   ` Jan Kara
2013-12-17  9:43     ` Steven Whitehouse
2013-12-17 11:16       ` Zheng Liu
2013-12-17 12:17         ` Steven Whitehouse
2013-12-17 16:41           ` Zheng Liu
2013-12-19 12:27             ` Steven Whitehouse
2013-12-19 22:44               ` Dave Chinner
2013-12-20  9:28                 ` Steven Whitehouse
2013-12-23  3:00                   ` Dave Chinner
2014-01-14 15:22                     ` Steven Whitehouse
2014-01-14 19:19                       ` Jan Kara
2014-01-15  7:19                         ` Zheng Liu
2014-01-16 15:35                           ` [RFC] [PATCH] Fix race when checking i_size on direct i/o read Steven Whitehouse
2014-01-17 10:20                             ` Miklos Szeredi
2014-01-24 14:42                               ` Steven Whitehouse
2014-01-27 10:13                                 ` Jan Kara
2013-12-17 14:01       ` [PATCH v3] vfs: fix a bug when we do some dio reads with append dio writes Jan Kara
2013-12-17 15:32         ` Steven Whitehouse
2013-12-17 16:39           ` Jan Kara
