All of lore.kernel.org
 help / color / mirror / Atom feed
* buggy EOFBLOCKS_FL handling
@ 2010-08-19  3:01 Theodore Ts'o
  2010-08-19  3:04 ` [PATCH, RFC] ext4: Fix " Theodore Ts'o
                   ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Theodore Ts'o @ 2010-08-19  3:01 UTC (permalink / raw)
  To: linux-ext4


It looks like how we handle the EOFBLOCKS_FL flag is buggy.  This means
that when we fallocate a file to have 128k using the KEEP_SIZE flag, and
then write exactly 128k, the EOFBLOCKS_FL isn't getting cleared
correctly.

This is bad, because e2fsck will then complain about that inode.  If you
have a large number of inodes that are written with fallocate using
KEEP_SIZE, and then fill them up to their expected size, e2fsck will
potentially complain about a _huge_ number of inodes.

A proposed patch to fix this is forthcoming....

						- Ted

/*
 * Testcase for Google Bug 2928259
 *
 * Run this program while the current directory is in an ext4 filesystem,
 * then umount the file system and do a forced fsck (i.e., fsck -f /dev/XXX).
 *
 * If you get a e2fsck reported corruption, then the kernel is buggy:
 *
 * Inode 12 should not have EOFBLOCKS_FL set (size 40960, lblk 9)
 * Clear<y>? yes
 */

#define _GNU_SOURCE

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <getopt.h>
#include <errno.h>

#define FALLOC_FL_KEEP_SIZE     0x01

#ifndef SYS_fallocate
#ifdef __i386__
/* 32-bits */
#define SYS_fallocate 324
#elif __amd64__
/* 64-bits */
#define SYS_fallocate 285
#endif
#endif

int main(int argc, char **argv)
{
	int fd, ret, c;
	char *buf, *tmp;
	unsigned long fsize = 40960;
	unsigned long wsize = 40960;
	struct stat st;
	int flags = O_CREAT|O_TRUNC|O_RDWR;

	while ((c = getopt(argc, argv, "df:w:")) != EOF) {
		switch (c) {
		case 'd':
			flags |= O_DIRECT;
			break;
		case 'f':
			fsize = strtoul(optarg, &tmp, 0);
			if (*tmp) {
				fprintf(stderr, "Bad fsize - %s\n", optarg);
				exit(1);
			}
			break;
		case 'w':
			wsize = strtoul(optarg, &tmp, 0);
			if (*tmp) {
				fprintf(stderr, "Bad wsize - %s\n", optarg);
				exit(1);
			}
			break;
		default:
			fprintf(stderr, "Usage: testcase [-d] "
				"-f fallocate_size -w write_size\n");
		}
	}

	fd = open("test-file", flags, 0644);
	if (fd < 0) {
		perror("open");
		exit(1);
	}
	ret = syscall(SYS_fallocate, fd, FALLOC_FL_KEEP_SIZE, 0ULL,
		      (unsigned long long) fsize);
	if (ret) {
		perror("fallocate");
		exit(1);
	}
	if ((ret = posix_memalign((void **) &buf, 4096, wsize)) != 0) {
		errno = ret;
		perror("posix_memalign");
	}
	memset(buf, 0, wsize);
	ret = write(fd, buf, wsize);
	if (ret < 0) {
		perror("write");
		exit(1);
	} else if (ret != wsize) {
		fprintf(stderr, "Short write: actual %d, expected %lu\n",
			ret, wsize);
		exit(1);
	}
	if (fstat(fd, &st) < 0) {
		perror("fstat");
		exit(1);
	}
	printf("test-file has inode number %lu\n", (unsigned long) st.st_ino);
	printf("size is %lu, blocks*512 is %lu\n", (unsigned long) st.st_size,
	       (unsigned long) st.st_blocks*512);
	close(fd);
	exit(0);
}

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH, RFC] ext4: Fix EOFBLOCKS_FL handling
  2010-08-19  3:01 buggy EOFBLOCKS_FL handling Theodore Ts'o
@ 2010-08-19  3:04 ` Theodore Ts'o
  2010-08-21 21:07   ` [PATCH -v2] " Theodore Ts'o
  2010-08-19  5:13 ` buggy " Andreas Dilger
  2010-08-21 20:11 ` Updated test case Ted Ts'o
  2 siblings, 1 reply; 13+ messages in thread
From: Theodore Ts'o @ 2010-08-19  3:04 UTC (permalink / raw)
  To: Ext4 Developers List; +Cc: Theodore Ts'o

It turns out we have several problems with how EOFBLOCKS_FL is
handled.  First of all, there was a fencepost error where we were not
clearing the EOFBLOCKS_FL when fill in the last uninitialized block,
but rather when we allocate the next block _after_ the uninitalized
block.  Secondly we were not testing to see if we needed to clear the
EOFBLOCKS_FL when writing to the file O_DIRECT or when were converting
an uninitialized block (which is the most common case).

Google-Bug-Id: 2928259

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c |   89 +++++++++++++++++++++++++++++++++++-----------------
 1 files changed, 60 insertions(+), 29 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 06328d3..37cf887 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3180,6 +3180,48 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
                 unmap_underlying_metadata(bdev, block + i);
 }
 
+/*
+ * Handle EOFBLOCKS_FL flag, clearing it if necessary
+ */
+static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
+			      struct ext4_map_blocks *map,
+			      struct ext4_ext_path *path,
+			      unsigned int allocated)
+{
+	int i, depth;
+	struct ext4_extent_header *eh;
+	struct ext4_extent *ex, *last_ex;
+
+	depth = ext_depth(inode);
+	eh = path[depth].p_hdr;
+	ex = path[depth].p_ext;
+
+	if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
+		return 0;
+
+	if (unlikely(!eh->eh_entries)) {
+		EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
+				 "EOFBLOCKS_FL set");
+		return -EIO;
+	}
+	last_ex = EXT_LAST_EXTENT(eh);
+	/*
+	 * If the current leaf block was reached by looking at the
+	 * last index block all the way down the tree, and we are
+	 * extending the inode beyond the last extent in the current
+	 * leaf block, then clear the EOFBLOCKS_FL flag.
+	 */
+	for (i = depth-1; i >= 0; i--) {
+		if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
+			break;
+	}
+	if ((i < 0) &&
+	    (map->m_lblk + allocated >= le32_to_cpu(last_ex->ee_block) +
+	     ext4_ext_get_actual_len(last_ex)))
+		ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+	return ext4_mark_inode_dirty(handle, inode);
+}
+
 static int
 ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 			struct ext4_map_blocks *map,
@@ -3217,8 +3259,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 	if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
 		ret = ext4_convert_unwritten_extents_endio(handle, inode,
 							path);
-		if (ret >= 0)
+		if (ret >= 0) {
 			ext4_update_inode_fsync_trans(handle, inode, 1);
+			err = check_eofblocks_fl(handle, inode, map, path,
+						 allocated);
+		} else
+			err = ret;
 		goto out2;
 	}
 	/* buffered IO case */
@@ -3244,8 +3290,13 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 
 	/* buffered write, writepage time, convert*/
 	ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
-	if (ret >= 0)
+	if (ret >= 0) {
 		ext4_update_inode_fsync_trans(handle, inode, 1);
+		err = check_eofblocks_fl(handle, inode, map, path, allocated);
+		if (err < 0)
+			goto out2;
+	}
+
 out:
 	if (ret <= 0) {
 		err = ret;
@@ -3292,6 +3343,7 @@ out2:
 	}
 	return err ? err : allocated;
 }
+
 /*
  * Block allocation/map/preallocation routine for extents based files
  *
@@ -3315,9 +3367,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 {
 	struct ext4_ext_path *path = NULL;
 	struct ext4_extent_header *eh;
-	struct ext4_extent newex, *ex, *last_ex;
+	struct ext4_extent newex, *ex;
 	ext4_fsblk_t newblock;
-	int i, err = 0, depth, ret, cache_type;
+	int err = 0, depth, ret, cache_type;
 	unsigned int allocated = 0;
 	struct ext4_allocation_request ar;
 	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
@@ -3497,31 +3549,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			map->m_flags |= EXT4_MAP_UNINIT;
 	}
 
-	if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) {
-		if (unlikely(!eh->eh_entries)) {
-			EXT4_ERROR_INODE(inode,
-					 "eh->eh_entries == 0 and "
-					 "EOFBLOCKS_FL set");
-			err = -EIO;
-			goto out2;
-		}
-		last_ex = EXT_LAST_EXTENT(eh);
-		/*
-		 * If the current leaf block was reached by looking at
-		 * the last index block all the way down the tree, and
-		 * we are extending the inode beyond the last extent
-		 * in the current leaf block, then clear the
-		 * EOFBLOCKS_FL flag.
-		 */
-		for (i = depth-1; i >= 0; i--) {
-			if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
-				break;
-		}
-		if ((i < 0) &&
-		    (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) +
-		     ext4_ext_get_actual_len(last_ex)))
-			ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
-	}
+	err = check_eofblocks_fl(handle, inode, map, path, ar.len);
+	if (err)
+		goto out2;
+
 	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
 	if (err) {
 		/* free data blocks we just allocated */
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: buggy EOFBLOCKS_FL handling
  2010-08-19  3:01 buggy EOFBLOCKS_FL handling Theodore Ts'o
  2010-08-19  3:04 ` [PATCH, RFC] ext4: Fix " Theodore Ts'o
@ 2010-08-19  5:13 ` Andreas Dilger
  2010-08-19 14:44   ` Ted Ts'o
  2010-08-21 20:11 ` Updated test case Ted Ts'o
  2 siblings, 1 reply; 13+ messages in thread
From: Andreas Dilger @ 2010-08-19  5:13 UTC (permalink / raw)
  To: Theodore Ts'o; +Cc: linux-ext4

On 2010-08-18, at 21:01, Theodore Ts'o wrote:
> It looks like how we handle the EOFBLOCKS_FL flag is buggy.  This means
> that when we fallocate a file to have 128k using the KEEP_SIZE flag, and
> then write exactly 128k, the EOFBLOCKS_FL isn't getting cleared
> correctly.
> 
> This is bad, because e2fsck will then complain about that inode.  If you
> have a large number of inodes that are written with fallocate using
> KEEP_SIZE, and then fill them up to their expected size, e2fsck will
> potentially complain about a _huge_ number of inodes.

Probably e2fsck also shouldn't complain if EOFBLOCKS_FL is set, but the i_size is within the range implied by i_blocks.

Cheers, Andreas






^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: buggy EOFBLOCKS_FL handling
  2010-08-19  5:13 ` buggy " Andreas Dilger
@ 2010-08-19 14:44   ` Ted Ts'o
  2010-08-19 17:03     ` Eric Sandeen
  0 siblings, 1 reply; 13+ messages in thread
From: Ted Ts'o @ 2010-08-19 14:44 UTC (permalink / raw)
  To: Andreas Dilger; +Cc: linux-ext4

On Wed, Aug 18, 2010 at 11:13:00PM -0600, Andreas Dilger wrote:
> On 2010-08-18, at 21:01, Theodore Ts'o wrote:
> > It looks like how we handle the EOFBLOCKS_FL flag is buggy.  This means
> > that when we fallocate a file to have 128k using the KEEP_SIZE flag, and
> > then write exactly 128k, the EOFBLOCKS_FL isn't getting cleared
> > correctly.
> > 
> > This is bad, because e2fsck will then complain about that inode.  If you
> > have a large number of inodes that are written with fallocate using
> > KEEP_SIZE, and then fill them up to their expected size, e2fsck will
> > potentially complain about a _huge_ number of inodes.
> 
> Probably e2fsck also shouldn't complain if EOFBLOCKS_FL is set, but
> the i_size is within the range implied by i_blocks.

My current thinking is to have an EOFBLOCKS_relaxed mode setting in
/etc/e2fsck.conf which controls whether we test for this case or not.
Technically it *is* an error, but if there are file systems with a
large number of files in this state, running e2fsck could take a
***very*** long time (potentially, hours longer than would otherwise
be expected).  Hopefully once the bug fix gets pushed out, eventually
we'll be able to turn this feature off.  (Where eventually might be a
year or two, given that it's probably too late to get this fixed in
RHEL 6.  :-( )

	   	    	    	      - Ted


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: buggy EOFBLOCKS_FL handling
  2010-08-19 14:44   ` Ted Ts'o
@ 2010-08-19 17:03     ` Eric Sandeen
  2010-08-19 17:11       ` Ted Ts'o
  0 siblings, 1 reply; 13+ messages in thread
From: Eric Sandeen @ 2010-08-19 17:03 UTC (permalink / raw)
  To: Ted Ts'o; +Cc: Andreas Dilger, linux-ext4

Ted Ts'o wrote:
> On Wed, Aug 18, 2010 at 11:13:00PM -0600, Andreas Dilger wrote:
>> On 2010-08-18, at 21:01, Theodore Ts'o wrote:
>>> It looks like how we handle the EOFBLOCKS_FL flag is buggy.  This means
>>> that when we fallocate a file to have 128k using the KEEP_SIZE flag, and
>>> then write exactly 128k, the EOFBLOCKS_FL isn't getting cleared
>>> correctly.
>>>
>>> This is bad, because e2fsck will then complain about that inode.  If you
>>> have a large number of inodes that are written with fallocate using
>>> KEEP_SIZE, and then fill them up to their expected size, e2fsck will
>>> potentially complain about a _huge_ number of inodes.
>> Probably e2fsck also shouldn't complain if EOFBLOCKS_FL is set, but
>> the i_size is within the range implied by i_blocks.
> 
> My current thinking is to have an EOFBLOCKS_relaxed mode setting in
> /etc/e2fsck.conf which controls whether we test for this case or not.
> Technically it *is* an error, but if there are file systems with a
> large number of files in this state, running e2fsck could take a
> ***very*** long time (potentially, hours longer than would otherwise
> be expected).  Hopefully once the bug fix gets pushed out, eventually
> we'll be able to turn this feature off.  (Where eventually might be a
> year or two, given that it's probably too late to get this fixed in
> RHEL 6.  :-( )

Oh it can get fixed during rhel6's lifetime, certainly.  :)

Regarding a conf file setting, I'd really rather not have another knob
that is non-obvious to the user.

Maybe e2fsck could tally these and after I dunno, 10 or 20 or so, ask
whether it should keep flagging them or just go into "yes" mode for
the rest of the inodes with that problem?

-Eric

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: buggy EOFBLOCKS_FL handling
  2010-08-19 17:03     ` Eric Sandeen
@ 2010-08-19 17:11       ` Ted Ts'o
  2010-08-19 18:33         ` Andreas Dilger
  0 siblings, 1 reply; 13+ messages in thread
From: Ted Ts'o @ 2010-08-19 17:11 UTC (permalink / raw)
  To: Eric Sandeen; +Cc: Andreas Dilger, linux-ext4

On Thu, Aug 19, 2010 at 12:03:17PM -0500, Eric Sandeen wrote:
> 
> Maybe e2fsck could tally these and after I dunno, 10 or 20 or so, ask
> whether it should keep flagging them or just go into "yes" mode for
> the rest of the inodes with that problem?

Maybe.  I'd need to do some testing to see what percentage of the
"takes hours longer" is caused by needing to fix truly vast numbers of
inodes, versus the fact that writing the e2fsck log file was taking a
huge amount of time.  I'm not sure, asking the user, "I've tried
fixing 100 of these inodes, and it looks like there are runs more,
want to skip checking for the rest" is all that great (i.e., a "go
into automatic 'no' mode for this question").

The other possibility is that I'd make it configurable by e2fsck.conf,
but change the default to be "ignore this fs error", and then 2-3
years later, change the default to "check for this fs error", without
actually requiring most users to have a knob in their e2fsck.conf
file.

							- Ted

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: buggy EOFBLOCKS_FL handling
  2010-08-19 17:11       ` Ted Ts'o
@ 2010-08-19 18:33         ` Andreas Dilger
  0 siblings, 0 replies; 13+ messages in thread
From: Andreas Dilger @ 2010-08-19 18:33 UTC (permalink / raw)
  To: Ted Ts'o; +Cc: Eric Sandeen, linux-ext4

On 2010-08-19, at 11:11, Ted Ts'o wrote:
> On Thu, Aug 19, 2010 at 12:03:17PM -0500, Eric Sandeen wrote:
>> 
>> Maybe e2fsck could tally these and after I dunno, 10 or 20 or so, ask
>> whether it should keep flagging them or just go into "yes" mode for
>> the rest of the inodes with that problem?
> 
> Maybe.  I'd need to do some testing to see what percentage of the
> "takes hours longer" is caused by needing to fix truly vast numbers of
> inodes, versus the fact that writing the e2fsck log file was taking a
> huge amount of time.  I'm not sure, asking the user, "I've tried
> fixing 100 of these inodes, and it looks like there are runs more,
> want to skip checking for the rest" is all that great (i.e., a "go
> into automatic 'no' mode for this question").
> 
> The other possibility is that I'd make it configurable by e2fsck.conf,
> but change the default to be "ignore this fs error", and then 2-3
> years later, change the default to "check for this fs error", without
> actually requiring most users to have a knob in their e2fsck.conf
> file.

To me this falls into the class of "silently fix our mistake" kind of problem, similar to what we did in the Lustre e2fsprogs with the extent "_hi" field not being initialized in early versions of the extent patch.

If the slowdown is due to actually updating thousands of such inodes (vs. just printing the error on the screen) you could cap the number of inodes fixed for this problem at some limit, and then every time a full e2fsck is run it would fix a bunch more.

Presumably an updated kernel and regular filesystem usage would also tend to clean up this flag if old files are deleted and new ones created, but it is good to have e2fsck do it also, for files that are not modified for a long time.

Cheers, Andreas






^ permalink raw reply	[flat|nested] 13+ messages in thread

* Updated test case
  2010-08-19  3:01 buggy EOFBLOCKS_FL handling Theodore Ts'o
  2010-08-19  3:04 ` [PATCH, RFC] ext4: Fix " Theodore Ts'o
  2010-08-19  5:13 ` buggy " Andreas Dilger
@ 2010-08-21 20:11 ` Ted Ts'o
  2010-08-22  0:40   ` Eric Sandeen
  2 siblings, 1 reply; 13+ messages in thread
From: Ted Ts'o @ 2010-08-21 20:11 UTC (permalink / raw)
  To: linux-ext4

This is an updated of my test program, with a new option that allows
us to seek to the middle of the unitialized extent.  This allows us to
test more cases.  Suggested test cases:

./testcase
./testcase -w 8192
./testcase -s 8192 -w 8192
./testcase -s 8192
./testcase -s 409600
./testcase -d
./testcase -d -w 8192
./testcase -d -s 8192
./testcase -d -s 8192 -w 8192
./testcase -d -s 409600

What I normally do is run it something like this:

mount /scratch ; pushd /scratch; ~/testcase <opts>; popd ; umount /scratch ; debugfs /dev/sdc1 -R "stat test-file"

What to look for is whether the flags field is either 0x480000 or
0x80000.  The 0x400000 flag is the EOFBLOCKS_FL flag.  If last extent
is uninitialized, then the EOFBLOCKS_FL flag should be set.  If the
last extent does not have the "uninit" flag, then the EOFBLOCKS_FL
should be clear.

       	  					- Ted

/*
 * Testcase for Google Bug 2928259
 *
 * Run this program while the current directory is in an ext4 filesystem,
 * then umount the file system and do a forced fsck (i.e., fsck -f /dev/XXX).
 *
 * If you get a e2fsck reported corruption, then the kernel is buggy:
 *
 * Inode 12 should not have EOFBLOCKS_FL set (size 40960, lblk 9)
 * Clear<y>? yes
 */

#define _GNU_SOURCE

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <getopt.h>
#include <errno.h>

#define FALLOC_FL_KEEP_SIZE     0x01

#ifndef SYS_fallocate
#ifdef __i386__
/* 32-bits */
#define SYS_fallocate 324
#elif __amd64__
/* 64-bits */
#define SYS_fallocate 285
#endif
#endif

int main(int argc, char **argv)
{
	int fd, ret, c;
	char *buf, *tmp;
	unsigned long fsize = 40960;
	unsigned long wsize = 40960;
	off_t offset = 0;
	struct stat st;
	int flags = O_CREAT|O_TRUNC|O_RDWR;

	while ((c = getopt(argc, argv, "df:s:w:")) != EOF) {
		switch (c) {
		case 'd':
			flags |= O_DIRECT;
			break;
		case 'f':
			fsize = strtoul(optarg, &tmp, 0);
			if (*tmp) {
				fprintf(stderr, "Bad fsize - %s\n", optarg);
				exit(1);
			}
			break;
		case 's':
			offset = strtol(optarg, &tmp, 0);
			if (*tmp) {
				fprintf(stderr, "Bad offset - %s\n", optarg);
				exit(1);
			}
			break;
		case 'w':
			wsize = strtoul(optarg, &tmp, 0);
			if (*tmp) {
				fprintf(stderr, "Bad wsize - %s\n", optarg);
				exit(1);
			}
			break;
		default:
			fprintf(stderr, "Usage: testcase [-d] "
				"-f fallocate_size -w write_size\n");
		}
	}

	fd = open("test-file", flags, 0644);
	if (fd < 0) {
		perror("open");
		exit(1);
	}
	ret = syscall(SYS_fallocate, fd, FALLOC_FL_KEEP_SIZE, 0ULL,
		      (unsigned long long) fsize);
	if (ret) {
		perror("fallocate");
		exit(1);
	}
	if ((ret = posix_memalign((void **) &buf, 4096, wsize)) != 0) {
		errno = ret;
		perror("posix_memalign");
	}
	if (lseek(fd, offset, SEEK_SET) < 0) {
		perror("lseek");
		exit(1);
	}
	memset(buf, 255, wsize);
	ret = write(fd, buf, wsize);
	if (ret < 0) {
		perror("write");
		exit(1);
	} else if (ret != wsize) {
		fprintf(stderr, "Short write: actual %d, expected %lu\n",
			ret, wsize);
		exit(1);
	}
	if (fstat(fd, &st) < 0) {
		perror("fstat");
		exit(1);
	}
	printf("test-file has inode number %lu\n", (unsigned long) st.st_ino);
	printf("size is %lu, blocks*512 is %lu\n", (unsigned long) st.st_size,
	       (unsigned long) st.st_blocks*512);
	close(fd);
	exit(0);
}

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH -v2] ext4: Fix EOFBLOCKS_FL handling
  2010-08-19  3:04 ` [PATCH, RFC] ext4: Fix " Theodore Ts'o
@ 2010-08-21 21:07   ` Theodore Ts'o
  0 siblings, 0 replies; 13+ messages in thread
From: Theodore Ts'o @ 2010-08-21 21:07 UTC (permalink / raw)
  To: Ext4 Developers List; +Cc: Theodore Ts'o

It turns out we have several problems with how EOFBLOCKS_FL is
handled.  First of all, there was a fencepost error where we were not
clearing the EOFBLOCKS_FL when fill in the last uninitialized block,
but rather when we allocate the next block _after_ the uninitalized
block.  Secondly we were not testing to see if we needed to clear the
EOFBLOCKS_FL when writing to the file O_DIRECT or when were converting
an uninitialized block (which is the most common case).

Google-Bug-Id: 2928259

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c |   91 ++++++++++++++++++++++++++++++++++++-----------------
 1 files changed, 62 insertions(+), 29 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 06328d3..98797e2 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3180,6 +3180,50 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
                 unmap_underlying_metadata(bdev, block + i);
 }
 
+/*
+ * Handle EOFBLOCKS_FL flag, clearing it if necessary
+ */
+static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
+			      struct ext4_map_blocks *map,
+			      struct ext4_ext_path *path,
+			      unsigned int len)
+{
+	int i, depth;
+	struct ext4_extent_header *eh;
+	struct ext4_extent *ex, *last_ex;
+
+	if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
+		return 0;
+
+	depth = ext_depth(inode);
+	eh = path[depth].p_hdr;
+	ex = path[depth].p_ext;
+
+	if (unlikely(!eh->eh_entries)) {
+		EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
+				 "EOFBLOCKS_FL set");
+		return -EIO;
+	}
+	last_ex = EXT_LAST_EXTENT(eh);
+	/*
+	 * If the current leaf block was reached by looking at the
+	 * last index block all the way down the tree, and we are
+	 * extending the inode beyond the last extent in the current
+	 * leaf block, then clear the EOFBLOCKS_FL flag.
+	 */
+	if (map->m_lblk + len < le32_to_cpu(last_ex->ee_block) +
+	    ext4_ext_get_actual_len(last_ex))
+		return 0;
+	for (i = depth-1; i >= 0; i--) {
+		if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
+			break;
+	}
+	if (i >= 0)
+		return 0;
+	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+	return ext4_mark_inode_dirty(handle, inode);
+}
+
 static int
 ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 			struct ext4_map_blocks *map,
@@ -3217,8 +3261,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 	if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
 		ret = ext4_convert_unwritten_extents_endio(handle, inode,
 							path);
-		if (ret >= 0)
+		if (ret >= 0) {
 			ext4_update_inode_fsync_trans(handle, inode, 1);
+			err = check_eofblocks_fl(handle, inode, map, path,
+						 map->m_len);
+		} else
+			err = ret;
 		goto out2;
 	}
 	/* buffered IO case */
@@ -3244,8 +3292,13 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 
 	/* buffered write, writepage time, convert*/
 	ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
-	if (ret >= 0)
+	if (ret >= 0) {
 		ext4_update_inode_fsync_trans(handle, inode, 1);
+		err = check_eofblocks_fl(handle, inode, map, path, map->m_len);
+		if (err < 0)
+			goto out2;
+	}
+
 out:
 	if (ret <= 0) {
 		err = ret;
@@ -3292,6 +3345,7 @@ out2:
 	}
 	return err ? err : allocated;
 }
+
 /*
  * Block allocation/map/preallocation routine for extents based files
  *
@@ -3315,9 +3369,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 {
 	struct ext4_ext_path *path = NULL;
 	struct ext4_extent_header *eh;
-	struct ext4_extent newex, *ex, *last_ex;
+	struct ext4_extent newex, *ex;
 	ext4_fsblk_t newblock;
-	int i, err = 0, depth, ret, cache_type;
+	int err = 0, depth, ret, cache_type;
 	unsigned int allocated = 0;
 	struct ext4_allocation_request ar;
 	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
@@ -3497,31 +3551,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			map->m_flags |= EXT4_MAP_UNINIT;
 	}
 
-	if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) {
-		if (unlikely(!eh->eh_entries)) {
-			EXT4_ERROR_INODE(inode,
-					 "eh->eh_entries == 0 and "
-					 "EOFBLOCKS_FL set");
-			err = -EIO;
-			goto out2;
-		}
-		last_ex = EXT_LAST_EXTENT(eh);
-		/*
-		 * If the current leaf block was reached by looking at
-		 * the last index block all the way down the tree, and
-		 * we are extending the inode beyond the last extent
-		 * in the current leaf block, then clear the
-		 * EOFBLOCKS_FL flag.
-		 */
-		for (i = depth-1; i >= 0; i--) {
-			if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
-				break;
-		}
-		if ((i < 0) &&
-		    (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) +
-		     ext4_ext_get_actual_len(last_ex)))
-			ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
-	}
+	err = check_eofblocks_fl(handle, inode, map, path, ar.len);
+	if (err)
+		goto out2;
+
 	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
 	if (err) {
 		/* free data blocks we just allocated */
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: Updated test case
  2010-08-21 20:11 ` Updated test case Ted Ts'o
@ 2010-08-22  0:40   ` Eric Sandeen
  2010-08-22 11:42     ` Ted Ts'o
  0 siblings, 1 reply; 13+ messages in thread
From: Eric Sandeen @ 2010-08-22  0:40 UTC (permalink / raw)
  To: Ted Ts'o; +Cc: linux-ext4

Ted Ts'o wrote:
> This is an updated of my test program, with a new option that allows
> us to seek to the middle of the unitialized extent.  This allows us to
> test more cases.  Suggested test cases:
> 
> ./testcase
> ./testcase -w 8192
> ./testcase -s 8192 -w 8192
> ./testcase -s 8192
> ./testcase -s 409600
> ./testcase -d
> ./testcase -d -w 8192
> ./testcase -d -s 8192
> ./testcase -d -s 8192 -w 8192
> ./testcase -d -s 409600

This can be done easily with the xfs_io tool:

# Buffered writes
xfs_io    -f -F -c "falloc -k 0 40960" -c "pwrite     0 40960" eof1
xfs_io    -f -F -c "falloc -k 0 40960" -c "pwrite  8192  8192" eof2
xfs_io    -f -F -c "falloc -k 0 40960" -c "pwrite  8192 40960" eof3
xfs_io    -f -F -c "falloc -k 0 40960" -c "pwrite 40960 40960" eof4
# O_DIRECT writes
xfs_io -d -f -F -c "falloc -k 0 40960" -c "pwrite     0 40960" eof5
xfs_io -d -f -F -c "falloc -k 0 40960" -c "pwrite  8192  8192" eof6
xfs_io -d -f -F -c "falloc -k 0 40960" -c "pwrite  8192 40960" eof7
xfs_io -d -f -F -c "falloc -k 0 40960" -c "pwrite 40960 40960" eof8

and if we want this to stay fixed, it needs to be part of a test suite.

I'll send an xfstest but it'd be really great if could could work
inside the xfstests framework when devising testcases...

Ted, is just checking for fs corruption is enough or do you think a
test needs the debugfs stat inspection step?  It'd be easy enough
to special-case a debugfs step for ext4.

> What I normally do is run it something like this:
> 
> mount /scratch ; pushd /scratch; ~/testcase <opts>; popd ; umount /scratch ; debugfs /dev/sdc1 -R "stat test-file"
> 
> What to look for is whether the flags field is either 0x480000 or
> 0x80000.  The 0x400000 flag is the EOFBLOCKS_FL flag.  If last extent
> is uninitialized, then the EOFBLOCKS_FL flag should be set.  

only if that last extent is past i_size, though...

> If the
> last extent does not have the "uninit" flag, then the EOFBLOCKS_FL
> should be clear.

And the above should always be true.

Thanks,
-Eric



>        	  					- Ted
> 
> /*
>  * Testcase for Google Bug 2928259
>  *
>  * Run this program while the current directory is in an ext4 filesystem,
>  * then umount the file system and do a forced fsck (i.e., fsck -f /dev/XXX).
>  *
>  * If you get a e2fsck reported corruption, then the kernel is buggy:
>  *
>  * Inode 12 should not have EOFBLOCKS_FL set (size 40960, lblk 9)
>  * Clear<y>? yes
>  */
> 
> #define _GNU_SOURCE
> 
> #include <stdio.h>
> #include <unistd.h>
> #include <stdlib.h>
> #include <string.h>
> #include <sys/types.h>
> #include <sys/syscall.h>
> #include <sys/stat.h>
> #include <fcntl.h>
> #include <getopt.h>
> #include <errno.h>
> 
> #define FALLOC_FL_KEEP_SIZE     0x01
> 
> #ifndef SYS_fallocate
> #ifdef __i386__
> /* 32-bits */
> #define SYS_fallocate 324
> #elif __amd64__
> /* 64-bits */
> #define SYS_fallocate 285
> #endif
> #endif
> 
> int main(int argc, char **argv)
> {
> 	int fd, ret, c;
> 	char *buf, *tmp;
> 	unsigned long fsize = 40960;
> 	unsigned long wsize = 40960;
> 	off_t offset = 0;
> 	struct stat st;
> 	int flags = O_CREAT|O_TRUNC|O_RDWR;
> 
> 	while ((c = getopt(argc, argv, "df:s:w:")) != EOF) {
> 		switch (c) {
> 		case 'd':
> 			flags |= O_DIRECT;
> 			break;
> 		case 'f':
> 			fsize = strtoul(optarg, &tmp, 0);
> 			if (*tmp) {
> 				fprintf(stderr, "Bad fsize - %s\n", optarg);
> 				exit(1);
> 			}
> 			break;
> 		case 's':
> 			offset = strtol(optarg, &tmp, 0);
> 			if (*tmp) {
> 				fprintf(stderr, "Bad offset - %s\n", optarg);
> 				exit(1);
> 			}
> 			break;
> 		case 'w':
> 			wsize = strtoul(optarg, &tmp, 0);
> 			if (*tmp) {
> 				fprintf(stderr, "Bad wsize - %s\n", optarg);
> 				exit(1);
> 			}
> 			break;
> 		default:
> 			fprintf(stderr, "Usage: testcase [-d] "
> 				"-f fallocate_size -w write_size\n");
> 		}
> 	}
> 
> 	fd = open("test-file", flags, 0644);
> 	if (fd < 0) {
> 		perror("open");
> 		exit(1);
> 	}
> 	ret = syscall(SYS_fallocate, fd, FALLOC_FL_KEEP_SIZE, 0ULL,
> 		      (unsigned long long) fsize);
> 	if (ret) {
> 		perror("fallocate");
> 		exit(1);
> 	}
> 	if ((ret = posix_memalign((void **) &buf, 4096, wsize)) != 0) {
> 		errno = ret;
> 		perror("posix_memalign");
> 	}
> 	if (lseek(fd, offset, SEEK_SET) < 0) {
> 		perror("lseek");
> 		exit(1);
> 	}
> 	memset(buf, 255, wsize);
> 	ret = write(fd, buf, wsize);
> 	if (ret < 0) {
> 		perror("write");
> 		exit(1);
> 	} else if (ret != wsize) {
> 		fprintf(stderr, "Short write: actual %d, expected %lu\n",
> 			ret, wsize);
> 		exit(1);
> 	}
> 	if (fstat(fd, &st) < 0) {
> 		perror("fstat");
> 		exit(1);
> 	}
> 	printf("test-file has inode number %lu\n", (unsigned long) st.st_ino);
> 	printf("size is %lu, blocks*512 is %lu\n", (unsigned long) st.st_size,
> 	       (unsigned long) st.st_blocks*512);
> 	close(fd);
> 	exit(0);
> }
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: Updated test case
  2010-08-22  0:40   ` Eric Sandeen
@ 2010-08-22 11:42     ` Ted Ts'o
  2010-08-22 15:35       ` Eric Sandeen
  2010-08-23 18:05       ` Andreas Dilger
  0 siblings, 2 replies; 13+ messages in thread
From: Ted Ts'o @ 2010-08-22 11:42 UTC (permalink / raw)
  To: Eric Sandeen; +Cc: linux-ext4

On Sat, Aug 21, 2010 at 07:40:10PM -0500, Eric Sandeen wrote:
> I'll send an xfstest but it'd be really great if could could work
> inside the xfstests framework when devising testcases...

If you could put together an xfstests, that would be great.  I hadn't
because Mike's been trying to remind me that I really need to delegate
to others :-), and we do have someone at Google who can put the
xfstest script together.  You can probably do it faster than he can,
though.

I didn't use xfs_io because I don't know how to use it, and because
it's not one of those things which is regularly on our production
machines that we use for testing.  I probably start exploring all of
the things that can be done with it, though!

> Ted, is just checking for fs corruption is enough or do you think a
> test needs the debugfs stat inspection step?  It'd be easy enough
> to special-case a debugfs step for ext4.

Well, if we end up suppressing the EOFBLOCKS_FL test e2fsck (which is
what we've already done as an emergency workaround) we can't count on
e2fsck detecting the problem, which is why I phrased this the way I
did for Aditya's benefit.

> > What I normally do is run it something like this:
> > 
> > mount /scratch ; pushd /scratch; ~/testcase <opts>; popd ; umount /scratch ; debugfs /dev/sdc1 -R "stat test-file"
> > 
> > What to look for is whether the flags field is either 0x480000 or
> > 0x80000.  The 0x400000 flag is the EOFBLOCKS_FL flag.  If last extent
> > is uninitialized, then the EOFBLOCKS_FL flag should be set.  
> 
> only if that last extent is past i_size, though...

Good point, and I guess I did have at least one test case where that
wasn't true.

						- Ted

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: Updated test case
  2010-08-22 11:42     ` Ted Ts'o
@ 2010-08-22 15:35       ` Eric Sandeen
  2010-08-23 18:05       ` Andreas Dilger
  1 sibling, 0 replies; 13+ messages in thread
From: Eric Sandeen @ 2010-08-22 15:35 UTC (permalink / raw)
  To: Ted Ts'o; +Cc: linux-ext4

Ted Ts'o wrote:
> On Sat, Aug 21, 2010 at 07:40:10PM -0500, Eric Sandeen wrote:
>> I'll send an xfstest but it'd be really great if could could work
>> inside the xfstests framework when devising testcases...
> 
> If you could put together an xfstests, that would be great.  I hadn't
> because Mike's been trying to remind me that I really need to delegate
> to others :-), and we do have someone at Google who can put the
> xfstest script together.  You can probably do it faster than he can,
> though.

Hah, I'm also supposed to delegate :D Let's see what your person can
come up with, I'd really like to start seeing more people contribute to
the test suite.  I'm happy to answer any questions.

> I didn't use xfs_io because I don't know how to use it, and because
> it's not one of those things which is regularly on our production
> machines that we use for testing.  I probably start exploring all of
> the things that can be done with it, though!

Sure, I know it's kind of an oddball tool, but it's really a good swiss
army knife for creating testcases like this.  Probably faster than
writing C.  :)

>> Ted, is just checking for fs corruption is enough or do you think a
>> test needs the debugfs stat inspection step?  It'd be easy enough
>> to special-case a debugfs step for ext4.
> 
> Well, if we end up suppressing the EOFBLOCKS_FL test e2fsck (which is
> what we've already done as an emergency workaround) we can't count on
> e2fsck detecting the problem, which is why I phrased this the way I
> did for Aditya's benefit.

Ok.  Explicitly exercising blocks-past-EOF on any fallocate-capable
fs is probably a good thing for the test to do, but since ext4 in
particular had a bug, we can always do a debugfs step under
an FSTYP==ext4 case, which is silent on success, and prints out
something on failure (which would change the output and make the
test fail)

-Eric

>>> What I normally do is run it something like this:
>>>
>>> mount /scratch ; pushd /scratch; ~/testcase <opts>; popd ; umount /scratch ; debugfs /dev/sdc1 -R "stat test-file"
>>>
>>> What to look for is whether the flags field is either 0x480000 or
>>> 0x80000.  The 0x400000 flag is the EOFBLOCKS_FL flag.  If last extent
>>> is uninitialized, then the EOFBLOCKS_FL flag should be set.  
>> only if that last extent is past i_size, though...
> 
> Good point, and I guess I did have at least one test case where that
> wasn't true.
> 
> 						- Ted


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: Updated test case
  2010-08-22 11:42     ` Ted Ts'o
  2010-08-22 15:35       ` Eric Sandeen
@ 2010-08-23 18:05       ` Andreas Dilger
  1 sibling, 0 replies; 13+ messages in thread
From: Andreas Dilger @ 2010-08-23 18:05 UTC (permalink / raw)
  To: Ted Ts'o; +Cc: Eric Sandeen, linux-ext4

On 2010-08-22, at 05:42, Ted Ts'o wrote:
> On Sat, Aug 21, 2010 at 07:40:10PM -0500, Eric Sandeen wrote:
>> I'll send an xfstest but it'd be really great if could could work
>> inside the xfstests framework when devising testcases...
> 
> If you could put together an xfstests, that would be great.  I hadn't
> because Mike's been trying to remind me that I really need to delegate
> to others :-), and we do have someone at Google who can put the
> xfstest script together.  You can probably do it faster than he can,
> though.

That reminds me that I have a 90% completed version of fsx that has fallocate() calls with both KEEP_SIZE and without.  I was just lacking a suitable place to do testing for it and fix any remaining issues...  If there was someone around who could test this it would be great.

Cheers, Andreas






^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2010-08-23 18:05 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-08-19  3:01 buggy EOFBLOCKS_FL handling Theodore Ts'o
2010-08-19  3:04 ` [PATCH, RFC] ext4: Fix " Theodore Ts'o
2010-08-21 21:07   ` [PATCH -v2] " Theodore Ts'o
2010-08-19  5:13 ` buggy " Andreas Dilger
2010-08-19 14:44   ` Ted Ts'o
2010-08-19 17:03     ` Eric Sandeen
2010-08-19 17:11       ` Ted Ts'o
2010-08-19 18:33         ` Andreas Dilger
2010-08-21 20:11 ` Updated test case Ted Ts'o
2010-08-22  0:40   ` Eric Sandeen
2010-08-22 11:42     ` Ted Ts'o
2010-08-22 15:35       ` Eric Sandeen
2010-08-23 18:05       ` Andreas Dilger

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.