All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] test race when checking i_size on direct i/o read
@ 2017-08-18 20:35 Eric Sandeen
  2017-08-28 10:35 ` Eryu Guan
  2017-09-19  7:36 ` Eryu Guan
  0 siblings, 2 replies; 10+ messages in thread
From: Eric Sandeen @ 2017-08-18 20:35 UTC (permalink / raw)
  To: fstests; +Cc: Zheng Liu, Christoph Hellwig

From: Zheng Liu <wenqing.lz@taobao.com>

In this commit a new test case is added to test that i_size races don't
occur under dio reads/writes.  We add a program in /src dir, which
has a writer to issue some append dio writes.  Meanwhile it has a reader
in this test to do some dio reads.  As we expect, reader should read
nothing or data with 'a'.  But it might read some data with '0'.

The bug can be reproduced by this test case [1].

1.  http://patchwork.ozlabs.org/patch/311761/

This ostensibly tests commit:
9fe55eea7 Fix race when checking i_size on direct i/o read

Cc: Christoph Hellwig <hch@infradead.org>
Cc: Rich Johnston <rjohnston@sgi.com>
Cc: Dave Chinner <david@fromorbit.com>
Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
[sandeen: update to recent xfstests, update commitlog]
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
---

This test was originally titled:

 xfstests: add a new test case to test i_size updated properly under dio

but I think the issue is more of when and how it's tested, not how
it's updated.

Note, this passes on xfs on 4.10, but fails on 4.12.
ext4 on 4.10 passes as well but is very slow.

iomap dio maybe?  Not sure yet.

changelog v3:
 * rebase against latest xfstests/master branch
 * update commit log

changelog v2:
 * add '-lpthread' into LLDLIBS

diff --git a/configure.ac b/configure.ac
index 57092f1..4663004 100644
--- a/configure.ac
+++ b/configure.ac
@@ -59,6 +59,7 @@ AC_PACKAGE_NEED_GETXATTR_LIBATTR
 AC_PACKAGE_NEED_SYS_ACL_H
 AC_PACKAGE_NEED_ACL_LIBACL_H
 AC_PACKAGE_NEED_ACLINIT_LIBACL
+AC_PACKAGE_NEED_PTHREADMUTEXINIT
 
 AC_PACKAGE_WANT_GDBM
 AC_PACKAGE_WANT_AIO
diff --git a/include/builddefs.in b/include/builddefs.in
index cb52b99..fcc8b90 100644
--- a/include/builddefs.in
+++ b/include/builddefs.in
@@ -25,6 +25,7 @@ LIBGDBM = @libgdbm@
 LIBUUID = @libuuid@
 LIBHANDLE = @libhdl@
 LIBDM = @libdm@
+LIBPTHREAD = @libpthread@
 LIBTEST = $(TOPDIR)/lib/libtest.la
 prefix = @prefix@
 
diff --git a/src/Makefile b/src/Makefile
index b8aff49..e9419bd 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -23,7 +23,7 @@ LINUX_TARGETS = xfsctl bstat t_mtab getdevicesize preallo_rw_pattern_reader \
 	seek_copy_test t_readdir_1 t_readdir_2 fsync-tester nsexec cloner \
 	renameat2 t_getcwd e4compact test-nextquota punch-alternating \
 	attr-list-by-handle-cursor-test listxattr dio-interleaved t_dir_type \
-	dio-invalidate-cache stat_test t_encrypted_d_revalidate
+	dio-invalidate-cache stat_test t_encrypted_d_revalidate diotest
 
 SUBDIRS =
 
diff --git a/src/diotest.c b/src/diotest.c
new file mode 100644
index 0000000..7d2378f
--- /dev/null
+++ b/src/diotest.c
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2013 Alibaba Group.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/*
+ * This is a normal case that we do some append dio writes and meanwhile
+ * we do some dio reads.  Currently in vfs we don't ensure that i_size
+ * is updated properly.  Hence the reader will read some data with '0'.
+ * But we expect that the reader should read nothing or data with 'a'.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+
+#include <pthread.h>
+
+static char *prog;
+
+struct writer_data {
+	int fd;
+	size_t blksize;
+	char *buf;
+};
+
+static void usage(void)
+{
+	fprintf(stderr, "usage: %s [FILE]\n", prog);
+}
+
+static void *writer(void *arg)
+{
+	struct writer_data *data = (struct writer_data *)arg;
+	int ret;
+
+	ret = write(data->fd, data->buf, data->blksize);
+	if (ret < 0)
+		fprintf(stderr, "write file failed: %s\n", strerror(errno));
+
+	return NULL;
+}
+
+int main(int argc, char *argv[])
+{
+	pthread_t tid;
+	struct writer_data wdata;
+	size_t max_blocks = 128;		/* 128 */
+	size_t blksize = 1 * 1024 * 1024;	/* 1M */
+	char *rbuf = NULL, *wbuf = NULL;
+	int rfd = 0, wfd = 0;
+	int i, j;
+	int ret = 0;
+
+	prog = basename(argv[0]);
+
+	if (argc != 2) {
+		usage();
+		exit(1);
+	}
+
+	wfd = open(argv[1], O_CREAT|O_DIRECT|O_WRONLY|O_APPEND|O_TRUNC, S_IRWXU);
+	if (wfd < 0) {
+		fprintf(stderr, "failed to open write file: %s\n",
+			strerror(errno));
+		exit(1);
+	}
+
+	rfd = open(argv[1], O_DIRECT|O_RDONLY, S_IRWXU);
+	if (wfd < 0) {
+		fprintf(stderr, "failed to open read file: %s\n",
+			strerror(errno));
+		ret = 1;
+		goto err;
+	}
+
+	/*
+	 * We set 1024 as an alignment size for write buf.  Feel free to change
+	 * it with 4096.  But the problem is also hitted.
+	 */
+	if (posix_memalign((void **)&wbuf, 1024, blksize)) {
+		fprintf(stderr, "failed to alloc memory: %s\n", strerror(errno));
+		ret = 1;
+		goto err;
+	}
+
+	if (posix_memalign((void **)&rbuf, 4096, blksize)) {
+		fprintf(stderr, "failed to alloc memory: %s\n", strerror(errno));
+		ret = 1;
+		goto err;
+	}
+
+	memset(wbuf, 'a', blksize);
+	wdata.fd = wfd;
+	wdata.blksize = blksize;
+	wdata.buf = wbuf;
+
+	for (i = 0; i < max_blocks; i++) {
+		void *retval;
+
+		if (pthread_create(&tid, NULL, writer, &wdata)) {
+			fprintf(stderr, "create thread failed: %s\n",
+				strerror(errno));
+			ret = 1;
+			goto err;
+		}
+
+		memset(rbuf, 'b', blksize);
+		do {
+			ret = pread(rfd, rbuf, blksize, i * blksize);
+			if (ret < 0)
+				fprintf(stderr, "read file failed: %s\n",
+					strerror(errno));
+		} while (ret <= 0);
+
+		if (pthread_join(tid, &retval)) {
+			fprintf(stderr, " pthread join failed: %s\n",
+				strerror(errno));
+			ret = 1;
+			goto err;
+		}
+
+		if (ret >= 0) {
+			for (j = 0; j < ret; j ++) {
+				if (rbuf[j] != 'a') {
+					fprintf(stderr, "encounter an error: "
+						"offset %d content %c\n",
+						i, rbuf[j]);
+					ret = 1;
+					goto err;
+				}
+			}
+		}
+	}
+
+err:
+	if (rfd)
+		close(rfd);
+	if (wfd)
+		close(wfd);
+	if (rbuf)
+		free(rbuf);
+	if (wbuf)
+		free(wbuf);
+
+	return ret;
+}
diff --git a/tests/generic/450 b/tests/generic/450
new file mode 100755
index 0000000..cfb424c
--- /dev/null
+++ b/tests/generic/450
@@ -0,0 +1,56 @@
+#! /bin/bash
+# FS QA Test No. 450
+#
+# Test i_size is updated properly under dio read/write
+#
+#-----------------------------------------------------------------------
+# Copyright (c) 2013 Alibaba Group.  All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#-----------------------------------------------------------------------
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1	# failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+    cd /
+    rm -f $tmp.* $testfile
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# real QA test starts here
+_supported_fs generic
+_supported_os Linux
+
+testfile=$TEST_DIR/$seq.$$
+
+[ -x $here/src/diotest ] || _notrun "diotest not built"
+
+$here/src/diotest $testfile # > $seqres.full 2>&1 ||
+	# _fail "i_size isn't update properly!"
+
+# success, all done
+status=0
+exit
diff --git a/tests/generic/450.out b/tests/generic/450.out
new file mode 100644
index 0000000..734761a
--- /dev/null
+++ b/tests/generic/450.out
@@ -0,0 +1 @@
+QA output created by 450
diff --git a/tests/generic/group b/tests/generic/group
index b9cd0e8..a555fa0 100644
--- a/tests/generic/group
+++ b/tests/generic/group
@@ -452,3 +452,4 @@
 447 auto quick clone
 448 auto quick rw
 449 auto quick acl enospc
+450 auto rw quick


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] test race when checking i_size on direct i/o read
  2017-08-18 20:35 [PATCH] test race when checking i_size on direct i/o read Eric Sandeen
@ 2017-08-28 10:35 ` Eryu Guan
  2017-08-29 13:46   ` Nikolay Borisov
  2017-09-19  7:36 ` Eryu Guan
  1 sibling, 1 reply; 10+ messages in thread
From: Eryu Guan @ 2017-08-28 10:35 UTC (permalink / raw)
  To: Eric Sandeen; +Cc: fstests, Zheng Liu, Christoph Hellwig

On Fri, Aug 18, 2017 at 03:35:02PM -0500, Eric Sandeen wrote:
> From: Zheng Liu <wenqing.lz@taobao.com>
> 
> In this commit a new test case is added to test that i_size races don't
> occur under dio reads/writes.  We add a program in /src dir, which
> has a writer to issue some append dio writes.  Meanwhile it has a reader
> in this test to do some dio reads.  As we expect, reader should read
> nothing or data with 'a'.  But it might read some data with '0'.
> 
> The bug can be reproduced by this test case [1].
> 
> 1.  http://patchwork.ozlabs.org/patch/311761/
> 
> This ostensibly tests commit:
> 9fe55eea7 Fix race when checking i_size on direct i/o read
> 
> Cc: Christoph Hellwig <hch@infradead.org>
> Cc: Rich Johnston <rjohnston@sgi.com>
> Cc: Dave Chinner <david@fromorbit.com>
> Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
> [sandeen: update to recent xfstests, update commitlog]
> Signed-off-by: Eric Sandeen <sandeen@redhat.com>
> ---
> 
> This test was originally titled:
> 
>  xfstests: add a new test case to test i_size updated properly under dio
> 
> but I think the issue is more of when and how it's tested, not how
> it's updated.
> 
> Note, this passes on xfs on 4.10, but fails on 4.12.
> ext4 on 4.10 passes as well but is very slow.
> 
> iomap dio maybe?  Not sure yet.

It failed for me on XFS and btrfs, while ext4 passed the test; the kernel
is 4.13-rc5. I also found that the alignment of the write buffer matters
for reproducing the btrfs failure, see below.

> 
> changelog v3:
>  * rebase against latest xfstests/master branch
>  * update commit log

Thanks for picking up the test again! And sorry that I got to it late..

> 
> changelog v2:
>  * add '-lpthread' into LLDLIBS
> 
> diff --git a/configure.ac b/configure.ac
> index 57092f1..4663004 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -59,6 +59,7 @@ AC_PACKAGE_NEED_GETXATTR_LIBATTR
>  AC_PACKAGE_NEED_SYS_ACL_H
>  AC_PACKAGE_NEED_ACL_LIBACL_H
>  AC_PACKAGE_NEED_ACLINIT_LIBACL
> +AC_PACKAGE_NEED_PTHREADMUTEXINIT
>  
>  AC_PACKAGE_WANT_GDBM
>  AC_PACKAGE_WANT_AIO
> diff --git a/include/builddefs.in b/include/builddefs.in
> index cb52b99..fcc8b90 100644
> --- a/include/builddefs.in
> +++ b/include/builddefs.in
> @@ -25,6 +25,7 @@ LIBGDBM = @libgdbm@
>  LIBUUID = @libuuid@
>  LIBHANDLE = @libhdl@
>  LIBDM = @libdm@
> +LIBPTHREAD = @libpthread@
>  LIBTEST = $(TOPDIR)/lib/libtest.la
>  prefix = @prefix@
>  
> diff --git a/src/Makefile b/src/Makefile
> index b8aff49..e9419bd 100644
> --- a/src/Makefile
> +++ b/src/Makefile
> @@ -23,7 +23,7 @@ LINUX_TARGETS = xfsctl bstat t_mtab getdevicesize preallo_rw_pattern_reader \
>  	seek_copy_test t_readdir_1 t_readdir_2 fsync-tester nsexec cloner \
>  	renameat2 t_getcwd e4compact test-nextquota punch-alternating \
>  	attr-list-by-handle-cursor-test listxattr dio-interleaved t_dir_type \
> -	dio-invalidate-cache stat_test t_encrypted_d_revalidate
> +	dio-invalidate-cache stat_test t_encrypted_d_revalidate diotest

I think a more specific name would be better than just "diotest" :)

>  
>  SUBDIRS =
>  
> diff --git a/src/diotest.c b/src/diotest.c
> new file mode 100644
> index 0000000..7d2378f
> --- /dev/null
> +++ b/src/diotest.c
> @@ -0,0 +1,166 @@
> +/*
> + * Copyright (c) 2013 Alibaba Group.
> + * All Rights Reserved.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it would be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write the Free Software Foundation,
> + * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
> + */
> +
> +/*
> + * This is a normal case that we do some append dio writes and meanwhile
> + * we do some dio reads.  Currently in vfs we don't ensure that i_size
> + * is updated properly.  Hence the reader will read some data with '0'.
> + * But we expect that the reader should read nothing or data with 'a'.
> + */
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +
> +#include <unistd.h>
> +#include <sys/types.h>
> +#include <sys/stat.h>
> +#include <fcntl.h>
> +#include <errno.h>
> +
> +#include <pthread.h>
> +
> +static char *prog;
> +
> +struct writer_data {
> +	int fd;
> +	size_t blksize;
> +	char *buf;
> +};
> +
> +static void usage(void)
> +{
> +	fprintf(stderr, "usage: %s [FILE]\n", prog);
> +}
> +
> +static void *writer(void *arg)
> +{
> +	struct writer_data *data = (struct writer_data *)arg;
> +	int ret;
> +
> +	ret = write(data->fd, data->buf, data->blksize);
> +	if (ret < 0)
> +		fprintf(stderr, "write file failed: %s\n", strerror(errno));
> +
> +	return NULL;
> +}
> +
> +int main(int argc, char *argv[])
> +{
> +	pthread_t tid;
> +	struct writer_data wdata;
> +	size_t max_blocks = 128;		/* 128 */
> +	size_t blksize = 1 * 1024 * 1024;	/* 1M */
> +	char *rbuf = NULL, *wbuf = NULL;
> +	int rfd = 0, wfd = 0;
> +	int i, j;
> +	int ret = 0;
> +
> +	prog = basename(argv[0]);
> +
> +	if (argc != 2) {
> +		usage();
> +		exit(1);
> +	}
> +
> +	wfd = open(argv[1], O_CREAT|O_DIRECT|O_WRONLY|O_APPEND|O_TRUNC, S_IRWXU);
> +	if (wfd < 0) {
> +		fprintf(stderr, "failed to open write file: %s\n",
> +			strerror(errno));
> +		exit(1);
> +	}
> +
> +	rfd = open(argv[1], O_DIRECT|O_RDONLY, S_IRWXU);
> +	if (wfd < 0) {
> +		fprintf(stderr, "failed to open read file: %s\n",
> +			strerror(errno));
> +		ret = 1;
> +		goto err;
> +	}
> +
> +	/*
> +	 * We set 1024 as an alignment size for write buf.  Feel free to change
> +	 * it with 4096.  But the problem is also hitted.
> +	 */
> +	if (posix_memalign((void **)&wbuf, 1024, blksize)) {

I suspect that a hardcoded 1024 alignment won't work on a 4k sector
device, but, as I mentioned above, changing it to 4096 made the test
pass on btrfs, even though the comment says it doesn't matter.

How about passing the alignment requirement as an argument from the
shell script? This can be looked up with _min_dio_alignment.

> +		fprintf(stderr, "failed to alloc memory: %s\n", strerror(errno));
> +		ret = 1;
> +		goto err;
> +	}
> +
> +	if (posix_memalign((void **)&rbuf, 4096, blksize)) {
> +		fprintf(stderr, "failed to alloc memory: %s\n", strerror(errno));
> +		ret = 1;
> +		goto err;
> +	}
> +
> +	memset(wbuf, 'a', blksize);
> +	wdata.fd = wfd;
> +	wdata.blksize = blksize;
> +	wdata.buf = wbuf;
> +
> +	for (i = 0; i < max_blocks; i++) {
> +		void *retval;
> +
> +		if (pthread_create(&tid, NULL, writer, &wdata)) {
> +			fprintf(stderr, "create thread failed: %s\n",
> +				strerror(errno));
> +			ret = 1;
> +			goto err;
> +		}
> +
> +		memset(rbuf, 'b', blksize);
> +		do {
> +			ret = pread(rfd, rbuf, blksize, i * blksize);
> +			if (ret < 0)
> +				fprintf(stderr, "read file failed: %s\n",
> +					strerror(errno));
> +		} while (ret <= 0);
> +
> +		if (pthread_join(tid, &retval)) {
> +			fprintf(stderr, " pthread join failed: %s\n",
> +				strerror(errno));
> +			ret = 1;
> +			goto err;
> +		}
> +
> +		if (ret >= 0) {
> +			for (j = 0; j < ret; j ++) {
> +				if (rbuf[j] != 'a') {
> +					fprintf(stderr, "encounter an error: "
> +						"offset %d content %c\n",
> +						i, rbuf[j]);

This prints a binary zero on failure, and makes diff harder to view.

Binary files tests/generic/452.out and /root/workspace/xfstests/results//xfs_4k/generic/452.out.bad differ

I changed it a bit:
-                                               "offset %d content %c\n",
-                                               i, rbuf[j]);
+                                               "block %d offset %d, content 0x%x\n",
+                                               i, j, rbuf[j]);

and the result looked fine to me:
+encounter an error: block 15 offset 0, content 0x0

> +					ret = 1;
> +					goto err;
> +				}
> +			}
> +		}
> +	}
> +
> +err:
> +	if (rfd)
> +		close(rfd);
> +	if (wfd)
> +		close(wfd);
> +	if (rbuf)
> +		free(rbuf);
> +	if (wbuf)
> +		free(wbuf);
> +
> +	return ret;
> +}
> diff --git a/tests/generic/450 b/tests/generic/450
> new file mode 100755
> index 0000000..cfb424c
> --- /dev/null
> +++ b/tests/generic/450
> @@ -0,0 +1,56 @@
> +#! /bin/bash
> +# FS QA Test No. 450
> +#
> +# Test i_size is updated properly under dio read/write
> +#
> +#-----------------------------------------------------------------------
> +# Copyright (c) 2013 Alibaba Group.  All Rights Reserved.
> +#
> +# This program is free software; you can redistribute it and/or
> +# modify it under the terms of the GNU General Public License as
> +# published by the Free Software Foundation.
> +#
> +# This program is distributed in the hope that it would be useful,
> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +# GNU General Public License for more details.
> +#
> +# You should have received a copy of the GNU General Public License
> +# along with this program; if not, write the Free Software Foundation,
> +# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
> +#-----------------------------------------------------------------------
> +#
> +
> +seq=`basename $0`
> +seqres=$RESULT_DIR/$seq
> +echo "QA output created by $seq"
> +
> +here=`pwd`
> +tmp=/tmp/$$
> +status=1	# failure is the default!
> +trap "_cleanup; exit \$status" 0 1 2 3 15
> +
> +_cleanup()
> +{
> +    cd /
> +    rm -f $tmp.* $testfile
> +}
> +
> +# get standard environment, filters and checks
> +. ./common/rc
> +. ./common/filter
> +
> +# real QA test starts here
> +_supported_fs generic
> +_supported_os Linux
> +
> +testfile=$TEST_DIR/$seq.$$
> +
> +[ -x $here/src/diotest ] || _notrun "diotest not built"

_require_test_program "diotest" or a new name :)

And need a _require_odirect too.

> +
> +$here/src/diotest $testfile # > $seqres.full 2>&1 ||
> +	# _fail "i_size isn't update properly!"

All the comments can be removed :)

Thanks,
Eryu

> +
> +# success, all done
> +status=0
> +exit
> diff --git a/tests/generic/450.out b/tests/generic/450.out
> new file mode 100644
> index 0000000..734761a
> --- /dev/null
> +++ b/tests/generic/450.out
> @@ -0,0 +1 @@
> +QA output created by 450
> diff --git a/tests/generic/group b/tests/generic/group
> index b9cd0e8..a555fa0 100644
> --- a/tests/generic/group
> +++ b/tests/generic/group
> @@ -452,3 +452,4 @@
>  447 auto quick clone
>  448 auto quick rw
>  449 auto quick acl enospc
> +450 auto rw quick
> 
> --
> To unsubscribe from this list: send the line "unsubscribe fstests" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] test race when checking i_size on direct i/o read
  2017-08-28 10:35 ` Eryu Guan
@ 2017-08-29 13:46   ` Nikolay Borisov
  0 siblings, 0 replies; 10+ messages in thread
From: Nikolay Borisov @ 2017-08-29 13:46 UTC (permalink / raw)
  To: Eryu Guan, Eric Sandeen; +Cc: fstests, Zheng Liu, Christoph Hellwig



On 28.08.2017 13:35, Eryu Guan wrote:
> On Fri, Aug 18, 2017 at 03:35:02PM -0500, Eric Sandeen wrote:
>> From: Zheng Liu <wenqing.lz@taobao.com>
>>
>> In this commit a new test case is added to test that i_size races don't
>> occur under dio reads/writes.  We add a program in /src dir, which
>> has a writer to issue some append dio writes.  Meanwhile it has a reader
>> in this test to do some dio reads.  As we expect, reader should read
>> nothing or data with 'a'.  But it might read some data with '0'.
>>
>> The bug can be reproduced by this test case [1].
>>
>> 1.  http://patchwork.ozlabs.org/patch/311761/
>>
>> This ostensibly tests commit:
>> 9fe55eea7 Fix race when checking i_size on direct i/o read
>>
>> Cc: Christoph Hellwig <hch@infradead.org>
>> Cc: Rich Johnston <rjohnston@sgi.com>
>> Cc: Dave Chinner <david@fromorbit.com>
>> Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
>> [sandeen: update to recent xfstests, update commitlog]
>> Signed-off-by: Eric Sandeen <sandeen@redhat.com>
>> ---
>>
>> This test was originally titled:
>>
>>  xfstests: add a new test case to test i_size updated properly under dio
>>
>> but I think the issue is more of when and how it's tested, not how
>> it's updated.
>>
>> Note, this passes on xfs on 4.10, but fails on 4.12.
>> ext4 on 4.10 passes as well but is very slow.
>>
>> iomap dio maybe?  Not sure yet.
> 
> It failed for me with XFS and btrfs, ext4 passed test, kernel is
> 4.13-rc5. But I found that the alignment for write buffer matters too
> for reproducing the btrfs failure, see below.

In btrfs, if the buffers are not aligned to pagesize, then we fall back
to buffered write, so that's likely why.
> 
>>
>> changelog v3:
>>  * rebase against latest xfstests/master branch
>>  * update commit log
> 
> Thanks for picking up the test again! And sorry that I got to it late..
> 
>>
>> changelog v2:
>>  * add '-lpthread' into LLDLIBS
>>
>> diff --git a/configure.ac b/configure.ac
>> index 57092f1..4663004 100644
>> --- a/configure.ac
>> +++ b/configure.ac
>> @@ -59,6 +59,7 @@ AC_PACKAGE_NEED_GETXATTR_LIBATTR
>>  AC_PACKAGE_NEED_SYS_ACL_H
>>  AC_PACKAGE_NEED_ACL_LIBACL_H
>>  AC_PACKAGE_NEED_ACLINIT_LIBACL
>> +AC_PACKAGE_NEED_PTHREADMUTEXINIT
>>  
>>  AC_PACKAGE_WANT_GDBM
>>  AC_PACKAGE_WANT_AIO
>> diff --git a/include/builddefs.in b/include/builddefs.in
>> index cb52b99..fcc8b90 100644
>> --- a/include/builddefs.in
>> +++ b/include/builddefs.in
>> @@ -25,6 +25,7 @@ LIBGDBM = @libgdbm@
>>  LIBUUID = @libuuid@
>>  LIBHANDLE = @libhdl@
>>  LIBDM = @libdm@
>> +LIBPTHREAD = @libpthread@
>>  LIBTEST = $(TOPDIR)/lib/libtest.la
>>  prefix = @prefix@
>>  
>> diff --git a/src/Makefile b/src/Makefile
>> index b8aff49..e9419bd 100644
>> --- a/src/Makefile
>> +++ b/src/Makefile
>> @@ -23,7 +23,7 @@ LINUX_TARGETS = xfsctl bstat t_mtab getdevicesize preallo_rw_pattern_reader \
>>  	seek_copy_test t_readdir_1 t_readdir_2 fsync-tester nsexec cloner \
>>  	renameat2 t_getcwd e4compact test-nextquota punch-alternating \
>>  	attr-list-by-handle-cursor-test listxattr dio-interleaved t_dir_type \
>> -	dio-invalidate-cache stat_test t_encrypted_d_revalidate
>> +	dio-invalidate-cache stat_test t_encrypted_d_revalidate diotest
> 
> I think a more specific name would be better than just "diotest" :)
> 
>>  
>>  SUBDIRS =
>>  
>> diff --git a/src/diotest.c b/src/diotest.c
>> new file mode 100644
>> index 0000000..7d2378f
>> --- /dev/null
>> +++ b/src/diotest.c
>> @@ -0,0 +1,166 @@
>> +/*
>> + * Copyright (c) 2013 Alibaba Group.
>> + * All Rights Reserved.
>> + *
>> + * This program is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU General Public License as
>> + * published by the Free Software Foundation.
>> + *
>> + * This program is distributed in the hope that it would be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> + * GNU General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License
>> + * along with this program; if not, write the Free Software Foundation,
>> + * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
>> + */
>> +
>> +/*
>> + * This is a normal case that we do some append dio writes and meanwhile
>> + * we do some dio reads.  Currently in vfs we don't ensure that i_size
>> + * is updated properly.  Hence the reader will read some data with '0'.
>> + * But we expect that the reader should read nothing or data with 'a'.
>> + */
>> +
>> +#include <stdio.h>
>> +#include <stdlib.h>
>> +#include <string.h>
>> +
>> +#include <unistd.h>
>> +#include <sys/types.h>
>> +#include <sys/stat.h>
>> +#include <fcntl.h>
>> +#include <errno.h>
>> +
>> +#include <pthread.h>
>> +
>> +static char *prog;
>> +
>> +struct writer_data {
>> +	int fd;
>> +	size_t blksize;
>> +	char *buf;
>> +};
>> +
>> +static void usage(void)
>> +{
>> +	fprintf(stderr, "usage: %s [FILE]\n", prog);
>> +}
>> +
>> +static void *writer(void *arg)
>> +{
>> +	struct writer_data *data = (struct writer_data *)arg;
>> +	int ret;
>> +
>> +	ret = write(data->fd, data->buf, data->blksize);
>> +	if (ret < 0)
>> +		fprintf(stderr, "write file failed: %s\n", strerror(errno));
>> +
>> +	return NULL;
>> +}
>> +
>> +int main(int argc, char *argv[])
>> +{
>> +	pthread_t tid;
>> +	struct writer_data wdata;
>> +	size_t max_blocks = 128;		/* 128 */
>> +	size_t blksize = 1 * 1024 * 1024;	/* 1M */
>> +	char *rbuf = NULL, *wbuf = NULL;
>> +	int rfd = 0, wfd = 0;
>> +	int i, j;
>> +	int ret = 0;
>> +
>> +	prog = basename(argv[0]);
>> +
>> +	if (argc != 2) {
>> +		usage();
>> +		exit(1);
>> +	}
>> +
>> +	wfd = open(argv[1], O_CREAT|O_DIRECT|O_WRONLY|O_APPEND|O_TRUNC, S_IRWXU);
>> +	if (wfd < 0) {
>> +		fprintf(stderr, "failed to open write file: %s\n",
>> +			strerror(errno));
>> +		exit(1);
>> +	}
>> +
>> +	rfd = open(argv[1], O_DIRECT|O_RDONLY, S_IRWXU);
>> +	if (wfd < 0) {
>> +		fprintf(stderr, "failed to open read file: %s\n",
>> +			strerror(errno));
>> +		ret = 1;
>> +		goto err;
>> +	}
>> +
>> +	/*
>> +	 * We set 1024 as an alignment size for write buf.  Feel free to change
>> +	 * it with 4096.  But the problem is also hitted.
>> +	 */
>> +	if (posix_memalign((void **)&wbuf, 1024, blksize)) {
> 
> I suspect that a hardcoded 1024 alignment won't work for 4k sector
> device, but, as I mentioned above, changing it to 4096 made test on
> btrfs pass, even the comment said it didn't matter.
> 
> How about passing the alignment requirement as an argument from the
> shell script? This can be looked up with _min_dio_alignment.
> 
>> +		fprintf(stderr, "failed to alloc memory: %s\n", strerror(errno));
>> +		ret = 1;
>> +		goto err;
>> +	}
>> +
>> +	if (posix_memalign((void **)&rbuf, 4096, blksize)) {
>> +		fprintf(stderr, "failed to alloc memory: %s\n", strerror(errno));
>> +		ret = 1;
>> +		goto err;
>> +	}
>> +
>> +	memset(wbuf, 'a', blksize);
>> +	wdata.fd = wfd;
>> +	wdata.blksize = blksize;
>> +	wdata.buf = wbuf;
>> +
>> +	for (i = 0; i < max_blocks; i++) {
>> +		void *retval;
>> +
>> +		if (pthread_create(&tid, NULL, writer, &wdata)) {
>> +			fprintf(stderr, "create thread failed: %s\n",
>> +				strerror(errno));
>> +			ret = 1;
>> +			goto err;
>> +		}
>> +
>> +		memset(rbuf, 'b', blksize);
>> +		do {
>> +			ret = pread(rfd, rbuf, blksize, i * blksize);
>> +			if (ret < 0)
>> +				fprintf(stderr, "read file failed: %s\n",
>> +					strerror(errno));
>> +		} while (ret <= 0);
>> +
>> +		if (pthread_join(tid, &retval)) {
>> +			fprintf(stderr, " pthread join failed: %s\n",
>> +				strerror(errno));
>> +			ret = 1;
>> +			goto err;
>> +		}
>> +
>> +		if (ret >= 0) {
>> +			for (j = 0; j < ret; j ++) {
>> +				if (rbuf[j] != 'a') {
>> +					fprintf(stderr, "encounter an error: "
>> +						"offset %d content %c\n",
>> +						i, rbuf[j]);
> 
> This prints a binary zero on failure, and makes diff harder to view.
> 
> Binary files tests/generic/452.out and /root/workspace/xfstests/results//xfs_4k/generic/452.out.bad differ
> 
> I changed it a bit:
> -                                               "offset %d content %c\n",
> -                                               i, rbuf[j]);
> +                                               "block %d offset %d, content 0x%x\n",
> +                                               i, j, rbuf[j]);
> 
> and the result looked fine to me:
> +encounter an error: block 15 offset 0, content 0x0
> 
>> +					ret = 1;
>> +					goto err;
>> +				}
>> +			}
>> +		}
>> +	}
>> +
>> +err:
>> +	if (rfd)
>> +		close(rfd);
>> +	if (wfd)
>> +		close(wfd);
>> +	if (rbuf)
>> +		free(rbuf);
>> +	if (wbuf)
>> +		free(wbuf);
>> +
>> +	return ret;
>> +}
>> diff --git a/tests/generic/450 b/tests/generic/450
>> new file mode 100755
>> index 0000000..cfb424c
>> --- /dev/null
>> +++ b/tests/generic/450
>> @@ -0,0 +1,56 @@
>> +#! /bin/bash
>> +# FS QA Test No. 450
>> +#
>> +# Test i_size is updated properly under dio read/write
>> +#
>> +#-----------------------------------------------------------------------
>> +# Copyright (c) 2013 Alibaba Group.  All Rights Reserved.
>> +#
>> +# This program is free software; you can redistribute it and/or
>> +# modify it under the terms of the GNU General Public License as
>> +# published by the Free Software Foundation.
>> +#
>> +# This program is distributed in the hope that it would be useful,
>> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> +# GNU General Public License for more details.
>> +#
>> +# You should have received a copy of the GNU General Public License
>> +# along with this program; if not, write the Free Software Foundation,
>> +# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
>> +#-----------------------------------------------------------------------
>> +#
>> +
>> +seq=`basename $0`
>> +seqres=$RESULT_DIR/$seq
>> +echo "QA output created by $seq"
>> +
>> +here=`pwd`
>> +tmp=/tmp/$$
>> +status=1	# failure is the default!
>> +trap "_cleanup; exit \$status" 0 1 2 3 15
>> +
>> +_cleanup()
>> +{
>> +    cd /
>> +    rm -f $tmp.* $testfile
>> +}
>> +
>> +# get standard environment, filters and checks
>> +. ./common/rc
>> +. ./common/filter
>> +
>> +# real QA test starts here
>> +_supported_fs generic
>> +_supported_os Linux
>> +
>> +testfile=$TEST_DIR/$seq.$$
>> +
>> +[ -x $here/src/diotest ] || _notrun "diotest not built"
> 
> _require_test_program "diotest" or a new name :)
> 
> And need a _require_odirect too.
> 
>> +
>> +$here/src/diotest $testfile # > $seqres.full 2>&1 ||
>> +	# _fail "i_size isn't update properly!"
> 
> All the comments can be removed :)
> 
> Thanks,
> Eryu
> 
>> +
>> +# success, all done
>> +status=0
>> +exit
>> diff --git a/tests/generic/450.out b/tests/generic/450.out
>> new file mode 100644
>> index 0000000..734761a
>> --- /dev/null
>> +++ b/tests/generic/450.out
>> @@ -0,0 +1 @@
>> +QA output created by 450
>> diff --git a/tests/generic/group b/tests/generic/group
>> index b9cd0e8..a555fa0 100644
>> --- a/tests/generic/group
>> +++ b/tests/generic/group
>> @@ -452,3 +452,4 @@
>>  447 auto quick clone
>>  448 auto quick rw
>>  449 auto quick acl enospc
>> +450 auto rw quick
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe fstests" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe fstests" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] test race when checking i_size on direct i/o read
  2017-08-18 20:35 [PATCH] test race when checking i_size on direct i/o read Eric Sandeen
  2017-08-28 10:35 ` Eryu Guan
@ 2017-09-19  7:36 ` Eryu Guan
  2017-09-19 14:13   ` Brian Foster
  1 sibling, 1 reply; 10+ messages in thread
From: Eryu Guan @ 2017-09-19  7:36 UTC (permalink / raw)
  To: Eric Sandeen; +Cc: fstests, Zheng Liu, Christoph Hellwig, linux-xfs

On Fri, Aug 18, 2017 at 03:35:02PM -0500, Eric Sandeen wrote:
> From: Zheng Liu <wenqing.lz@taobao.com>
> 
> In this commit a new test case is added to test that i_size races don't
> occur under dio reads/writes.  We add a program in /src dir, which
> has a writer to issue some append dio writes.  Meanwhile it has a reader
> in this test to do some dio reads.  As we expect, reader should read
> nothing or data with 'a'.  But it might read some data with '0'.
> 
> The bug can be reproduced by this test case [1].
> 
> 1.  http://patchwork.ozlabs.org/patch/311761/
> 
> This ostensibly tests commit:
> 9fe55eea7 Fix race when checking i_size on direct i/o read
> 
> Cc: Christoph Hellwig <hch@infradead.org>
> Cc: Rich Johnston <rjohnston@sgi.com>
> Cc: Dave Chinner <david@fromorbit.com>
> Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
> [sandeen: update to recent xfstests, update commitlog]
> Signed-off-by: Eric Sandeen <sandeen@redhat.com>
> ---
> 
> This test was originally titled:
> 
>  xfstests: add a new test case to test i_size updated properly under dio
> 
> but I think the issue is more of when and how it's tested, not how
> it's updated.
> 
> Note, this passes on xfs on 4.10, but fails on 4.12.
> ext4 on 4.10 passes as well but is very slow.
> 
> iomap dio maybe?  Not sure yet.

My test with the 4.10 kernel suggested that the test still failed there. I
dug into this test a bit and found that it was commit d531d91d6990
("xfs: always use unwritten extents for direct I/O writes") that introduced
this failure; that commit is in the v3.14 kernel.

This is because we start allocating unwritten extents for direct writes
that can extend i_size, but in xfs_dio_write_end_io() we update in-core
i_size before converting unwritten extents to real allocations. So a
racing direct read could find the not-yet converted unwritten extents
and read zeros instead of actual data.

But I'm not sure what's the best way to fix it. I think taking exclusive
iolock instead of shared lock for direct writes that can extend i_size
could fix the non-aio dio write case, but aio-dio write still fails,
because in the aio-dio write case we defer end_io to a workqueue, which
doesn't take any iolock at all..

ext4 has no such problem because ext4 converts unwritten extents before
updating i_size, and ext4 doesn't support appending aio dio writes.

(Keep the rest of patch untrimmed for reference, as I added linux-xfs to
cc list.)

Thanks,
Eryu

> 
> changelog v3:
>  * rebase against latest xfstests/master branch
>  * update commit log
> 
> changelog v2:
>  * add '-lpthread' into LLDLIBS
> 
> diff --git a/configure.ac b/configure.ac
> index 57092f1..4663004 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -59,6 +59,7 @@ AC_PACKAGE_NEED_GETXATTR_LIBATTR
>  AC_PACKAGE_NEED_SYS_ACL_H
>  AC_PACKAGE_NEED_ACL_LIBACL_H
>  AC_PACKAGE_NEED_ACLINIT_LIBACL
> +AC_PACKAGE_NEED_PTHREADMUTEXINIT
>  
>  AC_PACKAGE_WANT_GDBM
>  AC_PACKAGE_WANT_AIO
> diff --git a/include/builddefs.in b/include/builddefs.in
> index cb52b99..fcc8b90 100644
> --- a/include/builddefs.in
> +++ b/include/builddefs.in
> @@ -25,6 +25,7 @@ LIBGDBM = @libgdbm@
>  LIBUUID = @libuuid@
>  LIBHANDLE = @libhdl@
>  LIBDM = @libdm@
> +LIBPTHREAD = @libpthread@
>  LIBTEST = $(TOPDIR)/lib/libtest.la
>  prefix = @prefix@
>  
> diff --git a/src/Makefile b/src/Makefile
> index b8aff49..e9419bd 100644
> --- a/src/Makefile
> +++ b/src/Makefile
> @@ -23,7 +23,7 @@ LINUX_TARGETS = xfsctl bstat t_mtab getdevicesize preallo_rw_pattern_reader \
>  	seek_copy_test t_readdir_1 t_readdir_2 fsync-tester nsexec cloner \
>  	renameat2 t_getcwd e4compact test-nextquota punch-alternating \
>  	attr-list-by-handle-cursor-test listxattr dio-interleaved t_dir_type \
> -	dio-invalidate-cache stat_test t_encrypted_d_revalidate
> +	dio-invalidate-cache stat_test t_encrypted_d_revalidate diotest
>  
>  SUBDIRS =
>  
> diff --git a/src/diotest.c b/src/diotest.c
> new file mode 100644
> index 0000000..7d2378f
> --- /dev/null
> +++ b/src/diotest.c
> @@ -0,0 +1,166 @@
> +/*
> + * Copyright (c) 2013 Alibaba Group.
> + * All Rights Reserved.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it would be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write the Free Software Foundation,
> + * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
> + */
> +
> +/*
> + * This is a normal case that we do some append dio writes and meanwhile
> + * we do some dio reads.  Currently in vfs we don't ensure that i_size
> + * is updated properly.  Hence the reader will read some data with '0'.
> + * But we expect that the reader should read nothing or data with 'a'.
> + */
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +
> +#include <unistd.h>
> +#include <sys/types.h>
> +#include <sys/stat.h>
> +#include <fcntl.h>
> +#include <errno.h>
> +
> +#include <pthread.h>
> +
> +static char *prog;
> +
> +struct writer_data {
> +	int fd;
> +	size_t blksize;
> +	char *buf;
> +};
> +
> +static void usage(void)
> +{
> +	fprintf(stderr, "usage: %s [FILE]\n", prog);
> +}
> +
> +static void *writer(void *arg)
> +{
> +	struct writer_data *data = (struct writer_data *)arg;
> +	int ret;
> +
> +	ret = write(data->fd, data->buf, data->blksize);
> +	if (ret < 0)
> +		fprintf(stderr, "write file failed: %s\n", strerror(errno));
> +
> +	return NULL;
> +}
> +
> +int main(int argc, char *argv[])
> +{
> +	pthread_t tid;
> +	struct writer_data wdata;
> +	size_t max_blocks = 128;		/* 128 */
> +	size_t blksize = 1 * 1024 * 1024;	/* 1M */
> +	char *rbuf = NULL, *wbuf = NULL;
> +	int rfd = 0, wfd = 0;
> +	int i, j;
> +	int ret = 0;
> +
> +	prog = basename(argv[0]);
> +
> +	if (argc != 2) {
> +		usage();
> +		exit(1);
> +	}
> +
> +	wfd = open(argv[1], O_CREAT|O_DIRECT|O_WRONLY|O_APPEND|O_TRUNC, S_IRWXU);
> +	if (wfd < 0) {
> +		fprintf(stderr, "failed to open write file: %s\n",
> +			strerror(errno));
> +		exit(1);
> +	}
> +
> +	rfd = open(argv[1], O_DIRECT|O_RDONLY, S_IRWXU);
> +	if (wfd < 0) {
> +		fprintf(stderr, "failed to open read file: %s\n",
> +			strerror(errno));
> +		ret = 1;
> +		goto err;
> +	}
> +
> +	/*
> +	 * We set 1024 as an alignment size for write buf.  Feel free to change
> +	 * it with 4096.  But the problem is also hitted.
> +	 */
> +	if (posix_memalign((void **)&wbuf, 1024, blksize)) {
> +		fprintf(stderr, "failed to alloc memory: %s\n", strerror(errno));
> +		ret = 1;
> +		goto err;
> +	}
> +
> +	if (posix_memalign((void **)&rbuf, 4096, blksize)) {
> +		fprintf(stderr, "failed to alloc memory: %s\n", strerror(errno));
> +		ret = 1;
> +		goto err;
> +	}
> +
> +	memset(wbuf, 'a', blksize);
> +	wdata.fd = wfd;
> +	wdata.blksize = blksize;
> +	wdata.buf = wbuf;
> +
> +	for (i = 0; i < max_blocks; i++) {
> +		void *retval;
> +
> +		if (pthread_create(&tid, NULL, writer, &wdata)) {
> +			fprintf(stderr, "create thread failed: %s\n",
> +				strerror(errno));
> +			ret = 1;
> +			goto err;
> +		}
> +
> +		memset(rbuf, 'b', blksize);
> +		do {
> +			ret = pread(rfd, rbuf, blksize, i * blksize);
> +			if (ret < 0)
> +				fprintf(stderr, "read file failed: %s\n",
> +					strerror(errno));
> +		} while (ret <= 0);
> +
> +		if (pthread_join(tid, &retval)) {
> +			fprintf(stderr, " pthread join failed: %s\n",
> +				strerror(errno));
> +			ret = 1;
> +			goto err;
> +		}
> +
> +		if (ret >= 0) {
> +			for (j = 0; j < ret; j ++) {
> +				if (rbuf[j] != 'a') {
> +					fprintf(stderr, "encounter an error: "
> +						"offset %d content %c\n",
> +						i, rbuf[j]);
> +					ret = 1;
> +					goto err;
> +				}
> +			}
> +		}
> +	}
> +
> +err:
> +	if (rfd)
> +		close(rfd);
> +	if (wfd)
> +		close(wfd);
> +	if (rbuf)
> +		free(rbuf);
> +	if (wbuf)
> +		free(wbuf);
> +
> +	return ret;
> +}
> diff --git a/tests/generic/450 b/tests/generic/450
> new file mode 100755
> index 0000000..cfb424c
> --- /dev/null
> +++ b/tests/generic/450
> @@ -0,0 +1,56 @@
> +#! /bin/bash
> +# FS QA Test No. 450
> +#
> +# Test i_size is updated properly under dio read/write
> +#
> +#-----------------------------------------------------------------------
> +# Copyright (c) 2013 Alibaba Group.  All Rights Reserved.
> +#
> +# This program is free software; you can redistribute it and/or
> +# modify it under the terms of the GNU General Public License as
> +# published by the Free Software Foundation.
> +#
> +# This program is distributed in the hope that it would be useful,
> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +# GNU General Public License for more details.
> +#
> +# You should have received a copy of the GNU General Public License
> +# along with this program; if not, write the Free Software Foundation,
> +# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
> +#-----------------------------------------------------------------------
> +#
> +
> +seq=`basename $0`
> +seqres=$RESULT_DIR/$seq
> +echo "QA output created by $seq"
> +
> +here=`pwd`
> +tmp=/tmp/$$
> +status=1	# failure is the default!
> +trap "_cleanup; exit \$status" 0 1 2 3 15
> +
> +_cleanup()
> +{
> +    cd /
> +    rm -f $tmp.* $testfile
> +}
> +
> +# get standard environment, filters and checks
> +. ./common/rc
> +. ./common/filter
> +
> +# real QA test starts here
> +_supported_fs generic
> +_supported_os Linux
> +
> +testfile=$TEST_DIR/$seq.$$
> +
> +[ -x $here/src/diotest ] || _notrun "diotest not built"
> +
> +$here/src/diotest $testfile # > $seqres.full 2>&1 ||
> +	# _fail "i_size isn't update properly!"
> +
> +# success, all done
> +status=0
> +exit
> diff --git a/tests/generic/450.out b/tests/generic/450.out
> new file mode 100644
> index 0000000..734761a
> --- /dev/null
> +++ b/tests/generic/450.out
> @@ -0,0 +1 @@
> +QA output created by 450
> diff --git a/tests/generic/group b/tests/generic/group
> index b9cd0e8..a555fa0 100644
> --- a/tests/generic/group
> +++ b/tests/generic/group
> @@ -452,3 +452,4 @@
>  447 auto quick clone
>  448 auto quick rw
>  449 auto quick acl enospc
> +450 auto rw quick
> 
> --
> To unsubscribe from this list: send the line "unsubscribe fstests" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] test race when checking i_size on direct i/o read
  2017-09-19  7:36 ` Eryu Guan
@ 2017-09-19 14:13   ` Brian Foster
  2017-09-19 14:34     ` Christoph Hellwig
  0 siblings, 1 reply; 10+ messages in thread
From: Brian Foster @ 2017-09-19 14:13 UTC (permalink / raw)
  To: Eryu Guan; +Cc: Eric Sandeen, fstests, Zheng Liu, Christoph Hellwig, linux-xfs

On Tue, Sep 19, 2017 at 03:36:16PM +0800, Eryu Guan wrote:
> On Fri, Aug 18, 2017 at 03:35:02PM -0500, Eric Sandeen wrote:
> > From: Zheng Liu <wenqing.lz@taobao.com>
> > 
> > In this commit a new test case is added to test that i_size races don't
> > occur under dio reads/writes.  We add a program in /src dir, which
> > has a writer to issue some append dio writes.  Meanwhile it has a reader
> > in this test to do some dio reads.  As we expect, reader should read
> > nothing or data with 'a'.  But it might read some data with '0'.
> > 
> > The bug can be reproduced by this test case [1].
> > 
> > 1.  http://patchwork.ozlabs.org/patch/311761/
> > 
> > This ostensibly tests commit:
> > 9fe55eea7 Fix race when checking i_size on direct i/o read
> > 
> > Cc: Christoph Hellwig <hch@infradead.org>
> > Cc: Rich Johnston <rjohnston@sgi.com>
> > Cc: Dave Chinner <david@fromorbit.com>
> > Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
> > [sandeen: update to recent xfstests, update commitlog]
> > Signed-off-by: Eric Sandeen <sandeen@redhat.com>
> > ---
> > 
> > This test was originally titled:
> > 
> >  xfstests: add a new test case to test i_size updated properly under dio
> > 
> > but I think the issue is more of when and how it's tested, not how
> > it's updated.
> > 
> > Note, this passes on xfs on 4.10, but fails on 4.12.
> > ext4 on 4.10 passes as well but is very slow.
> > 
> > iomap dio maybe?  Not sure yet.
> 
> My test with 4.10 kernel suggested that test still failed there. And I
> digged into this test a bit, and found that it was commit d531d91d6990
> ("xfs: always use unwritten extents for direct I/O writes") introduced
> this failure, which is in v3.14 kernel.
> 
> This is because we start allocating unwritten extents for direct writes
> that can extend i_size, but in xfs_dio_write_end_io() we update in-core
> i_size before converting unwritten extents to real allocations. So a
> racing direct read could find the not-yet converted unwritten extents
> and read zeros instead of actual data.
> 
> But I'm not sure what's the best way to fix it. I think taking exclusive
> iolock instead of shared lock for direct writes that can extend i_size
> could fix the non-aio dio write case, but aio-dio write still fails,
> because in the aio-dio write case we defer end_io to a workqueue, which
> doesn't take any iolock at all..
> 

Can we pass a boolean or flag to xfs_iomap_write_unwritten() to have it
update the incore i_size after unwritten extent conversion? Then move
(or remove) the associated update from xfs_dio_write_end_io().

Brian

> ext4 has no such problem because ext4 converts unwritten extents before
> updating i_size, and ext4 doesn't support appending aio dio writes.
> 
> (Keep the rest of patch untrimmed for reference, as I added linux-xfs to
> cc list.)
> 
> Thanks,
> Eryu
> 
> > 
> > changelog v3:
> >  * rebase against latest xfstests/master branch
> >  * update commit log
> > 
> > changelog v2:
> >  * add '-lpthread' into LLDLIBS
> > 
> > diff --git a/configure.ac b/configure.ac
> > index 57092f1..4663004 100644
> > --- a/configure.ac
> > +++ b/configure.ac
> > @@ -59,6 +59,7 @@ AC_PACKAGE_NEED_GETXATTR_LIBATTR
> >  AC_PACKAGE_NEED_SYS_ACL_H
> >  AC_PACKAGE_NEED_ACL_LIBACL_H
> >  AC_PACKAGE_NEED_ACLINIT_LIBACL
> > +AC_PACKAGE_NEED_PTHREADMUTEXINIT
> >  
> >  AC_PACKAGE_WANT_GDBM
> >  AC_PACKAGE_WANT_AIO
> > diff --git a/include/builddefs.in b/include/builddefs.in
> > index cb52b99..fcc8b90 100644
> > --- a/include/builddefs.in
> > +++ b/include/builddefs.in
> > @@ -25,6 +25,7 @@ LIBGDBM = @libgdbm@
> >  LIBUUID = @libuuid@
> >  LIBHANDLE = @libhdl@
> >  LIBDM = @libdm@
> > +LIBPTHREAD = @libpthread@
> >  LIBTEST = $(TOPDIR)/lib/libtest.la
> >  prefix = @prefix@
> >  
> > diff --git a/src/Makefile b/src/Makefile
> > index b8aff49..e9419bd 100644
> > --- a/src/Makefile
> > +++ b/src/Makefile
> > @@ -23,7 +23,7 @@ LINUX_TARGETS = xfsctl bstat t_mtab getdevicesize preallo_rw_pattern_reader \
> >  	seek_copy_test t_readdir_1 t_readdir_2 fsync-tester nsexec cloner \
> >  	renameat2 t_getcwd e4compact test-nextquota punch-alternating \
> >  	attr-list-by-handle-cursor-test listxattr dio-interleaved t_dir_type \
> > -	dio-invalidate-cache stat_test t_encrypted_d_revalidate
> > +	dio-invalidate-cache stat_test t_encrypted_d_revalidate diotest
> >  
> >  SUBDIRS =
> >  
> > diff --git a/src/diotest.c b/src/diotest.c
> > new file mode 100644
> > index 0000000..7d2378f
> > --- /dev/null
> > +++ b/src/diotest.c
> > @@ -0,0 +1,166 @@
> > +/*
> > + * Copyright (c) 2013 Alibaba Group.
> > + * All Rights Reserved.
> > + *
> > + * This program is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU General Public License as
> > + * published by the Free Software Foundation.
> > + *
> > + * This program is distributed in the hope that it would be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU General Public License
> > + * along with this program; if not, write the Free Software Foundation,
> > + * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
> > + */
> > +
> > +/*
> > + * This is a normal case that we do some append dio writes and meanwhile
> > + * we do some dio reads.  Currently in vfs we don't ensure that i_size
> > + * is updated properly.  Hence the reader will read some data with '0'.
> > + * But we expect that the reader should read nothing or data with 'a'.
> > + */
> > +
> > +#include <stdio.h>
> > +#include <stdlib.h>
> > +#include <string.h>
> > +
> > +#include <unistd.h>
> > +#include <sys/types.h>
> > +#include <sys/stat.h>
> > +#include <fcntl.h>
> > +#include <errno.h>
> > +
> > +#include <pthread.h>
> > +
> > +static char *prog;
> > +
> > +struct writer_data {
> > +	int fd;
> > +	size_t blksize;
> > +	char *buf;
> > +};
> > +
> > +static void usage(void)
> > +{
> > +	fprintf(stderr, "usage: %s [FILE]\n", prog);
> > +}
> > +
> > +static void *writer(void *arg)
> > +{
> > +	struct writer_data *data = (struct writer_data *)arg;
> > +	int ret;
> > +
> > +	ret = write(data->fd, data->buf, data->blksize);
> > +	if (ret < 0)
> > +		fprintf(stderr, "write file failed: %s\n", strerror(errno));
> > +
> > +	return NULL;
> > +}
> > +
> > +int main(int argc, char *argv[])
> > +{
> > +	pthread_t tid;
> > +	struct writer_data wdata;
> > +	size_t max_blocks = 128;		/* 128 */
> > +	size_t blksize = 1 * 1024 * 1024;	/* 1M */
> > +	char *rbuf = NULL, *wbuf = NULL;
> > +	int rfd = 0, wfd = 0;
> > +	int i, j;
> > +	int ret = 0;
> > +
> > +	prog = basename(argv[0]);
> > +
> > +	if (argc != 2) {
> > +		usage();
> > +		exit(1);
> > +	}
> > +
> > +	wfd = open(argv[1], O_CREAT|O_DIRECT|O_WRONLY|O_APPEND|O_TRUNC, S_IRWXU);
> > +	if (wfd < 0) {
> > +		fprintf(stderr, "failed to open write file: %s\n",
> > +			strerror(errno));
> > +		exit(1);
> > +	}
> > +
> > +	rfd = open(argv[1], O_DIRECT|O_RDONLY, S_IRWXU);
> > +	if (wfd < 0) {
> > +		fprintf(stderr, "failed to open read file: %s\n",
> > +			strerror(errno));
> > +		ret = 1;
> > +		goto err;
> > +	}
> > +
> > +	/*
> > +	 * We set 1024 as an alignment size for write buf.  Feel free to change
> > +	 * it with 4096.  But the problem is also hitted.
> > +	 */
> > +	if (posix_memalign((void **)&wbuf, 1024, blksize)) {
> > +		fprintf(stderr, "failed to alloc memory: %s\n", strerror(errno));
> > +		ret = 1;
> > +		goto err;
> > +	}
> > +
> > +	if (posix_memalign((void **)&rbuf, 4096, blksize)) {
> > +		fprintf(stderr, "failed to alloc memory: %s\n", strerror(errno));
> > +		ret = 1;
> > +		goto err;
> > +	}
> > +
> > +	memset(wbuf, 'a', blksize);
> > +	wdata.fd = wfd;
> > +	wdata.blksize = blksize;
> > +	wdata.buf = wbuf;
> > +
> > +	for (i = 0; i < max_blocks; i++) {
> > +		void *retval;
> > +
> > +		if (pthread_create(&tid, NULL, writer, &wdata)) {
> > +			fprintf(stderr, "create thread failed: %s\n",
> > +				strerror(errno));
> > +			ret = 1;
> > +			goto err;
> > +		}
> > +
> > +		memset(rbuf, 'b', blksize);
> > +		do {
> > +			ret = pread(rfd, rbuf, blksize, i * blksize);
> > +			if (ret < 0)
> > +				fprintf(stderr, "read file failed: %s\n",
> > +					strerror(errno));
> > +		} while (ret <= 0);
> > +
> > +		if (pthread_join(tid, &retval)) {
> > +			fprintf(stderr, " pthread join failed: %s\n",
> > +				strerror(errno));
> > +			ret = 1;
> > +			goto err;
> > +		}
> > +
> > +		if (ret >= 0) {
> > +			for (j = 0; j < ret; j ++) {
> > +				if (rbuf[j] != 'a') {
> > +					fprintf(stderr, "encounter an error: "
> > +						"offset %d content %c\n",
> > +						i, rbuf[j]);
> > +					ret = 1;
> > +					goto err;
> > +				}
> > +			}
> > +		}
> > +	}
> > +
> > +err:
> > +	if (rfd)
> > +		close(rfd);
> > +	if (wfd)
> > +		close(wfd);
> > +	if (rbuf)
> > +		free(rbuf);
> > +	if (wbuf)
> > +		free(wbuf);
> > +
> > +	return ret;
> > +}
> > diff --git a/tests/generic/450 b/tests/generic/450
> > new file mode 100755
> > index 0000000..cfb424c
> > --- /dev/null
> > +++ b/tests/generic/450
> > @@ -0,0 +1,56 @@
> > +#! /bin/bash
> > +# FS QA Test No. 450
> > +#
> > +# Test i_size is updated properly under dio read/write
> > +#
> > +#-----------------------------------------------------------------------
> > +# Copyright (c) 2013 Alibaba Group.  All Rights Reserved.
> > +#
> > +# This program is free software; you can redistribute it and/or
> > +# modify it under the terms of the GNU General Public License as
> > +# published by the Free Software Foundation.
> > +#
> > +# This program is distributed in the hope that it would be useful,
> > +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > +# GNU General Public License for more details.
> > +#
> > +# You should have received a copy of the GNU General Public License
> > +# along with this program; if not, write the Free Software Foundation,
> > +# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
> > +#-----------------------------------------------------------------------
> > +#
> > +
> > +seq=`basename $0`
> > +seqres=$RESULT_DIR/$seq
> > +echo "QA output created by $seq"
> > +
> > +here=`pwd`
> > +tmp=/tmp/$$
> > +status=1	# failure is the default!
> > +trap "_cleanup; exit \$status" 0 1 2 3 15
> > +
> > +_cleanup()
> > +{
> > +    cd /
> > +    rm -f $tmp.* $testfile
> > +}
> > +
> > +# get standard environment, filters and checks
> > +. ./common/rc
> > +. ./common/filter
> > +
> > +# real QA test starts here
> > +_supported_fs generic
> > +_supported_os Linux
> > +
> > +testfile=$TEST_DIR/$seq.$$
> > +
> > +[ -x $here/src/diotest ] || _notrun "diotest not built"
> > +
> > +$here/src/diotest $testfile # > $seqres.full 2>&1 ||
> > +	# _fail "i_size isn't update properly!"
> > +
> > +# success, all done
> > +status=0
> > +exit
> > diff --git a/tests/generic/450.out b/tests/generic/450.out
> > new file mode 100644
> > index 0000000..734761a
> > --- /dev/null
> > +++ b/tests/generic/450.out
> > @@ -0,0 +1 @@
> > +QA output created by 450
> > diff --git a/tests/generic/group b/tests/generic/group
> > index b9cd0e8..a555fa0 100644
> > --- a/tests/generic/group
> > +++ b/tests/generic/group
> > @@ -452,3 +452,4 @@
> >  447 auto quick clone
> >  448 auto quick rw
> >  449 auto quick acl enospc
> > +450 auto rw quick
> > 
> > --
> > To unsubscribe from this list: send the line "unsubscribe fstests" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] test race when checking i_size on direct i/o read
  2017-09-19 14:13   ` Brian Foster
@ 2017-09-19 14:34     ` Christoph Hellwig
  2017-09-19 14:58       ` Brian Foster
  2017-09-20 11:05       ` Eryu Guan
  0 siblings, 2 replies; 10+ messages in thread
From: Christoph Hellwig @ 2017-09-19 14:34 UTC (permalink / raw)
  To: Brian Foster
  Cc: Eryu Guan, Eric Sandeen, fstests, Zheng Liu, Christoph Hellwig,
	linux-xfs

On Tue, Sep 19, 2017 at 10:13:52AM -0400, Brian Foster wrote:
> Can we pass a boolean or flag to xfs_iomap_write_unwritten() to have it
> update the incore i_size after unwritten extent conversion? Then move
> (or remove) the associated update from xfs_dio_write_end_io().

I don't think we even need a flag - all three callers of
xfs_iomap_write_unwritten want to update the file size.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] test race when checking i_size on direct i/o read
  2017-09-19 14:34     ` Christoph Hellwig
@ 2017-09-19 14:58       ` Brian Foster
  2017-09-20 11:05       ` Eryu Guan
  1 sibling, 0 replies; 10+ messages in thread
From: Brian Foster @ 2017-09-19 14:58 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Eryu Guan, Eric Sandeen, fstests, Zheng Liu, linux-xfs

On Tue, Sep 19, 2017 at 07:34:06AM -0700, Christoph Hellwig wrote:
> On Tue, Sep 19, 2017 at 10:13:52AM -0400, Brian Foster wrote:
> > Can we pass a boolean or flag to xfs_iomap_write_unwritten() to have it
> > update the incore i_size after unwritten extent conversion? Then move
> > (or remove) the associated update from xfs_dio_write_end_io().
> 
> I don't think we even need a flag - all three callers of
> xfs_iomap_write_unwritten want to update the file size.

Sounds reasonable to me. I wasn't sure if there was anything special
about the pnfs case. isize should probably already be stable in the
writeback case, so perhaps this would be fine as long as we ensure the
in-core size only increases (as we currently do for di_size).

Brian

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] test race when checking i_size on direct i/o read
  2017-09-19 14:34     ` Christoph Hellwig
  2017-09-19 14:58       ` Brian Foster
@ 2017-09-20 11:05       ` Eryu Guan
  2017-09-20 12:55         ` Brian Foster
  1 sibling, 1 reply; 10+ messages in thread
From: Eryu Guan @ 2017-09-20 11:05 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Brian Foster, Eric Sandeen, fstests, Zheng Liu, linux-xfs

On Tue, Sep 19, 2017 at 07:34:06AM -0700, Christoph Hellwig wrote:
> On Tue, Sep 19, 2017 at 10:13:52AM -0400, Brian Foster wrote:
> > Can we pass a boolean or flag to xfs_iomap_write_unwritten() to have it
> > update the incore i_size after unwritten extent conversion? Then move
> > (or remove) the associated update from xfs_dio_write_end_io().
> 
> I don't think we even need a flag - all three callers of
> xfs_iomap_write_unwritten want to update the file size.

I tried this approach, but it seems there's some problem in the buffered
aio path: generic/112 (aio fsx) failed quickly. But I haven't dug
into the reason (maybe I screwed it up, rather than the method being wrong..).

Then I tried Brian's suggestion: pass a boolean to
xfs_iomap_write_unwritten() to tell it whether to update the in-core
isize after unwritten extent conversion, and skip the in-core isize
update in xfs_dio_write_end_io() accordingly. This approach seems to
work: it passed the test Eric posted here and the fstests 'aio' group
tests, and a run of the 'quick' group didn't find any new failures either.

I attached the WIP patch (without proper comments) I was testing; if
this looks fine I can format a formal patch and do more testing.

Thanks,
Eryu

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 29172609f2a3..288da47e9ac5 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -343,7 +343,7 @@ xfs_end_io(
 		error = xfs_reflink_end_cow(ip, offset, size);
 		break;
 	case XFS_IO_UNWRITTEN:
-		error = xfs_iomap_write_unwritten(ip, offset, size);
+		error = xfs_iomap_write_unwritten(ip, offset, size, false);
 		break;
 	default:
 		ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 350b6d43ba23..f3ad024573e7 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -435,6 +435,7 @@ xfs_dio_write_end_io(
 	struct xfs_inode	*ip = XFS_I(inode);
 	loff_t			offset = iocb->ki_pos;
 	bool			update_size = false;
+	bool			write_unwritten = (flags & IOMAP_DIO_UNWRITTEN);
 	int			error = 0;
 
 	trace_xfs_end_io_direct_write(ip, offset, size);
@@ -458,7 +459,8 @@ xfs_dio_write_end_io(
 	 */
 	spin_lock(&ip->i_flags_lock);
 	if (offset + size > i_size_read(inode)) {
-		i_size_write(inode, offset + size);
+		if (!write_unwritten)
+			i_size_write(inode, offset + size);
 		update_size = true;
 	}
 	spin_unlock(&ip->i_flags_lock);
@@ -469,8 +471,8 @@ xfs_dio_write_end_io(
 			return error;
 	}
 
-	if (flags & IOMAP_DIO_UNWRITTEN)
-		error = xfs_iomap_write_unwritten(ip, offset, size);
+	if (write_unwritten)
+		error = xfs_iomap_write_unwritten(ip, offset, size, update_size);
 	else if (update_size)
 		error = xfs_setfilesize(ip, offset, size);
 
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index a1909bc064e9..0a088586371e 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -829,7 +829,8 @@ int
 xfs_iomap_write_unwritten(
 	xfs_inode_t	*ip,
 	xfs_off_t	offset,
-	xfs_off_t	count)
+	xfs_off_t	count,
+	bool		update_size)
 {
 	xfs_mount_t	*mp = ip->i_mount;
 	xfs_fileoff_t	offset_fsb;
@@ -840,6 +841,7 @@ xfs_iomap_write_unwritten(
 	xfs_trans_t	*tp;
 	xfs_bmbt_irec_t imap;
 	struct xfs_defer_ops dfops;
+	struct inode	*inode = VFS_I(ip);
 	xfs_fsize_t	i_size;
 	uint		resblks;
 	int		error;
@@ -900,6 +902,13 @@ xfs_iomap_write_unwritten(
 		if (i_size > offset + count)
 			i_size = offset + count;
 
+		if (update_size) {
+			spin_lock(&ip->i_flags_lock);
+			if (i_size > i_size_read(inode))
+				i_size_write(inode, i_size);
+			spin_unlock(&ip->i_flags_lock);
+		}
+
 		i_size = xfs_new_eof(ip, i_size);
 		if (i_size) {
 			ip->i_d.di_size = i_size;
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 00db3ecea084..ee535065c5d0 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -27,7 +27,7 @@ int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
 			struct xfs_bmbt_irec *, int);
 int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t,
 			struct xfs_bmbt_irec *);
-int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
+int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
 
 void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
 		struct xfs_bmbt_irec *);
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 2f2dc3c09ad0..4246876df7b7 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -274,7 +274,7 @@ xfs_fs_commit_blocks(
 					(end - 1) >> PAGE_SHIFT);
 		WARN_ON_ONCE(error);
 
-		error = xfs_iomap_write_unwritten(ip, start, length);
+		error = xfs_iomap_write_unwritten(ip, start, length, false);
 		if (error)
 			goto out_drop_iolock;
 	}

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] test race when checking i_size on direct i/o read
  2017-09-20 11:05       ` Eryu Guan
@ 2017-09-20 12:55         ` Brian Foster
  2017-09-21 10:09           ` Eryu Guan
  0 siblings, 1 reply; 10+ messages in thread
From: Brian Foster @ 2017-09-20 12:55 UTC (permalink / raw)
  To: Eryu Guan; +Cc: Christoph Hellwig, Eric Sandeen, fstests, Zheng Liu, linux-xfs

On Wed, Sep 20, 2017 at 07:05:04PM +0800, Eryu Guan wrote:
> On Tue, Sep 19, 2017 at 07:34:06AM -0700, Christoph Hellwig wrote:
> > On Tue, Sep 19, 2017 at 10:13:52AM -0400, Brian Foster wrote:
> > > Can we pass a boolean or flag to xfs_iomap_write_unwritten() to have it
> > > update the incore i_size after unwritten extent conversion? Then move
> > > (or remove) the associated update from xfs_dio_write_end_io().
> > 
> > I don't think we even need a flag - all three callers of
> > xfs_iomap_write_unwritten want to update the file size.
> 
> I tried this approach, but it seems there's some problem in the buffered
> aio path: generic/112 (aio fsx) failed quickly. But I haven't dug
> into the reason (maybe I screwed it up, rather than the method being wrong).
> 

From the generic/112 results:

Size error: expected 0x3785b stat 0x38000 seek 0x38000

I suspect the problem is that the offset+size from buffered I/O
completion is not based on the inode size. Rather, it is buffer head
granularity size of the ioend. Given that, it probably does make sense
to skip the update from this path.

> Then I tried Brian's suggestion, pass a boolean to
> xfs_iomap_write_unwritten() to tell if we want it to update in-core
> isize after unwritten extent conversion, and skip the in-core isize
> update in xfs_dio_write_end_io() accordingly. This approach seems to
> work, it passed the test Eric posted here, and fstests 'aio' group
> tests, a run of 'quick' group didn't find any new failure as well.
> 
> I attached the WIP patch (without proper comments) I was testing; if
> this looks fine I can format a formal patch and do more testing.
>

Thanks. This mostly looks reasonable to me...
 
> Thanks,
> Eryu
> 
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index 29172609f2a3..288da47e9ac5 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -343,7 +343,7 @@ xfs_end_io(
>  		error = xfs_reflink_end_cow(ip, offset, size);
>  		break;
>  	case XFS_IO_UNWRITTEN:
> -		error = xfs_iomap_write_unwritten(ip, offset, size);
> +		error = xfs_iomap_write_unwritten(ip, offset, size, false);

Maybe add a single-line comment here with respect to the above (why we
don't update isize).

>  		break;
>  	default:
>  		ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 350b6d43ba23..f3ad024573e7 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -435,6 +435,7 @@ xfs_dio_write_end_io(
>  	struct xfs_inode	*ip = XFS_I(inode);
>  	loff_t			offset = iocb->ki_pos;
>  	bool			update_size = false;
> +	bool			write_unwritten = (flags & IOMAP_DIO_UNWRITTEN);
>  	int			error = 0;
>  
>  	trace_xfs_end_io_direct_write(ip, offset, size);
> @@ -458,7 +459,8 @@ xfs_dio_write_end_io(
>  	 */
>  	spin_lock(&ip->i_flags_lock);
>  	if (offset + size > i_size_read(inode)) {
> -		i_size_write(inode, offset + size);
> +		if (!write_unwritten)
> +			i_size_write(inode, offset + size);
>  		update_size = true;

I find the logic a little confusing here. For !write_unwritten,
update_size means to update the on-disk size. Otherwise, it instructs
iomap_write_unwritten() to also update the in-core size. The latter also
checks for appends, however, so it might as well always be true from
here. It seems that for such a small function, we should be able to make
this a bit easier to follow. ;P

Should we ever see an isize update at all on an IOMAP_DIO_COW completion
(wouldn't reflinked blocks have to be within eof)? If not, then we can
presumably rule out isize updates in that case. I think that just leaves
the case where a dio write occurs on a pre-existing block. Hmm, could we
just move this whole hunk down to after the iomap_write_unwritten() call
and eliminate the need for the flag entirely? E.g., something like:

	...
	if (IOMAP_DIO_COW) {
		...
	}

	/* unwritten conversion updates isize */
	if (IOMAP_DIO_UNWRITTEN)
		return xfs_iomap_write_unwritten(ip, offset, size, true);

	if (offset + size > i_size_read(inode)) {
		i_size_write(inode, offset + size);
		error = xfs_setfilesize(...);
	}

	return error;

>  	}
>  	spin_unlock(&ip->i_flags_lock);
> @@ -469,8 +471,8 @@ xfs_dio_write_end_io(
>  			return error;
>  	}
>  
> -	if (flags & IOMAP_DIO_UNWRITTEN)
> -		error = xfs_iomap_write_unwritten(ip, offset, size);
> +	if (write_unwritten)
> +		error = xfs_iomap_write_unwritten(ip, offset, size, update_size);
>  	else if (update_size)
>  		error = xfs_setfilesize(ip, offset, size);
>  
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index a1909bc064e9..0a088586371e 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -829,7 +829,8 @@ int
>  xfs_iomap_write_unwritten(
>  	xfs_inode_t	*ip,
>  	xfs_off_t	offset,
> -	xfs_off_t	count)
> +	xfs_off_t	count,
> +	bool		update_size)
>  {
>  	xfs_mount_t	*mp = ip->i_mount;
>  	xfs_fileoff_t	offset_fsb;
> @@ -840,6 +841,7 @@ xfs_iomap_write_unwritten(
>  	xfs_trans_t	*tp;
>  	xfs_bmbt_irec_t imap;
>  	struct xfs_defer_ops dfops;
> +	struct inode	*inode = VFS_I(ip);
>  	xfs_fsize_t	i_size;
>  	uint		resblks;
>  	int		error;
> @@ -900,6 +902,13 @@ xfs_iomap_write_unwritten(
>  		if (i_size > offset + count)
>  			i_size = offset + count;
>  
> +		if (update_size) {
> +			spin_lock(&ip->i_flags_lock);
> +			if (i_size > i_size_read(inode))
> +				i_size_write(inode, i_size);
> +			spin_unlock(&ip->i_flags_lock);

We have XFS_ILOCK_EXCL here so I don't think the spinlocks are
necessary any longer. That means this could probably be condensed to
something like

		if (update_size && i_size > i_size_read(inode))
			i_size_write(inode, i_size)

> +		}
> +
>  		i_size = xfs_new_eof(ip, i_size);
>  		if (i_size) {
>  			ip->i_d.di_size = i_size;
> diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
> index 00db3ecea084..ee535065c5d0 100644
> --- a/fs/xfs/xfs_iomap.h
> +++ b/fs/xfs/xfs_iomap.h
> @@ -27,7 +27,7 @@ int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
>  			struct xfs_bmbt_irec *, int);
>  int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t,
>  			struct xfs_bmbt_irec *);
> -int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
> +int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
>  
>  void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
>  		struct xfs_bmbt_irec *);
> diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
> index 2f2dc3c09ad0..4246876df7b7 100644
> --- a/fs/xfs/xfs_pnfs.c
> +++ b/fs/xfs/xfs_pnfs.c
> @@ -274,7 +274,7 @@ xfs_fs_commit_blocks(
>  					(end - 1) >> PAGE_SHIFT);
>  		WARN_ON_ONCE(error);
>  
> -		error = xfs_iomap_write_unwritten(ip, start, length);
> +		error = xfs_iomap_write_unwritten(ip, start, length, false);

Note that this path does update isize. It runs another transaction to
get around the fact that write_unwritten() wouldn't log a new on disk
size. There is some validation thing here though, so we might need to
check whether it is Ok to run that earlier and whether it should
continue to return an error on failure. Christoph?

That said, maybe a follow on patch would be better since this one might
be stable fodder.

Brian

>  		if (error)
>  			goto out_drop_iolock;
>  	}
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] test race when checking i_size on direct i/o read
  2017-09-20 12:55         ` Brian Foster
@ 2017-09-21 10:09           ` Eryu Guan
  0 siblings, 0 replies; 10+ messages in thread
From: Eryu Guan @ 2017-09-21 10:09 UTC (permalink / raw)
  To: Brian Foster
  Cc: Christoph Hellwig, Eric Sandeen, fstests, Zheng Liu, linux-xfs

On Wed, Sep 20, 2017 at 08:55:30AM -0400, Brian Foster wrote:
> On Wed, Sep 20, 2017 at 07:05:04PM +0800, Eryu Guan wrote:
> > On Tue, Sep 19, 2017 at 07:34:06AM -0700, Christoph Hellwig wrote:
> > > On Tue, Sep 19, 2017 at 10:13:52AM -0400, Brian Foster wrote:
> > > > Can we pass a boolean or flag to xfs_iomap_write_unwritten() to have it
> > > > update the incore i_size after unwritten extent conversion? Then move
> > > > (or remove) the associated update from xfs_dio_write_end_io().
> > > 
> > > I don't think we even need a flag - all three callers of
> > > xfs_iomap_write_unwritten want to update the file size.
> > 
> > > I tried this approach, but it seems there's some problem in the buffered
> > > aio path: generic/112 (aio fsx) failed quickly. But I haven't dug
> > > into the reason (maybe I screwed it up, rather than the method being wrong).
> > 
> 
> From the generic/112 results:
> 
> Size error: expected 0x3785b stat 0x38000 seek 0x38000
> 
> I suspect the problem is that the offset+size from buffered I/O
> completion is not based on the inode size. Rather, it is buffer head
> granularity size of the ioend. Given that, it probably does make sense
> to skip the update from this path.

Yeah, you're right.

xfs_io -fc "falloc -k 0 4k" -c "pwrite 0 2k" -c fsync /mnt/xfs/testfile

This results in a 4k file size (the block size is also 4k), although only
2k was written.

> 
> > Then I tried Brian's suggestion, pass a boolean to
> > xfs_iomap_write_unwritten() to tell if we want it to update in-core
> > isize after unwritten extent conversion, and skip the in-core isize
> > update in xfs_dio_write_end_io() accordingly. This approach seems to
> > work, it passed the test Eric posted here, and fstests 'aio' group
> > tests, a run of 'quick' group didn't find any new failure as well.
> > 
> > I attached the WIP patch (without proper comments) I was testing; if
> > this looks fine I can format a formal patch and do more testing.
> >
> 
> Thanks. This mostly looks reasonable to me...
>  
> > Thanks,
> > Eryu
> > 
> > diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> > index 29172609f2a3..288da47e9ac5 100644
> > --- a/fs/xfs/xfs_aops.c
> > +++ b/fs/xfs/xfs_aops.c
> > @@ -343,7 +343,7 @@ xfs_end_io(
> >  		error = xfs_reflink_end_cow(ip, offset, size);
> >  		break;
> >  	case XFS_IO_UNWRITTEN:
> > -		error = xfs_iomap_write_unwritten(ip, offset, size);
> > +		error = xfs_iomap_write_unwritten(ip, offset, size, false);
> 
> Maybe add a single-line comment here with respect to the above (why we
> don't update isize).

Will do.

> 
> >  		break;
> >  	default:
> >  		ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
> > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> > index 350b6d43ba23..f3ad024573e7 100644
> > --- a/fs/xfs/xfs_file.c
> > +++ b/fs/xfs/xfs_file.c
> > @@ -435,6 +435,7 @@ xfs_dio_write_end_io(
> >  	struct xfs_inode	*ip = XFS_I(inode);
> >  	loff_t			offset = iocb->ki_pos;
> >  	bool			update_size = false;
> > +	bool			write_unwritten = (flags & IOMAP_DIO_UNWRITTEN);
> >  	int			error = 0;
> >  
> >  	trace_xfs_end_io_direct_write(ip, offset, size);
> > @@ -458,7 +459,8 @@ xfs_dio_write_end_io(
> >  	 */
> >  	spin_lock(&ip->i_flags_lock);
> >  	if (offset + size > i_size_read(inode)) {
> > -		i_size_write(inode, offset + size);
> > +		if (!write_unwritten)
> > +			i_size_write(inode, offset + size);
> >  		update_size = true;
> 
> I find the logic a little confusing here. For !write_unwritten,
> update_size means to update the on-disk size. Otherwise, it instructs
> iomap_write_unwritten() to also update the in-core size. The latter also
> checks for appends, however, so it might as well always be true from
> here. It seems that for such a small function, we should be able to make
> this a bit easier to follow. ;P

Agreed, it looks confusing when update_size serves as an indicator of both
in-core and on-disk size updates. And we can pass 'true' unconditionally
to xfs_iomap_write_unwritten() in this case.

> 
> Should we ever see an isize update at all on an IOMAP_DIO_COW completion
> (wouldn't reflinked blocks have to be within eof)? If not, then we can
> presumably rule out isize updates in that case. I think that just leaves
> the case where a dio write occurs on a pre-existing block. Hmm, could we
> just move this whole hunk down to after the iomap_write_unwritten() call
> and eliminate the need for the flag entirely? E.g., something like:
> 
> 	...
> 	if (IOMAP_DIO_COW) {
> 		...
> 	}
> 
> 	/* unwritten conversion updates isize */
> 	if (IOMAP_DIO_UNWRITTEN)
> 		return xfs_iomap_write_unwritten(ip, offset, size, true);
> 
> 	if (offset + size > i_size_read(inode)) {
> 		i_size_write(inode, offset + size);
> 		error = xfs_setfilesize(...);
> 	}
> 
> 	return error;

This seems fine; tests in the 'clone' group all passed without new failures.
I'll update the patch as you suggested. I thought about something like this
too, but I'm not familiar with the CoW path, so I decided to keep the
original logic as much as possible.

Thanks for your review and suggestions!

Eryu

> 
> >  	}
> >  	spin_unlock(&ip->i_flags_lock);
> > @@ -469,8 +471,8 @@ xfs_dio_write_end_io(
> >  			return error;
> >  	}
> >  
> > -	if (flags & IOMAP_DIO_UNWRITTEN)
> > -		error = xfs_iomap_write_unwritten(ip, offset, size);
> > +	if (write_unwritten)
> > +		error = xfs_iomap_write_unwritten(ip, offset, size, update_size);
> >  	else if (update_size)
> >  		error = xfs_setfilesize(ip, offset, size);
> >  
> > diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> > index a1909bc064e9..0a088586371e 100644
> > --- a/fs/xfs/xfs_iomap.c
> > +++ b/fs/xfs/xfs_iomap.c
> > @@ -829,7 +829,8 @@ int
> >  xfs_iomap_write_unwritten(
> >  	xfs_inode_t	*ip,
> >  	xfs_off_t	offset,
> > -	xfs_off_t	count)
> > +	xfs_off_t	count,
> > +	bool		update_size)
> >  {
> >  	xfs_mount_t	*mp = ip->i_mount;
> >  	xfs_fileoff_t	offset_fsb;
> > @@ -840,6 +841,7 @@ xfs_iomap_write_unwritten(
> >  	xfs_trans_t	*tp;
> >  	xfs_bmbt_irec_t imap;
> >  	struct xfs_defer_ops dfops;
> > +	struct inode	*inode = VFS_I(ip);
> >  	xfs_fsize_t	i_size;
> >  	uint		resblks;
> >  	int		error;
> > @@ -900,6 +902,13 @@ xfs_iomap_write_unwritten(
> >  		if (i_size > offset + count)
> >  			i_size = offset + count;
> >  
> > +		if (update_size) {
> > +			spin_lock(&ip->i_flags_lock);
> > +			if (i_size > i_size_read(inode))
> > +				i_size_write(inode, i_size);
> > +			spin_unlock(&ip->i_flags_lock);
> 
> We have XFS_ILOCK_EXCL here so I don't think the spinlocks are
> necessary any longer. That means this could probably be condensed to
> something like
> 
> 		if (update_size && i_size > i_size_read(inode))
> 			i_size_write(inode, i_size)
> 
> > +		}
> > +
> >  		i_size = xfs_new_eof(ip, i_size);
> >  		if (i_size) {
> >  			ip->i_d.di_size = i_size;
> > diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
> > index 00db3ecea084..ee535065c5d0 100644
> > --- a/fs/xfs/xfs_iomap.h
> > +++ b/fs/xfs/xfs_iomap.h
> > @@ -27,7 +27,7 @@ int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
> >  			struct xfs_bmbt_irec *, int);
> >  int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t,
> >  			struct xfs_bmbt_irec *);
> > -int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
> > +int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
> >  
> >  void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
> >  		struct xfs_bmbt_irec *);
> > diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
> > index 2f2dc3c09ad0..4246876df7b7 100644
> > --- a/fs/xfs/xfs_pnfs.c
> > +++ b/fs/xfs/xfs_pnfs.c
> > @@ -274,7 +274,7 @@ xfs_fs_commit_blocks(
> >  					(end - 1) >> PAGE_SHIFT);
> >  		WARN_ON_ONCE(error);
> >  
> > -		error = xfs_iomap_write_unwritten(ip, start, length);
> > +		error = xfs_iomap_write_unwritten(ip, start, length, false);
> 
> Note that this path does update isize. It runs another transaction to
> get around the fact that write_unwritten() wouldn't log a new on disk
> size. There is some validation thing here though, so we might need to
> check whether it is Ok to run that earlier and whether it should
> continue to return an error on failure. Christoph?
> 
> That said, maybe a follow on patch would be better since this one might
> be stable fodder.
> 
> Brian
> 
> >  		if (error)
> >  			goto out_drop_iolock;
> >  	}
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2017-09-21 10:09 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-08-18 20:35 [PATCH] test race when checking i_size on direct i/o read Eric Sandeen
2017-08-28 10:35 ` Eryu Guan
2017-08-29 13:46   ` Nikolay Borisov
2017-09-19  7:36 ` Eryu Guan
2017-09-19 14:13   ` Brian Foster
2017-09-19 14:34     ` Christoph Hellwig
2017-09-19 14:58       ` Brian Foster
2017-09-20 11:05       ` Eryu Guan
2017-09-20 12:55         ` Brian Foster
2017-09-21 10:09           ` Eryu Guan

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.