linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [BUG] Possible silent data corruption in filesystems/page cache
@ 2016-06-01  9:51 Barczak, Mariusz
  2016-06-02 19:32 ` Andreas Dilger
  0 siblings, 1 reply; 5+ messages in thread
From: Barczak, Mariusz @ 2016-06-01  9:51 UTC (permalink / raw)
  To: Andrew Morton, Jens Axboe, Alexander Viro, linux-mm, linux-block,
	linux-fsdevel, linux-kernel
  Cc: Wysoczanski, Michal, Baldyga, Robert, Barczak, Mariusz, Roman, Agnieszka

[-- Attachment #1: Type: text/plain, Size: 1428 bytes --]

Hi

We run data validation test for buffered workload on filesystems:
ext3, ext4, and XFS.
In context of flushing page cache block device driver returned IO error.
After dropping page cache our validation tool reported data corruption.

We provided a simple patch in order to inject IO error in device mapper.
We run test to verify md5sum of file during IO error.
Test shows checksum mismatch.

Attachments:
0001-drivers-md-dm-add-error-injection.patch - device mapper patch
dm-test.txt - validation test script

Regards,
Mariusz Barczak
Intel Technology Poland

--------------------------------------------------------------------

Intel Technology Poland sp. z o.o.
ul. Slowackiego 173 | 80-298 Gdansk | Sad Rejonowy Gdansk Polnoc | VII Wydzial Gospodarczy Krajowego Rejestru Sadowego - KRS 101882 | NIP 957-07-52-316 | Kapital zakladowy 200.000 PLN.

Ta wiadomosc wraz z zalacznikami jest przeznaczona dla okreslonego adresata i moze zawierac informacje poufne. W razie przypadkowego otrzymania tej wiadomosci, prosimy o powiadomienie nadawcy oraz trwale jej usuniecie; jakiekolwiek
przegladanie lub rozpowszechnianie jest zabronione.
This e-mail and any attachments may contain confidential material for the sole use of the intended recipient(s). If you are not the intended recipient, please contact the sender and delete all copies; any review or distribution by
others is strictly prohibited.

[-- Attachment #2: 0001-drivers-md-dm-add-error-injection.patch --]
[-- Type: application/octet-stream, Size: 2351 bytes --]

From ca41f3f2ebeb2555ce18393f6ef98629b63a80db Mon Sep 17 00:00:00 2001
From: Robert Baldyga <robert.baldyga@intel.com>
Date: Wed, 1 Jun 2016 11:00:47 +0200
Subject: [DEBUG] drivers: md: dm: add error injection

This patch adds error injection mechanism using debugfs.
It allows to fail few BIO's for debug purposes.

Signed-off-by: Robert Baldyga <robert.baldyga@intel.com>
---
 drivers/md/dm.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 1b2f962..15a70a2 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -25,6 +25,7 @@
 #include <linux/elevator.h> /* for rq_end_sector() */
 #include <linux/blk-mq.h>
 #include <linux/pr.h>
+#include <linux/debugfs.h>
 
 #include <trace/events/block.h>
 
@@ -415,8 +416,47 @@ static void local_exit(void)
 	DMINFO("cleaned up");
 }
 
+static struct dentry *dbgfs_root;
+static u32 dbg_error_counter = 0;
+static u32 dbg_error_max = 1;
+static u32 dbg_error = 0;
+
+static int __init dm_debugfs_init(void)
+{
+	struct dentry *ret_d;
+
+	dbgfs_root = debugfs_create_dir("dm_debug", NULL);
+	if (!dbgfs_root)
+		return -ENOENT;
+
+	ret_d = debugfs_create_u32("error_counter", 0644, dbgfs_root, &dbg_error_counter);
+	if (!ret_d)
+		goto fail;
+
+	ret_d = debugfs_create_u32("error_max", 0644, dbgfs_root, &dbg_error_max);
+	if (!ret_d)
+		goto fail;
+
+	ret_d = debugfs_create_u32("error", 0644, dbgfs_root, &dbg_error);
+	if (!ret_d)
+		goto fail;
+
+	return 0;
+
+fail:
+	debugfs_remove_recursive(dbgfs_root);
+	dbgfs_root = NULL;
+	return -ENOENT;
+}
+
+static void dm_debugfs_exit(void)
+{
+	debugfs_remove_recursive(dbgfs_root);
+}
+
 static int (*_inits[])(void) __initdata = {
 	local_init,
+	dm_debugfs_init,
 	dm_target_init,
 	dm_linear_init,
 	dm_stripe_init,
@@ -428,6 +468,7 @@ static int (*_inits[])(void) __initdata = {
 
 static void (*_exits[])(void) = {
 	local_exit,
+	dm_debugfs_exit,
 	dm_target_exit,
 	dm_linear_exit,
 	dm_stripe_exit,
@@ -1781,6 +1822,14 @@ static void __split_and_process_bio(struct mapped_device *md,
 		return;
 	}
 
+	if (bio_data_dir(bio) == WRITE) {
+		if (dbg_error && !strncmp(current->comm, "kworker", 7) &&
+				(++dbg_error_counter < dbg_error_max)) {
+			bio_io_error(bio);
+			return;
+		}
+	}
+
 	ci.map = map;
 	ci.md = md;
 	ci.io = alloc_io(md);
-- 
1.9.1


[-- Attachment #3: dm-test.txt --]
[-- Type: text/plain, Size: 1553 bytes --]

#!/bin/bash

DISK=/dev/vda

FILE_SIZE=50

PART_NUM=10
PART_GB=2

prepare_test() {
	parted -s $DISK mktable gpt
	for i in `seq 1 1 $PART_NUM`; do
		parted -s -a optimal $DISK mkpart primary \
			$(( PART_GB * (i - 1) ))GiB $(( PART_GB * i ))GiB
	done

	for i in `seq 1 1 $PART_NUM`; do
		SIZE=`blockdev --getsize ${DISK}${i}`
		dmsetup create linear${i} --table \
			"0 ${SIZE} linear ${DISK}${i} 0"
	done

	sleep 2

	for i in `seq 1 1 $PART_NUM`; do
		mkfs.xfs -f /dev/dm-$(( i - 1 ))
	done

	for i in `seq 1 1 $PART_NUM`; do
		mkdir /mnt/p${i}
	done

	for i in `seq 1 1 $PART_NUM`; do
		mount /dev/dm-$(( i - 1 )) /mnt/p${i}
	done
}

cleanup_test() {
	for i in `seq 1 1 $PART_NUM`; do
		umount /mnt/p${i}
	done

	for i in `seq 1 1 $PART_NUM`; do
		rm -rf /mnt/p${i}
	done

	for i in `seq 1 1 $PART_NUM`; do
		dmsetup remove linear${i}
	done
}

inject_error() {
	while true; do
		echo "Error = 0"
		echo 0 > /sys/kernel/debug/dm_debug/error
		echo 0 > /sys/kernel/debug/dm_debug/error_counter
		echo 100 > /sys/kernel/debug/dm_debug/error_max
		sleep 1
		echo "Drop caches"
		echo 1 > /proc/sys/vm/drop_caches
		sleep 1
		echo "Error = 1"
		echo 1 > /sys/kernel/debug/dm_debug/error
		sleep 3
	done
}

inject_stop() {
	echo 0 > /sys/kernel/debug/dm_debug/error
}

run_test() {
	truncate -s 0 md5sum.log
	inject_error &
	PID=$!
	for i in `seq 1 1 $PART_NUM`; do
		dd if=/dev/urandom bs=1M count=$FILE_SIZE of=/mnt/p${i}/file
		md5sum /mnt/p${i}/file >> md5sum.log
	done
	kill $PID
	inject_stop
	md5sum -c md5sum.log
}

prepare_test
run_test
cleanup_test

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [BUG] Possible silent data corruption in filesystems/page cache
  2016-06-01  9:51 [BUG] Possible silent data corruption in filesystems/page cache Barczak, Mariusz
@ 2016-06-02 19:32 ` Andreas Dilger
  2016-06-06  7:29   ` Barczak, Mariusz
  0 siblings, 1 reply; 5+ messages in thread
From: Andreas Dilger @ 2016-06-02 19:32 UTC (permalink / raw)
  To: Barczak, Mariusz
  Cc: Andrew Morton, Jens Axboe, Alexander Viro, linux-mm, linux-block,
	linux-fsdevel, linux-kernel, Wysoczanski, Michal, Baldyga,
	Robert, Roman, Agnieszka

[-- Attachment #1: Type: text/plain, Size: 1099 bytes --]

On Jun 1, 2016, at 3:51 AM, Barczak, Mariusz <mariusz.barczak@intel.com> wrote:
> 
> We run data validation test for buffered workload on filesystems:
> ext3, ext4, and XFS.
> In context of flushing page cache block device driver returned IO error.
> After dropping page cache our validation tool reported data corruption.

Hi Mariusz,
it isn't clear what you expect to happen here?  If there is an IO error
then the data is not written to disk and cannot be correct when read.

The expected behaviour is the IO error will either be returned immediately
at write() time (this used to be more common with older filesystems), or it
will be returned when calling sync() on the file to flush cached data to disk.

> We provided a simple patch in order to inject IO error in device mapper.
> We run test to verify md5sum of file during IO error.
> Test shows checksum mismatch.
> 
> Attachments:
> 0001-drivers-md-dm-add-error-injection.patch - device mapper patch

There is already the dm-flakey module that allows injecting errors into
the IO path.

Cheers, Andreas






[-- Attachment #2: Message signed with OpenPGP using GPGMail --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 5+ messages in thread

* RE: [BUG] Possible silent data corruption in filesystems/page cache
  2016-06-02 19:32 ` Andreas Dilger
@ 2016-06-06  7:29   ` Barczak, Mariusz
  2016-06-06 13:35     ` Theodore Ts'o
  0 siblings, 1 reply; 5+ messages in thread
From: Barczak, Mariusz @ 2016-06-06  7:29 UTC (permalink / raw)
  To: Andreas Dilger
  Cc: Andrew Morton, Jens Axboe, Alexander Viro, linux-mm, linux-block,
	linux-fsdevel, linux-kernel, Wysoczanski, Michal, Baldyga,
	Robert, Roman, Agnieszka

Hi, Let me elaborate problem in detail. 

For buffered IO data are copied into memory pages. For this case,
the write IO is not submitted (generally). In the background opportunistic
cleaning of dirty pages takes place and IO is generated to the
device. An IO error is observed on this path and application
is not informed about this. Summarizing flushing of dirty page fails.
And probably, this page is dropped but in fact it should not be.
So if above situation happens between application write and sync
then no error is reported. In addition after some time, when the
application reads the same LBA on which IO error occurred, old data
content is fetched.

We did own fault injector in order to do error in specific condition
described above.

Regards,
Mariusz.

-----Original Message-----
From: Andreas Dilger [mailto:adilger@dilger.ca] 
Sent: Thursday, June 2, 2016 21:32
To: Barczak, Mariusz <mariusz.barczak@intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>; Jens Axboe <axboe@kernel.dk>; Alexander Viro <viro@zeniv.linux.org.uk>; linux-mm@kvack.org; linux-block@vger.kernel.org; linux-fsdevel@vger.kernel.org; linux-kernel@vger.kernel.org; Wysoczanski, Michal <michal.wysoczanski@intel.com>; Baldyga, Robert <robert.baldyga@intel.com>; Roman, Agnieszka <agnieszka.roman@intel.com>
Subject: Re: [BUG] Possible silent data corruption in filesystems/page cache

On Jun 1, 2016, at 3:51 AM, Barczak, Mariusz <mariusz.barczak@intel.com> wrote:
> 
> We run data validation test for buffered workload on filesystems:
> ext3, ext4, and XFS.
> In context of flushing page cache block device driver returned IO error.
> After dropping page cache our validation tool reported data corruption.

Hi Mariusz,
it isn't clear what you expect to happen here?  If there is an IO error then the data is not written to disk and cannot be correct when read.

The expected behaviour is the IO error will either be returned immediately at write() time (this used to be more common with older filesystems), or it will be returned when calling sync() on the file to flush cached data to disk.

> We provided a simple patch in order to inject IO error in device mapper.
> We run test to verify md5sum of file during IO error.
> Test shows checksum mismatch.
> 
> Attachments:
> 0001-drivers-md-dm-add-error-injection.patch - device mapper patch

There is already the dm-flakey module that allows injecting errors into the IO path.

Cheers, Andreas





--------------------------------------------------------------------

Intel Technology Poland sp. z o.o.
ul. Slowackiego 173 | 80-298 Gdansk | Sad Rejonowy Gdansk Polnoc | VII Wydzial Gospodarczy Krajowego Rejestru Sadowego - KRS 101882 | NIP 957-07-52-316 | Kapital zakladowy 200.000 PLN.

Ta wiadomosc wraz z zalacznikami jest przeznaczona dla okreslonego adresata i moze zawierac informacje poufne. W razie przypadkowego otrzymania tej wiadomosci, prosimy o powiadomienie nadawcy oraz trwale jej usuniecie; jakiekolwiek
przegladanie lub rozpowszechnianie jest zabronione.
This e-mail and any attachments may contain confidential material for the sole use of the intended recipient(s). If you are not the intended recipient, please contact the sender and delete all copies; any review or distribution by
others is strictly prohibited.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [BUG] Possible silent data corruption in filesystems/page cache
  2016-06-06  7:29   ` Barczak, Mariusz
@ 2016-06-06 13:35     ` Theodore Ts'o
  2016-06-07  7:36       ` Barczak, Mariusz
  0 siblings, 1 reply; 5+ messages in thread
From: Theodore Ts'o @ 2016-06-06 13:35 UTC (permalink / raw)
  To: Barczak, Mariusz
  Cc: Andreas Dilger, Andrew Morton, Jens Axboe, Alexander Viro,
	linux-mm, linux-block, linux-fsdevel, linux-kernel, Wysoczanski,
	Michal, Baldyga, Robert, Roman, Agnieszka

On Mon, Jun 06, 2016 at 07:29:42AM +0000, Barczak, Mariusz wrote:
> Hi, Let me elaborate problem in detail. 
> 
> For buffered IO data are copied into memory pages. For this case,
> the write IO is not submitted (generally). In the background opportunistic
> cleaning of dirty pages takes place and IO is generated to the
> device. An IO error is observed on this path and application
> is not informed about this. Summarizing flushing of dirty page fails.
> And probably, this page is dropped but in fact it should not be.
> So if above situation happens between application write and sync
> then no error is reported. In addition after some time, when the
> application reads the same LBA on which IO error occurred, old data
> content is fetched.

The application will be informed about it if it asks --- if it calls
fsync(), the I/O will be forced and if there is an error it will be
returned to the user.  But if the user has not asked, there is no way
for the user space to know that there is a problem --- for that
matter, it may have exited already by the time we do the buffered
writeback, so there may be nobody to inform.

If the error hapepns between the write and sync, then the address
space mapping's AS_EIO bit will be set.  (See filemap_check_errors()
and do a git grep on AS_EIO.)  So the user will be informed when they
call fsync(2).

The problem with simply not dropping the page is that if we do that,
the page will never be cleaned, and in the worst case, this can lead
to memory exhaustion.  Consider the case where a user is writing huge
numbers of pages, (e.g., dd if=/dev/zero
of=/dev/device-that-will-go-away) if the page is never dropped, then
the memory will never go away.

In other words, the current behavior was carefully considered, and
deliberately chosen as the best design.

The fact that you need to call fsync(2), and then check the error
returns of both fsync(2) *and* close(2) if you want to know for sure
whether or not there was an I/O error is a known, docmented part of
Unix/Linux and has been true for literally decades.  (With Emacs
learning and fixing this back in the late-1980's to avoid losing user
data if the user goes over quota on their Andrew File System on a BSD
4.3 system, for example.  If you're using some editor that comes with
some desktop package or some whizzy IDE, all bets are off, of course.
But if you're using such tools, you probably care about eye candy way
more than you care about your data; certainly the authors of such
programs seem to have this tendency, anyway.  :-)

Cheers,

						- Ted

^ permalink raw reply	[flat|nested] 5+ messages in thread

* RE: [BUG] Possible silent data corruption in filesystems/page cache
  2016-06-06 13:35     ` Theodore Ts'o
@ 2016-06-07  7:36       ` Barczak, Mariusz
  0 siblings, 0 replies; 5+ messages in thread
From: Barczak, Mariusz @ 2016-06-07  7:36 UTC (permalink / raw)
  To: Theodore Ts'o
  Cc: Andreas Dilger, Andrew Morton, Jens Axboe, Alexander Viro,
	linux-mm, linux-block, linux-fsdevel, linux-kernel, Wysoczanski,
	Michal, Baldyga, Robert, Roman, Agnieszka

Hi Ted,
Thanks for your explanation which convinced me.
Regards,
Mariusz.

-----Original Message-----
From: Theodore Ts'o [mailto:tytso@mit.edu] 
Sent: Monday, June 6, 2016 15:36
To: Barczak, Mariusz <mariusz.barczak@intel.com>
Cc: Andreas Dilger <adilger@dilger.ca>; Andrew Morton <akpm@linux-foundation.org>; Jens Axboe <axboe@kernel.dk>; Alexander Viro <viro@zeniv.linux.org.uk>; linux-mm@kvack.org; linux-block@vger.kernel.org; linux-fsdevel@vger.kernel.org; linux-kernel@vger.kernel.org; Wysoczanski, Michal <michal.wysoczanski@intel.com>; Baldyga, Robert <robert.baldyga@intel.com>; Roman, Agnieszka <agnieszka.roman@intel.com>
Subject: Re: [BUG] Possible silent data corruption in filesystems/page cache

On Mon, Jun 06, 2016 at 07:29:42AM +0000, Barczak, Mariusz wrote:
> Hi, Let me elaborate problem in detail. 
> 
> For buffered IO data are copied into memory pages. For this case, the 
> write IO is not submitted (generally). In the background opportunistic 
> cleaning of dirty pages takes place and IO is generated to the device. 
> An IO error is observed on this path and application is not informed 
> about this. Summarizing flushing of dirty page fails.
> And probably, this page is dropped but in fact it should not be.
> So if above situation happens between application write and sync then 
> no error is reported. In addition after some time, when the 
> application reads the same LBA on which IO error occurred, old data 
> content is fetched.

The application will be informed about it if it asks --- if it calls fsync(), the I/O will be forced and if there is an error it will be returned to the user.  But if the user has not asked, there is no way for the user space to know that there is a problem --- for that matter, it may have exited already by the time we do the buffered writeback, so there may be nobody to inform.

If the error hapepns between the write and sync, then the address space mapping's AS_EIO bit will be set.  (See filemap_check_errors() and do a git grep on AS_EIO.)  So the user will be informed when they call fsync(2).

The problem with simply not dropping the page is that if we do that, the page will never be cleaned, and in the worst case, this can lead to memory exhaustion.  Consider the case where a user is writing huge numbers of pages, (e.g., dd if=/dev/zero
of=/dev/device-that-will-go-away) if the page is never dropped, then the memory will never go away.

In other words, the current behavior was carefully considered, and deliberately chosen as the best design.

The fact that you need to call fsync(2), and then check the error returns of both fsync(2) *and* close(2) if you want to know for sure whether or not there was an I/O error is a known, docmented part of Unix/Linux and has been true for literally decades.  (With Emacs learning and fixing this back in the late-1980's to avoid losing user data if the user goes over quota on their Andrew File System on a BSD
4.3 system, for example.  If you're using some editor that comes with some desktop package or some whizzy IDE, all bets are off, of course.
But if you're using such tools, you probably care about eye candy way more than you care about your data; certainly the authors of such programs seem to have this tendency, anyway.  :-)

Cheers,

						- Ted
--------------------------------------------------------------------

Intel Technology Poland sp. z o.o.
ul. Slowackiego 173 | 80-298 Gdansk | Sad Rejonowy Gdansk Polnoc | VII Wydzial Gospodarczy Krajowego Rejestru Sadowego - KRS 101882 | NIP 957-07-52-316 | Kapital zakladowy 200.000 PLN.

Ta wiadomosc wraz z zalacznikami jest przeznaczona dla okreslonego adresata i moze zawierac informacje poufne. W razie przypadkowego otrzymania tej wiadomosci, prosimy o powiadomienie nadawcy oraz trwale jej usuniecie; jakiekolwiek
przegladanie lub rozpowszechnianie jest zabronione.
This e-mail and any attachments may contain confidential material for the sole use of the intended recipient(s). If you are not the intended recipient, please contact the sender and delete all copies; any review or distribution by
others is strictly prohibited.

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2016-06-07  7:37 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-06-01  9:51 [BUG] Possible silent data corruption in filesystems/page cache Barczak, Mariusz
2016-06-02 19:32 ` Andreas Dilger
2016-06-06  7:29   ` Barczak, Mariusz
2016-06-06 13:35     ` Theodore Ts'o
2016-06-07  7:36       ` Barczak, Mariusz

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).