[BUG] Possible silent data corruption in filesystems/page cache

* [BUG] Possible silent data corruption in filesystems/page cache
@ 2016-06-01  9:51 Barczak, Mariusz
  2016-06-02 19:32 ` Andreas Dilger
  0 siblings, 1 reply; 5+ messages in thread
From: Barczak, Mariusz @ 2016-06-01  9:51 UTC (permalink / raw)
  To: Andrew Morton, Jens Axboe, Alexander Viro, linux-mm, linux-block,
	linux-fsdevel, linux-kernel
  Cc: Wysoczanski, Michal, Baldyga, Robert, Barczak, Mariusz, Roman, Agnieszka

[-- Attachment #1: Type: text/plain, Size: 1428 bytes --]

Hi

We run data validation test for buffered workload on filesystems:
ext3, ext4, and XFS.
In context of flushing page cache block device driver returned IO error.
After dropping page cache our validation tool reported data corruption.

We provided a simple patch in order to inject IO error in device mapper.
We run test to verify md5sum of file during IO error.
Test shows checksum mismatch.

Attachments:
0001-drivers-md-dm-add-error-injection.patch - device mapper patch
dm-test.txt - validation test script

Regards,
Mariusz Barczak
Intel Technology Poland

--------------------------------------------------------------------

Intel Technology Poland sp. z o.o.
ul. Slowackiego 173 | 80-298 Gdansk | Sad Rejonowy Gdansk Polnoc | VII Wydzial Gospodarczy Krajowego Rejestru Sadowego - KRS 101882 | NIP 957-07-52-316 | Kapital zakladowy 200.000 PLN.

Ta wiadomosc wraz z zalacznikami jest przeznaczona dla okreslonego adresata i moze zawierac informacje poufne. W razie przypadkowego otrzymania tej wiadomosci, prosimy o powiadomienie nadawcy oraz trwale jej usuniecie; jakiekolwiek
przegladanie lub rozpowszechnianie jest zabronione.
This e-mail and any attachments may contain confidential material for the sole use of the intended recipient(s). If you are not the intended recipient, please contact the sender and delete all copies; any review or distribution by
others is strictly prohibited.

[-- Attachment #2: 0001-drivers-md-dm-add-error-injection.patch --]
[-- Type: application/octet-stream, Size: 2351 bytes --]

From ca41f3f2ebeb2555ce18393f6ef98629b63a80db Mon Sep 17 00:00:00 2001
From: Robert Baldyga <robert.baldyga@intel.com>
Date: Wed, 1 Jun 2016 11:00:47 +0200
Subject: [DEBUG] drivers: md: dm: add error injection

This patch adds error injection mechanism using debugfs.
It allows to fail few BIO's for debug purposes.

Signed-off-by: Robert Baldyga <robert.baldyga@intel.com>
---
 drivers/md/dm.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 1b2f962..15a70a2 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -25,6 +25,7 @@
 #include <linux/elevator.h> /* for rq_end_sector() */
 #include <linux/blk-mq.h>
 #include <linux/pr.h>
+#include <linux/debugfs.h>
 
 #include <trace/events/block.h>
 
@@ -415,8 +416,47 @@ static void local_exit(void)
 	DMINFO("cleaned up");
 }
 
+static struct dentry *dbgfs_root;
+static u32 dbg_error_counter = 0;
+static u32 dbg_error_max = 1;
+static u32 dbg_error = 0;
+
+static int __init dm_debugfs_init(void)
+{
+	struct dentry *ret_d;
+
+	dbgfs_root = debugfs_create_dir("dm_debug", NULL);
+	if (!dbgfs_root)
+		return -ENOENT;
+
+	ret_d = debugfs_create_u32("error_counter", 0644, dbgfs_root, &dbg_error_counter);
+	if (!ret_d)
+		goto fail;
+
+	ret_d = debugfs_create_u32("error_max", 0644, dbgfs_root, &dbg_error_max);
+	if (!ret_d)
+		goto fail;
+
+	ret_d = debugfs_create_u32("error", 0644, dbgfs_root, &dbg_error);
+	if (!ret_d)
+		goto fail;
+
+	return 0;
+
+fail:
+	debugfs_remove_recursive(dbgfs_root);
+	dbgfs_root = NULL;
+	return -ENOENT;
+}
+
+static void dm_debugfs_exit(void)
+{
+	debugfs_remove_recursive(dbgfs_root);
+}
+
 static int (*_inits[])(void) __initdata = {
 	local_init,
+	dm_debugfs_init,
 	dm_target_init,
 	dm_linear_init,
 	dm_stripe_init,
@@ -428,6 +468,7 @@ static int (*_inits[])(void) __initdata = {
 
 static void (*_exits[])(void) = {
 	local_exit,
+	dm_debugfs_exit,
 	dm_target_exit,
 	dm_linear_exit,
 	dm_stripe_exit,
@@ -1781,6 +1822,14 @@ static void __split_and_process_bio(struct mapped_device *md,
 		return;
 	}
 
+	if (bio_data_dir(bio) == WRITE) {
+		if (dbg_error && !strncmp(current->comm, "kworker", 7) &&
+				(++dbg_error_counter < dbg_error_max)) {
+			bio_io_error(bio);
+			return;
+		}
+	}
+
 	ci.map = map;
 	ci.md = md;
 	ci.io = alloc_io(md);
-- 
1.9.1


[-- Attachment #3: dm-test.txt --]
[-- Type: text/plain, Size: 1553 bytes --]

#!/bin/bash

DISK=/dev/vda

FILE_SIZE=50

PART_NUM=10
PART_GB=2

prepare_test() {
	parted -s $DISK mktable gpt
	for i in `seq 1 1 $PART_NUM`; do
		parted -s -a optimal $DISK mkpart primary \
			$(( PART_GB * (i - 1) ))GiB $(( PART_GB * i ))GiB
	done

	for i in `seq 1 1 $PART_NUM`; do
		SIZE=`blockdev --getsize ${DISK}${i}`
		dmsetup create linear${i} --table \
			"0 ${SIZE} linear ${DISK}${i} 0"
	done

	sleep 2

	for i in `seq 1 1 $PART_NUM`; do
		mkfs.xfs -f /dev/dm-$(( i - 1 ))
	done

	for i in `seq 1 1 $PART_NUM`; do
		mkdir /mnt/p${i}
	done

	for i in `seq 1 1 $PART_NUM`; do
		mount /dev/dm-$(( i - 1 )) /mnt/p${i}
	done
}

cleanup_test() {
	for i in `seq 1 1 $PART_NUM`; do
		umount /mnt/p${i}
	done

	for i in `seq 1 1 $PART_NUM`; do
		rm -rf /mnt/p${i}
	done

	for i in `seq 1 1 $PART_NUM`; do
		dmsetup remove linear${i}
	done
}

inject_error() {
	while true; do
		echo "Error = 0"
		echo 0 > /sys/kernel/debug/dm_debug/error
		echo 0 > /sys/kernel/debug/dm_debug/error_counter
		echo 100 > /sys/kernel/debug/dm_debug/error_max
		sleep 1
		echo "Drop caches"
		echo 1 > /proc/sys/vm/drop_caches
		sleep 1
		echo "Error = 1"
		echo 1 > /sys/kernel/debug/dm_debug/error
		sleep 3
	done
}

inject_stop() {
	echo 0 > /sys/kernel/debug/dm_debug/error
}

run_test() {
	truncate -s 0 md5sum.log
	inject_error &
	PID=$!
	for i in `seq 1 1 $PART_NUM`; do
		dd if=/dev/urandom bs=1M count=$FILE_SIZE of=/mnt/p${i}/file
		md5sum /mnt/p${i}/file >> md5sum.log
	done
	kill $PID
	inject_stop
	md5sum -c md5sum.log
}

prepare_test
run_test
cleanup_test

^ permalink raw reply related	[flat|nested] 5+ messages in thread