All of lore.kernel.org
 help / color / mirror / Atom feed
From: Dominique MARTINET <dominique.martinet@atmark-techno.com>
To: Nikolay Borisov <nborisov@suse.com>, Jens Axboe <axboe@kernel.dk>
Cc: io-uring@vger.kernel.org, linux-btrfs@vger.kernel.org
Subject: Re: read corruption with qemu master io_uring engine / linux master / btrfs(?)
Date: Wed, 29 Jun 2022 14:14:18 +0900	[thread overview]
Message-ID: <Yrvfqh0eqN0J5T6V@atmark-techno.com> (raw)
In-Reply-To: <YrueYDXqppHZzOsy@atmark-techno.com>

[-- Attachment #1: Type: text/plain, Size: 2008 bytes --]

Dominique MARTINET wrote on Wed, Jun 29, 2022 at 09:35:44AM +0900:
> I also agree writing a simple program like the io_uring test in the
> above commit that'd sort of do it like qemu and compare contents would
> be ideal.
> I'll have a stab at this today.

Okay, after half a day failing to reproduce I had a closer look at qemu
and... it's a qemu bug.

Well, there probably are two bugs, but one should be benign:

 - qemu short read handling was... rather disappointing.
Patch should appear here[1] eventually, but as it seems moderated?
I'm reposting it here:
-----
diff --git a/block/io_uring.c b/block/io_uring.c
index d48e472e74cb..d58aff9615ce 100644
--- a/block/io_uring.c
+++ b/block/io_uring.c
@@ -103,7 +103,7 @@ static void luring_resubmit_short_read(LuringState *s, LuringAIOCB *luringcb,
                       remaining);
 
     /* Update sqe */
-    luringcb->sqeq.off = nread;
+    luringcb->sqeq.off += nread;
     luringcb->sqeq.addr = (__u64)(uintptr_t)luringcb->resubmit_qiov.iov;
     luringcb->sqeq.len = luringcb->resubmit_qiov.niov;
 
-----
 (basically "just" a typo, but that must have never been tested!)
[1] https://lore.kernel.org/qemu-devel/20220629044957.1998430-1-dominique.martinet@atmark-techno.com


 - comments there also say short reads should never happen on newer
kernels (assuming local filesystems?) -- how true is that? If we're
doing our best kernel side to avoid short reads I guess we probably
ought to have a look at this.

It can easily be reproduced with a simple io_uring program -- see
example attached that eventually fails with the following error on
btrfs:
bad read result for io 8, offset 792227840: 266240 should be 1466368

but doesn't fail on tmpfs or without O_DIRECT.
feel free to butcher it, it's already a quickly hacked downversion of my
original test that had hash computation etc so the flow might feel a bit
weird.
Just compile with `gcc -o shortreads uring_shortreads.c -luring` and run
with file to read in argument.


Thanks!
-- 
Dominique

[-- Attachment #2: uring_shortreads.c --]
[-- Type: text/x-csrc, Size: 3359 bytes --]

/* Get O_DIRECT */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>
#include <liburing.h>
#include <sys/random.h>
#include <sys/stat.h>

long pagesize;
size_t n_blocks;
#define QUEUE_SIZE 10
char *buffers[QUEUE_SIZE];
int bufsize[QUEUE_SIZE];
struct iovec iovec[QUEUE_SIZE];
long int offsets[QUEUE_SIZE];

void breakme(void) {
}

int submit_read(struct io_uring *ring, int fd, int i) {
	struct io_uring_sqe *sqe;
	int ret;

	sqe = io_uring_get_sqe(ring);
	if (!sqe) {
		fprintf(stderr, "Failed to get io_uring sqe\n");
		return 1;
	}
	if (i == 0 || rand() % 2 == 0 || offsets[i-1] > n_blocks - bufsize[i]) {
		offsets[i] = rand() % (n_blocks - bufsize[i] + 1);
	} else {
		offsets[i] = offsets[i - 1];
	}
	io_uring_prep_readv(sqe, fd, iovec + i, 1, offsets[i] * pagesize);
	io_uring_sqe_set_data(sqe, (void*)(uintptr_t)i);
	ret = io_uring_submit(ring);
	if (ret != 1) {
		fprintf(stderr,	"submit failed\n");
		return 1;
	}
	return 0;
}

int getsize(int fd) {
	struct stat sb;
	if (fstat(fd, &sb)) {
		fprintf(stderr, "fstat: %m\n");
		return 1;
	}
	n_blocks = sb.st_size / pagesize;
	return 0;
}

int main(int argc, char *argv[])
{
	char *file, *mapfile;
	unsigned int seed;
	struct io_uring ring;
	struct io_uring_cqe *cqe;
	int fd, i;
	ssize_t ret;
	size_t total = 0;

	if (argc < 2 || argc > 3) {
		fprintf(stderr, "Use: %s <file> [<seed>]\n", argv[0]);
		return 1;
	}
	file = argv[1];
	if (argc == 3) {
		seed = atol(argv[2]);
	} else {
		getrandom(&seed, sizeof(seed), 0);
	}
	printf("random seed %u\n", seed);
	srand(seed);
	pagesize = sysconf(_SC_PAGE_SIZE);
	if (asprintf(&mapfile, "%s.map", file) < 0) {
		fprintf(stderr, "asprintf map %d\n", errno);
		return 1;
	}

	fd = open(file, O_RDONLY | O_DIRECT);
	if (fd == -1) {
		fprintf(stderr,
				"Failed to open file '%s': %s (errno %d)\n",
				file, strerror(errno), errno);
		return 1;
	}
	if (getsize(fd))
		return 1;

	for (i = 0 ; i < QUEUE_SIZE; i++) {
		bufsize[i] = (rand() % 1024) + 1;
		ret = posix_memalign((void**)&buffers[i], pagesize, bufsize[i] * pagesize);
		if (ret) {
			fprintf(stderr, "Failed to allocate read buffer\n");
			return 1;
		}
	}


	printf("Starting io_uring reads...\n");


	ret = io_uring_queue_init(QUEUE_SIZE, &ring, 0);
	if (ret != 0) {
		fprintf(stderr, "Failed to create io_uring queue\n");
		return 1;
	}


	for (i = 0 ; i < QUEUE_SIZE; i++) {
		iovec[i].iov_base = buffers[i];
		iovec[i].iov_len = bufsize[i] * pagesize;
		if (submit_read(&ring, fd, i))
			return 1;
	}

	while (total++ < 10000000) {
		if (total % 1000 == 0)
			printf("%zd\n", total);
		ret = io_uring_wait_cqe(&ring, &cqe);
		if (ret < 0) {
			fprintf(stderr, "Failed at io_uring_wait_cqe()\n");
			return 1;
		}
		i = (intptr_t)io_uring_cqe_get_data(cqe);
		if (cqe->res < 0) {
			fprintf(stderr, "bad read result for io %d, offset %zd: %d\n",
				i, offsets[i] * pagesize, cqe->res);
			breakme();
			return 1;
		}
		if (cqe->res != bufsize[i] * pagesize) {
			fprintf(stderr, "bad read result for io %d, offset %zd: %d should be %zd\n",
				i, offsets[i] * pagesize, cqe->res, bufsize[i] * pagesize);
			breakme();
			return 1;
		}
		io_uring_cqe_seen(&ring, cqe);

		// resubmit
		if (submit_read(&ring, fd, i))
			return 1;
	}
	io_uring_queue_exit(&ring);

	return 0;
}

  reply	other threads:[~2022-06-29  5:14 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-06-28  9:08 read corruption with qemu master io_uring engine / linux master / btrfs(?) Dominique MARTINET
2022-06-28 19:03 ` Nikolay Borisov
2022-06-29  0:35   ` Dominique MARTINET
2022-06-29  5:14     ` Dominique MARTINET [this message]
2022-06-29 15:37       ` Filipe Manana
2022-06-30  0:41         ` Dominique MARTINET
2022-06-30  7:56           ` Dominique MARTINET
2022-06-30 10:45             ` Filipe Manana
2022-06-30 11:09               ` Filipe Manana
2022-06-30 11:27               ` Dominique MARTINET
2022-06-30 12:51                 ` Filipe Manana
2022-06-30 13:08                   ` Dominique MARTINET
2022-06-30 15:10                     ` Filipe Manana
2022-07-01  1:25                       ` Dominique MARTINET
2022-07-01  8:45                         ` Filipe Manana
2022-07-01 10:33                           ` Filipe Manana
2022-07-01 23:48                             ` Dominique MARTINET
2022-06-28 19:05 ` Nikolay Borisov
2022-06-28 19:12 ` Jens Axboe

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=Yrvfqh0eqN0J5T6V@atmark-techno.com \
    --to=dominique.martinet@atmark-techno.com \
    --cc=axboe@kernel.dk \
    --cc=io-uring@vger.kernel.org \
    --cc=linux-btrfs@vger.kernel.org \
    --cc=nborisov@suse.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.