* [PATCH 1/2] libext2fs: mmap io manager
2014-01-30 23:50 [INSANE RFC PATCH 0/2] e2fsck metadata prefetch Darrick J. Wong
@ 2014-01-30 23:50 ` Darrick J. Wong
2014-01-30 23:50 ` [PATCH 2/2] libext2fs/e2fsck: implement metadata prefetching Darrick J. Wong
1 sibling, 0 replies; 10+ messages in thread
From: Darrick J. Wong @ 2014-01-30 23:50 UTC (permalink / raw)
To: tytso, darrick.wong; +Cc: linux-ext4
Implement an IO manager that uses a gigantic mmap of the disk device.
This enables us to experiment with multithreaded metadata prefetch,
where we spawn a bunch of threads to issue a massive amount of IO to
fault in metadata.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
e2fsck/unix.c | 7 +
lib/ext2fs/Makefile.in | 8 +
lib/ext2fs/ext2_io.h | 3
lib/ext2fs/mmap_io.c | 534 ++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 551 insertions(+), 1 deletion(-)
create mode 100644 lib/ext2fs/mmap_io.c
diff --git a/e2fsck/unix.c b/e2fsck/unix.c
index 67d7384..eeeef7c 100644
--- a/e2fsck/unix.c
+++ b/e2fsck/unix.c
@@ -1255,7 +1255,12 @@ restart:
test_io_backing_manager = unix_io_manager;
} else
#endif
- io_ptr = unix_io_manager;
+ {
+ if (getenv("TEST_MMAP_IO"))
+ io_ptr = mmap_io_manager;
+ else
+ io_ptr = unix_io_manager;
+ }
flags |= EXT2_FLAG_NOFREE_ON_ERROR;
profile_get_boolean(ctx->profile, "options", "old_bitmaps", 0, 0,
&old_bitmaps);
diff --git a/lib/ext2fs/Makefile.in b/lib/ext2fs/Makefile.in
index 29d3527..a1b5a01 100644
--- a/lib/ext2fs/Makefile.in
+++ b/lib/ext2fs/Makefile.in
@@ -67,6 +67,7 @@ OBJS= $(DEBUGFS_LIB_OBJS) $(RESIZE_LIB_OBJS) $(E2IMAGE_LIB_OBJS) \
lookup.o \
mkdir.o \
mkjournal.o \
+ mmap_io.o \
mmp.o \
namei.o \
native.o \
@@ -143,6 +144,7 @@ SRCS= ext2_err.c \
$(srcdir)/lookup.c \
$(srcdir)/mkdir.c \
$(srcdir)/mkjournal.c \
+ $(srcdir)/mmap_io.c \
$(srcdir)/mmp.c \
$(srcdir)/namei.c \
$(srcdir)/native.c \
@@ -825,6 +827,12 @@ mkjournal.o: $(srcdir)/mkjournal.c $(top_builddir)/lib/config.h \
$(top_builddir)/lib/ext2fs/ext2_err.h $(srcdir)/ext2_ext_attr.h \
$(srcdir)/bitops.h $(srcdir)/jfs_user.h $(srcdir)/kernel-jbd.h \
$(srcdir)/jfs_compat.h $(srcdir)/kernel-list.h
+mmap_io.o: $(srcdir)/mmap_io.c $(top_builddir)/lib/config.h \
+ $(top_builddir)/lib/dirpaths.h $(srcdir)/ext2_fs.h \
+ $(top_builddir)/lib/ext2fs/ext2_types.h $(srcdir)/ext2fs.h \
+ $(srcdir)/ext2_fs.h $(srcdir)/ext3_extents.h $(top_srcdir)/lib/et/com_err.h \
+ $(srcdir)/ext2_io.h $(top_builddir)/lib/ext2fs/ext2_err.h \
+ $(srcdir)/ext2_ext_attr.h $(srcdir)/bitops.h
mmp.o: $(srcdir)/mmp.c $(top_builddir)/lib/config.h \
$(top_builddir)/lib/dirpaths.h $(srcdir)/ext2_fs.h \
$(top_builddir)/lib/ext2fs/ext2_types.h $(srcdir)/ext2fs.h \
diff --git a/lib/ext2fs/ext2_io.h b/lib/ext2fs/ext2_io.h
index 1894fb8..39e0594 100644
--- a/lib/ext2fs/ext2_io.h
+++ b/lib/ext2fs/ext2_io.h
@@ -125,6 +125,9 @@ extern errcode_t io_channel_discard(io_channel channel,
extern errcode_t io_channel_alloc_buf(io_channel channel,
int count, void *ptr);
+/* mmap_io.c */
+extern io_manager mmap_io_manager;
+
/* unix_io.c */
extern io_manager unix_io_manager;
diff --git a/lib/ext2fs/mmap_io.c b/lib/ext2fs/mmap_io.c
new file mode 100644
index 0000000..37ca18b
--- /dev/null
+++ b/lib/ext2fs/mmap_io.c
@@ -0,0 +1,534 @@
+/*
+ * mmap_io.c --- This is the mmap implementation of the I/O manager.
+ *
+ * Copyright (C) 2014 by Oracle, Darrick J. Wong.
+ *
+ * %Begin-Header%
+ * This file may be redistributed under the terms of the GNU Library
+ * General Public License, version 2.
+ * %End-Header%
+ */
+
+#define _LARGEFILE_SOURCE
+#define _LARGEFILE64_SOURCE
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include "config.h"
+#include <stdio.h>
+#include <string.h>
+#if HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#if HAVE_ERRNO_H
+#include <errno.h>
+#endif
+#include <fcntl.h>
+#include <time.h>
+#ifdef __linux__
+#include <sys/utsname.h>
+#endif
+#ifdef HAVE_SYS_IOCTL_H
+#include <sys/ioctl.h>
+#endif
+#ifdef HAVE_SYS_MOUNT_H
+#include <sys/mount.h>
+#endif
+#if HAVE_SYS_STAT_H
+#include <sys/stat.h>
+#endif
+#if HAVE_SYS_TYPES_H
+#include <sys/types.h>
+#endif
+#if HAVE_SYS_RESOURCE_H
+#include <sys/resource.h>
+#endif
+#if HAVE_LINUX_FALLOC_H
+#include <linux/falloc.h>
+#endif
+#include <sys/mman.h>
+#include <stdint.h>
+
+#if defined(__linux__) && defined(_IO) && !defined(BLKROGET)
+#define BLKROGET _IO(0x12, 94) /* Get read-only status (0 = read_write). */
+#endif
+
+#undef ALIGN_DEBUG
+
+#include "ext2_fs.h"
+#include "ext2fs.h"
+
+/*
+ * For checking structure magic numbers...
+ *
+ * Makes the enclosing function return `code` when (struct)->magic is not
+ * equal to `code`.  The expansion is a bare `if` statement with no trailing
+ * semicolon, so use it only as a complete statement of its own.
+ */
+
+#define EXT2_CHECK_MAGIC(struct, code) \
+ if ((struct)->magic != (code)) return (code)
+
+/* Per-channel state of the mmap I/O manager (io_channel->private_data). */
+struct mmap_private_data {
+ int magic; /* set to EXT2_ET_MAGIC_UNIX_IO_CHANNEL by mmap_open() */
+ int dev; /* file descriptor of the opened device/file, -1 if none */
+ int flags; /* IO_FLAG_* bits that were passed to mmap_open() */
+ int access_time; /* not referenced by the code in this file */
+ ext2_loff_t offset; /* byte offset added to every location ("offset" option) */
+ void *bounce; /* not referenced by the code in this file */
+ struct struct_io_stats io_stats; /* bytes_read / bytes_written counters */
+ void *map; /* address returned by mmap() for the whole device */
+ blk64_t length; /* length of the mapping in bytes */
+};
+
+/*
+ * Forward declarations of the manager entry points, needed because the
+ * struct_mmap_manager table below refers to them before they are defined.
+ */
+static errcode_t mmap_open(const char *name, int flags, io_channel *channel);
+static errcode_t mmap_close(io_channel channel);
+static errcode_t mmap_set_blksize(io_channel channel, int blksize);
+static errcode_t mmap_read_blk(io_channel channel, unsigned long block,
+ int count, void *data);
+static errcode_t mmap_write_blk(io_channel channel, unsigned long block,
+ int count, const void *data);
+static errcode_t mmap_flush(io_channel channel);
+static errcode_t mmap_write_byte(io_channel channel, unsigned long offset,
+ int size, const void *data);
+static errcode_t mmap_set_option(io_channel channel, const char *option,
+ const char *arg);
+static errcode_t mmap_get_stats(io_channel channel, io_stats *stats)
+;
+static errcode_t mmap_read_blk64(io_channel channel, unsigned long long block,
+ int count, void *data);
+static errcode_t mmap_write_blk64(io_channel channel, unsigned long long block,
+ int count, const void *data);
+static errcode_t mmap_discard(io_channel channel, unsigned long long block,
+ unsigned long long count);
+
+/*
+ * Method table of the mmap I/O manager.  The initializer is positional, so
+ * the entries must stay in the member order of struct struct_io_manager
+ * (declared outside this file -- NOTE(review): confirm against ext2_io.h
+ * before reordering or inserting entries).
+ */
+static struct struct_io_manager struct_mmap_manager = {
+ EXT2_ET_MAGIC_IO_MANAGER,
+ "MMAP I/O Manager",
+ mmap_open,
+ mmap_close,
+ mmap_set_blksize,
+ mmap_read_blk,
+ mmap_write_blk,
+ mmap_flush,
+ mmap_write_byte,
+ mmap_set_option,
+ mmap_get_stats,
+ mmap_read_blk64,
+ mmap_write_blk64,
+ mmap_discard,
+};
+
+/* Public handle that callers assign to their io_manager pointer. */
+io_manager mmap_io_manager = &struct_mmap_manager;
+
+/*
+ * Give the caller a pointer to this channel's I/O statistics.
+ *
+ * Returns the magic code of whichever structure fails its magic check,
+ * otherwise 0.  A NULL `stats` is accepted and simply not written.
+ */
+static errcode_t mmap_get_stats(io_channel channel, io_stats *stats)
+{
+	struct mmap_private_data *priv;
+
+	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+	priv = (struct mmap_private_data *) channel->private_data;
+	EXT2_CHECK_MAGIC(priv, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
+
+	if (stats != NULL)
+		*stats = &priv->io_stats;
+
+	return 0;
+}
+
+/*
+ * Here are the raw I/O functions
+ */
+/*
+ * Copy data out of the mapping into bufv.
+ *
+ * count > 0 means `count` blocks of channel->block_size bytes, count < 0
+ * means -count bytes.  The source location is block * block_size plus the
+ * channel's byte offset.
+ *
+ * A request that is not fully inside the mapping is treated as a short
+ * read: the part that is inside is copied, the rest of the buffer is
+ * zeroed, and the channel's read_error handler (if any) decides the result;
+ * without a handler EXT2_ET_SHORT_READ is returned.  Touching memory past
+ * the mapping would otherwise fault the whole process.
+ */
+static errcode_t raw_read_blk(io_channel channel,
+			      struct mmap_private_data *data,
+			      unsigned long long block,
+			      int count, void *bufv)
+{
+	unsigned long long size;
+	unsigned long long avail = 0;
+	ext2_loff_t location;
+	errcode_t retval;
+	unsigned char *buf = bufv;
+
+	/* Widen before multiplying so large counts cannot overflow an int. */
+	if (count < 0)
+		size = (unsigned long long) (-(long long) count);
+	else
+		size = (unsigned long long) count * channel->block_size;
+	data->io_stats.bytes_read += size;
+	location = ((ext2_loff_t) block * channel->block_size) + data->offset;
+
+	/* Number of mapped bytes available starting at `location`. */
+	if (location >= 0 && (blk64_t) location <= data->length)
+		avail = data->length - (blk64_t) location;
+
+	if (size <= avail) {
+		memcpy(buf, (const char *) data->map + location,
+		       (size_t) size);
+		return 0;
+	}
+
+	if (avail)
+		memcpy(buf, (const char *) data->map + location,
+		       (size_t) avail);
+	memset(buf + avail, 0, (size_t) (size - avail));
+	retval = EXT2_ET_SHORT_READ;
+	if (channel->read_error)
+		retval = (channel->read_error)(channel, block, count, buf,
+					       (size_t) size, (int) avail,
+					       retval);
+	return retval;
+}
+
+/*
+ * Copy data from bufv into the mapping.
+ *
+ * count > 0 means `count` blocks of channel->block_size bytes, count < 0
+ * means -count bytes.  The destination is block * block_size plus the
+ * channel's byte offset.
+ *
+ * Nothing is written, and an error is reported, when
+ *  - the channel was opened without IO_FLAG_RW (the mapping is then
+ *    PROT_READ only and a store would fault): EBADF;
+ *  - the request is not fully inside the mapping: EXT2_ET_SHORT_WRITE.
+ * In both cases the channel's write_error handler, if set, gets the final
+ * say on the return value.
+ */
+static errcode_t raw_write_blk(io_channel channel,
+			       struct mmap_private_data *data,
+			       unsigned long long block,
+			       int count, const void *bufv)
+{
+	unsigned long long size;
+	unsigned long long avail = 0;
+	ext2_loff_t location;
+	errcode_t retval;
+	const unsigned char *buf = bufv;
+
+	/* Widen before multiplying so large counts cannot overflow an int. */
+	if (count < 0)
+		size = (unsigned long long) (-(long long) count);
+	else
+		size = (unsigned long long) count * channel->block_size;
+	data->io_stats.bytes_written += size;
+	location = ((ext2_loff_t) block * channel->block_size) + data->offset;
+
+	if (location >= 0 && (blk64_t) location <= data->length)
+		avail = data->length - (blk64_t) location;
+
+	if (!(data->flags & IO_FLAG_RW))
+		retval = EBADF;
+	else if (size > avail)
+		retval = EXT2_ET_SHORT_WRITE;
+	else {
+		memcpy((char *) data->map + location, buf, (size_t) size);
+		return 0;
+	}
+
+	if (channel->write_error)
+		retval = (channel->write_error)(channel, block, count, buf,
+						(size_t) size, 0, retval);
+	return retval;
+}
+
+#ifdef __linux__
+#ifndef BLKDISCARDZEROES
+#define BLKDISCARDZEROES _IO(0x12, 124)
+#endif
+#endif
+
+/*
+ * Open `name` and map the whole device/file with one shared mapping.
+ *
+ * flags are IO_FLAG_* bits: IO_FLAG_RW opens read-write and maps with
+ * PROT_WRITE, IO_FLAG_EXCLUSIVE adds O_EXCL, IO_FLAG_DIRECT_IO adds
+ * O_DIRECT (or F_NOCACHE where that is what the platform offers).
+ *
+ * On success *channel receives the new channel (block size 1024,
+ * refcount 1) and 0 is returned.  On failure everything acquired so far is
+ * released and an errcode_t / errno value is returned:
+ * EXT2_ET_BAD_DEVICE_NAME for a NULL name, EPERM for a read-only block
+ * device opened read-write, EINVAL for a zero-sized device, EFBIG when the
+ * device is too large to be mapped in this address space.
+ */
+static errcode_t mmap_open(const char *name, int flags, io_channel *channel)
+{
+	io_channel	io = NULL;
+	struct mmap_private_data *data = NULL;
+	errcode_t	retval;
+	int		open_flags;
+	int		f_nocache = 0;
+	ext2fs_struct_stat st;
+#ifdef __linux__
+	struct utsname ut;
+#endif
+
+	if (name == NULL)
+		return EXT2_ET_BAD_DEVICE_NAME;
+	retval = ext2fs_get_mem(sizeof(struct struct_io_channel), &io);
+	if (retval)
+		goto cleanup;
+	memset(io, 0, sizeof(struct struct_io_channel));
+	io->magic = EXT2_ET_MAGIC_IO_CHANNEL;
+	retval = ext2fs_get_mem(sizeof(struct mmap_private_data), &data);
+	if (retval)
+		goto cleanup;
+
+	/*
+	 * Initialise the private data right away: the cleanup path looks at
+	 * data->dev, so it must never hold allocator garbage.
+	 */
+	memset(data, 0, sizeof(struct mmap_private_data));
+	data->magic = EXT2_ET_MAGIC_UNIX_IO_CHANNEL;
+	data->io_stats.num_fields = 2;
+	data->dev = -1;
+
+	io->manager = mmap_io_manager;
+	retval = ext2fs_get_mem(strlen(name)+1, &io->name);
+	if (retval)
+		goto cleanup;
+
+	strcpy(io->name, name);
+	io->private_data = data;
+	io->block_size = 1024;
+	io->read_error = 0;
+	io->write_error = 0;
+	io->refcount = 1;
+
+	open_flags = (flags & IO_FLAG_RW) ? O_RDWR : O_RDONLY;
+	if (flags & IO_FLAG_EXCLUSIVE)
+		open_flags |= O_EXCL;
+#if defined(O_DIRECT)
+	if (flags & IO_FLAG_DIRECT_IO)
+		open_flags |= O_DIRECT;
+#elif defined(F_NOCACHE)
+	if (flags & IO_FLAG_DIRECT_IO)
+		f_nocache = F_NOCACHE;
+#endif
+	data->flags = flags;
+
+	data->dev = ext2fs_open_file(io->name, open_flags, 0);
+	if (data->dev < 0) {
+		retval = errno;
+		goto cleanup;
+	}
+	if (f_nocache) {
+		if (fcntl(data->dev, f_nocache, 1) < 0) {
+			retval = errno;
+			goto cleanup;
+		}
+	}
+
+	/*
+	 * If the device is really a block device, then set the
+	 * appropriate flag.  Otherwise we can set the DISCARD_ZEROES flag,
+	 * because we are going to use punch hole instead of discard, and
+	 * if that succeeds, subsequent reads from the sparse area return
+	 * zeroes.
+	 */
+	if (ext2fs_stat(io->name, &st) == 0) {
+		if (S_ISBLK(st.st_mode))
+			io->flags |= CHANNEL_FLAGS_BLOCK_DEVICE;
+		else
+			io->flags |= CHANNEL_FLAGS_DISCARD_ZEROES;
+	}
+
+#ifdef BLKDISCARDZEROES
+	{
+		int zeroes = 0;
+		if (ioctl(data->dev, BLKDISCARDZEROES, &zeroes) == 0 &&
+		    zeroes)
+			io->flags |= CHANNEL_FLAGS_DISCARD_ZEROES;
+	}
+#endif
+
+#ifdef BLKROGET
+	if (flags & IO_FLAG_RW) {
+		int error;
+		int readonly = 0;
+
+		/* Is the block device actually writable? */
+		error = ioctl(data->dev, BLKROGET, &readonly);
+		if (!error && readonly) {
+			retval = EPERM;
+			goto cleanup;
+		}
+	}
+#endif
+
+	/* Device size comes back in units of 1024 bytes. */
+	retval = ext2fs_get_device_size2(name, 1024, &data->length);
+	if (retval)
+		goto cleanup;
+	if (data->length == 0) {
+		retval = EINVAL;
+		goto cleanup;
+	}
+	/*
+	 * The byte length must survive both the multiplication and the
+	 * conversion to mmap()'s size_t argument; otherwise a truncated
+	 * mapping would be created and later accesses would fault.
+	 */
+	if (data->length > SIZE_MAX / 1024) {
+		retval = EFBIG;
+		goto cleanup;
+	}
+	data->length *= 1024;
+	data->map = mmap(NULL, data->length,
+			 PROT_READ | (flags & IO_FLAG_RW ? PROT_WRITE : 0),
+			 MAP_SHARED, data->dev, 0);
+	if (data->map == MAP_FAILED) {
+		retval = errno;
+		goto cleanup;
+	}
+
+#ifdef __linux__
+#undef RLIM_INFINITY
+#if (defined(__alpha__) || ((defined(__sparc__) || defined(__mips__)) && (SIZEOF_LONG == 4)))
+#define RLIM_INFINITY	((unsigned long)(~0UL>>1))
+#else
+#define RLIM_INFINITY  (~0UL)
+#endif
+	/*
+	 * Work around a bug in 2.4.10-2.4.18 kernels where writes to
+	 * block devices are wrongly getting hit by the filesize
+	 * limit.  This workaround isn't perfect, since it won't work
+	 * if glibc wasn't built against 2.2 header files.  (Sigh.)
+	 *
+	 */
+	if ((flags & IO_FLAG_RW) &&
+	    (uname(&ut) == 0) &&
+	    ((ut.release[0] == '2') && (ut.release[1] == '.') &&
+	     (ut.release[2] == '4') && (ut.release[3] == '.') &&
+	     (ut.release[4] == '1') && (ut.release[5] >= '0') &&
+	     (ut.release[5] < '8')) &&
+	    (ext2fs_stat(io->name, &st) == 0) &&
+	    (S_ISBLK(st.st_mode))) {
+		struct rlimit	rlim;
+
+		rlim.rlim_cur = rlim.rlim_max = (unsigned long) RLIM_INFINITY;
+		setrlimit(RLIMIT_FSIZE, &rlim);
+		getrlimit(RLIMIT_FSIZE, &rlim);
+		if (((unsigned long) rlim.rlim_cur) <
+		    ((unsigned long) rlim.rlim_max)) {
+			rlim.rlim_cur = rlim.rlim_max;
+			setrlimit(RLIMIT_FSIZE, &rlim);
+		}
+	}
+#endif
+	*channel = io;
+	return 0;
+
+cleanup:
+	if (data) {
+		if (data->dev >= 0)
+			close(data->dev);
+		ext2fs_free_mem(&data);
+	}
+	if (io) {
+		if (io->name)
+			ext2fs_free_mem(&io->name);
+		ext2fs_free_mem(&io);
+	}
+	return retval;
+}
+
+/*
+ * Drop one reference to the channel.  When the last reference goes away
+ * the mapping is removed, the descriptor is closed and the channel, its
+ * name and its private data are freed.
+ *
+ * Returns 0, the magic code of a structure that failed its magic check,
+ * or the errno left by a failing close().
+ */
+static errcode_t mmap_close(io_channel channel)
+{
+	struct mmap_private_data *priv;
+	errcode_t status;
+
+	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+	priv = (struct mmap_private_data *) channel->private_data;
+	EXT2_CHECK_MAGIC(priv, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
+
+	channel->refcount -= 1;
+	if (channel->refcount > 0)
+		return 0;
+
+	munmap(priv->map, priv->length);
+	status = (close(priv->dev) < 0) ? errno : 0;
+
+	ext2fs_free_mem(&channel->private_data);
+	if (channel->name != NULL)
+		ext2fs_free_mem(&channel->name);
+	ext2fs_free_mem(&channel);
+
+	return status;
+}
+
+/*
+ * Record a new block size on the channel.  The value is stored as given;
+ * later block reads and writes multiply block numbers and counts by it.
+ */
+static errcode_t mmap_set_blksize(io_channel channel, int blksize)
+{
+	struct mmap_private_data *priv;
+
+	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+	priv = (struct mmap_private_data *) channel->private_data;
+	EXT2_CHECK_MAGIC(priv, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
+
+	channel->block_size = blksize;
+
+	return 0;
+}
+
+
+/*
+ * 64-bit block read entry point: validate the channel and its private data,
+ * then let raw_read_blk() copy from the mapping into buf.
+ */
+static errcode_t mmap_read_blk64(io_channel channel, unsigned long long block,
+				 int count, void *buf)
+{
+	struct mmap_private_data *priv;
+	errcode_t status;
+
+	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+	priv = (struct mmap_private_data *) channel->private_data;
+	EXT2_CHECK_MAGIC(priv, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
+
+	status = raw_read_blk(channel, priv, block, count, buf);
+	return status;
+}
+
+/* Legacy block read entry point; widens the block number and forwards. */
+static errcode_t mmap_read_blk(io_channel channel, unsigned long block,
+			       int count, void *buf)
+{
+	unsigned long long wide_block = block;
+
+	return mmap_read_blk64(channel, wide_block, count, buf);
+}
+
+/*
+ * 64-bit block write entry point: validate the channel and its private
+ * data, then let raw_write_blk() copy buf into the mapping.
+ */
+static errcode_t mmap_write_blk64(io_channel channel, unsigned long long block,
+				  int count, const void *buf)
+{
+	struct mmap_private_data *priv;
+	errcode_t status;
+
+	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+	priv = (struct mmap_private_data *) channel->private_data;
+	EXT2_CHECK_MAGIC(priv, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
+
+	status = raw_write_blk(channel, priv, block, count, buf);
+	return status;
+}
+
+/* Legacy block write entry point; widens the block number and forwards. */
+static errcode_t mmap_write_blk(io_channel channel, unsigned long block,
+				int count, const void *buf)
+{
+	unsigned long long wide_block = block;
+
+	return mmap_write_blk64(channel, wide_block, count, buf);
+}
+
+/*
+ * Write `size` bytes from buf at byte position `offset` (plus the channel's
+ * configured byte offset) of the mapping.
+ *
+ * Returns 0 on success, EXT2_ET_INVALID_ARGUMENT for a negative size,
+ * EBADF when the channel was not opened with IO_FLAG_RW (the mapping is
+ * then read-only and a store would fault), and EXT2_ET_SHORT_WRITE when
+ * the range is not fully inside the mapping.  Nothing is written on error.
+ */
+static errcode_t mmap_write_byte(io_channel channel, unsigned long offset,
+				 int size, const void *buf)
+{
+	struct mmap_private_data *data;
+	ext2_loff_t location;
+
+	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+	data = (struct mmap_private_data *) channel->private_data;
+	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
+
+	if (size < 0)
+		return EXT2_ET_INVALID_ARGUMENT;
+	if (!(data->flags & IO_FLAG_RW))
+		return EBADF;
+
+	location = (ext2_loff_t) offset + data->offset;
+	if (location < 0 || (blk64_t) location > data->length ||
+	    (blk64_t) size > data->length - (blk64_t) location)
+		return EXT2_ET_SHORT_WRITE;
+
+	memcpy((char *) data->map + location, buf, (size_t) size);
+	return 0;
+}
+
+/*
+ * Flush data buffers to disk.
+ *
+ * Writes go through a shared mapping, so the dirty pages are pushed out
+ * with msync(MS_SYNC) first; a failure there is returned as errno.  msync
+ * is skipped for channels opened without IO_FLAG_RW, which cannot have
+ * dirtied the mapping.  The fsync() that follows stays best-effort: its
+ * result is ignored.
+ */
+static errcode_t mmap_flush(io_channel channel)
+{
+	struct mmap_private_data *data;
+	errcode_t	retval = 0;
+
+	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+	data = (struct mmap_private_data *) channel->private_data;
+	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
+
+	if ((data->flags & IO_FLAG_RW) &&
+	    msync(data->map, data->length, MS_SYNC) < 0)
+		retval = errno;
+
+	fsync(data->dev);
+	return retval;
+}
+
+/*
+ * Set a named option on the channel.
+ *
+ * Only "offset" is understood: `arg` is parsed with strtoull() (base
+ * auto-detected) and becomes the byte offset added to every location
+ * accessed through the channel.  A missing argument, trailing garbage, or
+ * a value that does not fit a non-negative ext2_loff_t yields
+ * EXT2_ET_INVALID_ARGUMENT and leaves the current offset untouched.  Any
+ * other option name also yields EXT2_ET_INVALID_ARGUMENT.
+ */
+static errcode_t mmap_set_option(io_channel channel, const char *option,
+				 const char *arg)
+{
+	struct mmap_private_data *data;
+	unsigned long long tmp;
+	char *end;
+
+	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+	data = (struct mmap_private_data *) channel->private_data;
+	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
+
+	if (!strcmp(option, "offset")) {
+		if (!arg)
+			return EXT2_ET_INVALID_ARGUMENT;
+
+		tmp = strtoull(arg, &end, 0);
+		if (*end)
+			return EXT2_ET_INVALID_ARGUMENT;
+		/* Validate before storing so a bad value never sticks. */
+		if ((ext2_loff_t) tmp < 0)
+			return EXT2_ET_INVALID_ARGUMENT;
+		data->offset = (ext2_loff_t) tmp;
+		return 0;
+	}
+	return EXT2_ET_INVALID_ARGUMENT;
+}
+
+#if defined(__linux__) && !defined(BLKDISCARD)
+#define BLKDISCARD _IO(0x12, 119)
+#endif
+
+/*
+ * Discard `count` blocks starting at `block`.
+ *
+ * Block devices get a BLKDISCARD ioctl, everything else a hole-punching
+ * fallocate(); both act on the byte range
+ *   [block * block_size + channel offset, + count * block_size).
+ * The channel offset is included so that the range matches the bytes the
+ * read and write paths of this manager touch for the same block numbers.
+ *
+ * Returns 0 on success, EXT2_ET_UNIMPLEMENTED when the needed facility is
+ * not compiled in or the kernel answers EOPNOTSUPP, otherwise errno.
+ */
+static errcode_t mmap_discard(io_channel channel, unsigned long long block,
+			      unsigned long long count)
+{
+	struct mmap_private_data *data;
+	int		ret;
+
+	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+	data = (struct mmap_private_data *) channel->private_data;
+	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
+
+	if (channel->flags & CHANNEL_FLAGS_BLOCK_DEVICE) {
+#ifdef BLKDISCARD
+		__uint64_t range[2];
+
+		range[0] = (__uint64_t)(block) * channel->block_size +
+			   data->offset;
+		range[1] = (__uint64_t)(count) * channel->block_size;
+
+		ret = ioctl(data->dev, BLKDISCARD, &range);
+#else
+		goto unimplemented;
+#endif
+	} else {
+#if defined(HAVE_FALLOCATE) && defined(FALLOC_FL_PUNCH_HOLE)
+		/*
+		 * If we are not on block device, try to use punch hole
+		 * to reclaim free space.
+		 */
+		ret = fallocate(data->dev,
+				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+				(off_t)(block) * channel->block_size +
+					data->offset,
+				(off_t)(count) * channel->block_size);
+#else
+		goto unimplemented;
+#endif
+	}
+	if (ret < 0) {
+		if (errno == EOPNOTSUPP)
+			goto unimplemented;
+		return errno;
+	}
+	return 0;
+unimplemented:
+	return EXT2_ET_UNIMPLEMENTED;
+}
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH 2/2] libext2fs/e2fsck: implement metadata prefetching
2014-01-30 23:50 [INSANE RFC PATCH 0/2] e2fsck metadata prefetch Darrick J. Wong
2014-01-30 23:50 ` [PATCH 1/2] libext2fs: mmap io manager Darrick J. Wong
@ 2014-01-30 23:50 ` Darrick J. Wong
[not found] ` <45DEEA58-69FD-42EF-BB51-1A8D80000469@dilger.ca>
1 sibling, 1 reply; 10+ messages in thread
From: Darrick J. Wong @ 2014-01-30 23:50 UTC (permalink / raw)
To: tytso, darrick.wong; +Cc: linux-ext4
Use threads with our new mmap IO manager to prefetch metadata. This
results in a major e2fsck run time speedup. There's also a stupider
multiprocess version that works with the good old UNIX IO manager to
get pages into the page cache.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
e2fsck/unix.c | 13 +
lib/ext2fs/Makefile.in | 8 +
lib/ext2fs/ext2fs.h | 13 +
lib/ext2fs/prefetch.c | 456 ++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 490 insertions(+)
create mode 100644 lib/ext2fs/prefetch.c
diff --git a/e2fsck/unix.c b/e2fsck/unix.c
index eeeef7c..33afc06 100644
--- a/e2fsck/unix.c
+++ b/e2fsck/unix.c
@@ -1181,6 +1181,7 @@ int main (int argc, char *argv[])
__u32 features[3];
char *cp;
int qtype = -99; /* quota type */
+ struct ext2fs_prefetch_handle *h = NULL;
clear_problem_context(&pctx);
sigcatcher_setup();
@@ -1638,9 +1639,21 @@ print_unsupp_features:
quota_init_context(&ctx->qctx, ctx->fs, qtype);
}
+ if (getenv("PREFETCH")) {
+ int flags = PREFETCH_INODES | PREFETCH_DIRS | PREFETCH_THREADED;
+ if (getenv("PREFETCH_WAIT"))
+ flags &= ~PREFETCH_THREADED;
+ retval = ext2fs_prefetch(fs, flags, &h);
+ if (retval)
+ com_err(ctx->program_name, retval, "prefetching");
+ }
+
run_result = e2fsck_run(ctx);
e2fsck_clear_progbar(ctx);
+ if (h)
+ ext2fs_prefetch_free(&h);
+
if (ctx->flags & E2F_FLAG_JOURNAL_INODE) {
if (fix_problem(ctx, PR_6_RECREATE_JOURNAL, &pctx)) {
if (journal_size < 1024)
diff --git a/lib/ext2fs/Makefile.in b/lib/ext2fs/Makefile.in
index a1b5a01..9fbf2b5 100644
--- a/lib/ext2fs/Makefile.in
+++ b/lib/ext2fs/Makefile.in
@@ -73,6 +73,7 @@ OBJS= $(DEBUGFS_LIB_OBJS) $(RESIZE_LIB_OBJS) $(E2IMAGE_LIB_OBJS) \
native.o \
newdir.o \
openfs.o \
+ prefetch.o \
progress.o \
punch.o \
qcow2.o \
@@ -150,6 +151,7 @@ SRCS= ext2_err.c \
$(srcdir)/native.c \
$(srcdir)/newdir.c \
$(srcdir)/openfs.c \
+ $(srcdir)/prefetch.c \
$(srcdir)/progress.c \
$(srcdir)/punch.c \
$(srcdir)/qcow2.c \
@@ -863,6 +865,12 @@ openfs.o: $(srcdir)/openfs.c $(top_builddir)/lib/config.h \
$(srcdir)/ext2_fs.h $(srcdir)/ext3_extents.h $(top_srcdir)/lib/et/com_err.h \
$(srcdir)/ext2_io.h $(top_builddir)/lib/ext2fs/ext2_err.h \
$(srcdir)/ext2_ext_attr.h $(srcdir)/bitops.h $(srcdir)/e2image.h
+prefetch.o: $(srcdir)/prefetch.c $(top_builddir)/lib/config.h \
+ $(top_builddir)/lib/dirpaths.h $(srcdir)/ext2fs.h \
+ $(top_builddir)/lib/ext2fs/ext2_types.h $(srcdir)/ext2_fs.h \
+ $(srcdir)/ext3_extents.h $(top_srcdir)/lib/et/com_err.h $(srcdir)/ext2_io.h \
+ $(top_builddir)/lib/ext2fs/ext2_err.h $(srcdir)/ext2_ext_attr.h \
+ $(srcdir)/bitops.h $(srcdir)/ext2fsP.h
progress.o: $(srcdir)/progress.c $(top_builddir)/lib/config.h \
$(top_builddir)/lib/dirpaths.h $(srcdir)/ext2fs.h \
$(top_builddir)/lib/ext2fs/ext2_types.h $(srcdir)/ext2_fs.h \
diff --git a/lib/ext2fs/ext2fs.h b/lib/ext2fs/ext2fs.h
index ba5c388..e634d2c 100644
--- a/lib/ext2fs/ext2fs.h
+++ b/lib/ext2fs/ext2fs.h
@@ -1522,6 +1522,19 @@ errcode_t ext2fs_mmp_update2(ext2_filsys fs, int immediately);
errcode_t ext2fs_mmp_stop(ext2_filsys fs);
unsigned ext2fs_mmp_new_seq(void);
+/* prefetch.c */
+#define PREFETCH_THREADED (0x00000001)
+#define PREFETCH_ERROR_ABORT (0x00000002)
+#define PREFETCH_BITMAPS (0x00000004)
+#define PREFETCH_INODES (0x00000008)
+#define PREFETCH_MAPS (0x00000010)
+#define PREFETCH_DIRS (0x00000020)
+struct ext2fs_prefetch_handle;
+errcode_t ext2fs_prefetch(ext2_filsys fs, int flags,
+ struct ext2fs_prefetch_handle **h);
+errcode_t ext2fs_prefetch_wait(struct ext2fs_prefetch_handle *h);
+errcode_t ext2fs_prefetch_free(struct ext2fs_prefetch_handle **h);
+
/* read_bb.c */
extern errcode_t ext2fs_read_bb_inode(ext2_filsys fs,
ext2_badblocks_list *bb_list);
diff --git a/lib/ext2fs/prefetch.c b/lib/ext2fs/prefetch.c
new file mode 100644
index 0000000..022af41
--- /dev/null
+++ b/lib/ext2fs/prefetch.c
@@ -0,0 +1,456 @@
+/*
+ * prefetch.c --- Prefetch filesystem metadata.
+ *
+ * Copyright (C) 2014 by Oracle, Darrick J. Wong.
+ *
+ * %Begin-Header%
+ * This file may be redistributed under the terms of the GNU Library
+ * General Public License, version 2.
+ * %End-Header%
+ */
+
+#define _LARGEFILE_SOURCE
+#define _LARGEFILE64_SOURCE
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <time.h>
+#include <pthread.h>
+
+#include "config.h"
+#include "ext2_fs.h"
+#include "ext2fs.h"
+
+#define USE_THREADS 1
+#define USE_SUPER 1
+
+/* State of one prefetch request, handed back to the caller as a handle. */
+struct ext2fs_prefetch_handle {
+ ext2_filsys fs; /* filesystem being prefetched */
+ int flags; /* PREFETCH_* bits given to ext2fs_prefetch() */
+ int done; /* set to 1 once prefetching has finished or been waited for */
+ pid_t pid; /* forked prefetch process (unix io manager path), else 0 */
+#ifdef USE_THREADS
+ pthread_t tid; /* prefetch thread (mmap io manager path) */
+#endif
+};
+
+/*
+ * Block-iterator callback that does nothing with the data block it is
+ * given and asks the iterator to carry on (returns 0).
+ */
+static int ignore_block(ext2_filsys fs, blk64_t *blocknr, e2_blkcnt_t blockcnt,
+			blk64_t ref_blk, int ref_offset, void *priv_data)
+{
+	const int keep_going = 0;
+
+	return keep_going;
+}
+
+/* Private data handed to the block-iterator callbacks. */
+struct dirent_iterate {
+ void *buf; /* scratch buffer that dirent_block() reads one block into */
+ int flags; /* PREFETCH_* bits; PREFETCH_ERROR_ABORT stops on read errors */
+};
+
+/*
+ * Block-iterator callback for directories: read the block into the scratch
+ * buffer from priv_data.  A read error stops the iteration (BLOCK_ABORT)
+ * only when PREFETCH_ERROR_ABORT is set; otherwise it is ignored.
+ */
+static int dirent_block(ext2_filsys fs, blk64_t *blocknr, e2_blkcnt_t blockcnt,
+			blk64_t ref_blk, int ref_offset, void *priv_data)
+{
+	struct dirent_iterate *ctx = priv_data;
+	errcode_t rc;
+
+	rc = io_channel_read_blk64(fs->io, *blocknr, 1, ctx->buf);
+	if (!rc)
+		return 0;
+
+	return (ctx->flags & PREFETCH_ERROR_ABORT) ? BLOCK_ABORT : 0;
+}
+
+/*
+ * First dumb prefetch implementation: Separate process, just read data to
+ * get it into the page cache, at least.
+ *
+ * Depending on `flags`:
+ *  - PREFETCH_BITMAPS: load the bitmaps;
+ *  - PREFETCH_INODES alone: read every initialised inode table;
+ *  - PREFETCH_MAPS and/or PREFETCH_DIRS: walk all inodes, iterate the
+ *    blocks of each in-use inode (directory blocks are read, other data
+ *    blocks only cause their mapping to be walked) and read the inode's
+ *    extended-attribute block.
+ * With PREFETCH_ERROR_ABORT the first error ends the prefetch; without it
+ * errors are skipped.  All of this is best effort and nothing is reported
+ * back to the caller.
+ */
+static void do_ext2fs_prefetch(ext2_filsys fs, int flags)
+{
+	void			*buf;
+	blk64_t			blk;
+	dgrp_t			i;
+	ext2_inode_scan		scan;
+	ext2_ino_t		ino;
+	errcode_t		err;
+	struct ext2_inode	inode;
+	struct dirent_iterate	di;
+	unsigned int		blocks_to_read;
+	int (*func)(ext2_filsys fs, blk64_t *blocknr, e2_blkcnt_t blockcnt,
+		    blk64_t ref_blk, int ref_offset, void *priv_data);
+
+	/* One buffer big enough for a whole group's inode table. */
+	err = ext2fs_get_array(fs->blocksize, fs->inode_blocks_per_group, &buf);
+	if (err)
+		return;
+
+	/* load bitmaps */
+	if (flags & PREFETCH_BITMAPS) {
+		err = ext2fs_read_bitmaps(fs);
+		if (err && (flags & PREFETCH_ERROR_ABORT))
+			goto out;
+	}
+
+	/*
+	 * load inode tables -- only when no inode walk is requested; the
+	 * walk below goes through the inode tables itself.
+	 */
+	if ((flags & PREFETCH_INODES) &&
+	    !(flags & (PREFETCH_MAPS | PREFETCH_DIRS))) {
+		for (i = 0; i < fs->group_desc_count; i++) {
+			if (ext2fs_bg_flags_test(fs, i, EXT2_BG_INODE_UNINIT))
+				continue;
+
+			blocks_to_read = fs->inode_blocks_per_group;
+			if (ext2fs_has_group_desc_csum(fs)) {
+				unsigned int ipg =
+					fs->super->s_inodes_per_group;
+				unsigned int unused =
+					ext2fs_bg_itable_unused(fs, i);
+				unsigned long long bytes;
+
+				/* Corrupt counts must not underflow. */
+				if (unused >= ipg)
+					continue;
+				bytes = (unsigned long long) (ipg - unused) *
+					EXT2_INODE_SIZE(fs->super);
+				/* Round up so a partial last block is read. */
+				blocks_to_read = (bytes + fs->blocksize - 1) /
+						 fs->blocksize;
+				/* Never read more than buf can hold. */
+				if (blocks_to_read >
+				    fs->inode_blocks_per_group)
+					blocks_to_read =
+						fs->inode_blocks_per_group;
+			}
+
+			blk = ext2fs_inode_table_loc(fs, i);
+			err = io_channel_read_blk64(fs->io, blk,
+						    blocks_to_read, buf);
+			if (err && (flags & PREFETCH_ERROR_ABORT))
+				goto out;
+		}
+	}
+
+	/* load inodes */
+	if (!(flags & (PREFETCH_MAPS | PREFETCH_DIRS)))
+		goto out;
+
+	/* Without a scan handle there is nothing to walk or to close. */
+	err = ext2fs_open_inode_scan(fs, 0, &scan);
+	if (err)
+		goto out;
+
+	di.buf = buf;
+	di.flags = flags;
+	for (;;) {
+		err = ext2fs_get_next_inode_full(scan, &ino, &inode,
+						 sizeof(inode));
+		if (err || !ino)
+			break;
+
+		/*
+		 * Skip inodes that are not in use.  The inode bitmap is only
+		 * there if somebody loaded it; otherwise fall back to the
+		 * link count.
+		 */
+		if (fs->inode_map) {
+			if (!ext2fs_test_inode_bitmap2(fs->inode_map, ino))
+				continue;
+		} else if (!inode.i_links_count)
+			continue;
+
+		/*
+		 * i_block of fast symlinks, device nodes etc. does not hold
+		 * a block map and must not be fed to the iterator.
+		 */
+		if (ext2fs_inode_has_valid_blocks2(fs, &inode)) {
+			/*
+			 * Directory blocks are read; for everything else
+			 * walking the map is all that is wanted.
+			 */
+			func = LINUX_S_ISDIR(inode.i_mode) ? dirent_block :
+							     ignore_block;
+			err = ext2fs_block_iterate3(fs, ino,
+					BLOCK_FLAG_READ_ONLY |
+					BLOCK_FLAG_DATA_ONLY,
+					NULL, func, &di);
+			if (err && (flags & PREFETCH_ERROR_ABORT))
+				break;
+		}
+
+		/* Extended attribute block, if the inode has one. */
+		blk = ext2fs_file_acl_block(fs, &inode);
+		if (!blk)
+			continue;
+		err = io_channel_read_blk64(fs->io, blk, 1, buf);
+		if (err && (flags & PREFETCH_ERROR_ABORT))
+			break;
+	}
+
+	ext2fs_close_inode_scan(scan);
+
+out:
+	ext2fs_free_mem(&buf);
+}
+
+/*
+ * pthread start routine: run the simple prefetcher with the filesystem and
+ * flags stored in the handle passed as `data`.  Always returns NULL.
+ */
+static void *prefetch_thread(void *data)
+{
+	struct ext2fs_prefetch_handle *handle = data;
+	ext2_filsys fs = handle->fs;
+	int flags = handle->flags;
+
+	do_ext2fs_prefetch(fs, flags);
+
+	return NULL;
+}
+
+/*
+ * Second, less dumb prefetch: Use threads to preload metadata in group order.
+ */
+/* One block group that has inodes worth prefetching. */
+struct super_entry {
+ dgrp_t group; /* block group number */
+ ext2_ino_t num_inodes; /* inodes to visit, counted from the start of the group's table */
+};
+
+/* Work description for one prefetch worker thread. */
+struct super_thread {
+ ext2_filsys fs; /* filesystem handle the worker operates on */
+ pthread_t tid; /* thread id filled in by pthread_create() */
+ int flags; /* PREFETCH_* bits */
+ struct super_entry *start, *end; /* first entry for this worker, and one past the last entry overall */
+ unsigned int skip_factor; /* stride through the entry array between visited entries */
+ void *buf; /* worker's private scratch buffer */
+};
+
+/*
+ * Worker thread body.  Visits every skip_factor-th entry starting at
+ * t->start (never reaching t->end) in two passes:
+ *  1. read the used part of each group's inode table;
+ *  2. for each inode of those groups that is linked and has a file type,
+ *     iterate its blocks (directory blocks are read, other data blocks
+ *     only have their mapping walked) and read its extended-attribute
+ *     block.
+ * With PREFETCH_ERROR_ABORT the worker stops at the first read or
+ * iteration error; otherwise errors are skipped.  Always returns NULL.
+ *
+ * t->buf must be able to hold a full inode table
+ * (blocksize * inode_blocks_per_group bytes); reads are clamped to that.
+ */
+static void *super_func(void *data)
+{
+	struct super_thread *t = data;
+	ext2_filsys fs = t->fs;
+	int flags = t->flags;
+	int abort_on_error = flags & PREFETCH_ERROR_ABORT;
+	void *buf = t->buf;
+	struct super_entry *e;
+	size_t nr = (t->end > t->start) ? (size_t) (t->end - t->start) : 0;
+	size_t step = t->skip_factor ? t->skip_factor : 1;
+	size_t idx;
+	ext2_ino_t ipg = fs->super->s_inodes_per_group;
+	ext2_ino_t i, ino, count;
+	unsigned long long bytes;
+	unsigned int blocks_to_read;
+	blk64_t blk;
+	struct dirent_iterate di;
+	struct ext2_inode inode;
+	errcode_t err;
+	int (*func)(ext2_filsys fs, blk64_t *blocknr, e2_blkcnt_t blockcnt,
+		    blk64_t ref_blk, int ref_offset, void *priv_data);
+
+	/* Read the inode tables */
+	for (idx = 0; idx < nr; idx += step) {
+		e = t->start + idx;
+		count = (e->num_inodes > ipg) ? ipg : e->num_inodes;
+		bytes = (unsigned long long) count *
+			EXT2_INODE_SIZE(fs->super);
+		/* Round up so inodes in a partial last block are covered. */
+		blocks_to_read = (bytes + fs->blocksize - 1) / fs->blocksize;
+		if (blocks_to_read > fs->inode_blocks_per_group)
+			blocks_to_read = fs->inode_blocks_per_group;
+		if (!blocks_to_read)
+			continue;
+		blk = ext2fs_inode_table_loc(fs, e->group);
+		err = io_channel_read_blk64(fs->io, blk, blocks_to_read, buf);
+		if (err && abort_on_error)
+			return NULL;
+	}
+
+	/* Scan inodes for extent/dir blocks */
+	di.buf = buf;
+	di.flags = flags;
+	for (idx = 0; idx < nr; idx += step) {
+		e = t->start + idx;
+		count = (e->num_inodes > ipg) ? ipg : e->num_inodes;
+		for (i = 0; i < count; i++) {
+			/* Inode numbers start at 1. */
+			ino = e->group * ipg + i + 1;
+			err = ext2fs_read_inode(fs, ino, &inode);
+			if (err) {
+				if (abort_on_error)
+					return NULL;
+				continue;
+			}
+			/* Skip unlinked or unknown-type inodes */
+			if (!inode.i_links_count ||
+			    (inode.i_mode & 0xF000) == 0)
+				continue;
+
+			/*
+			 * Fast symlinks, device nodes etc. keep other data
+			 * in i_block; only iterate real block maps.
+			 */
+			if (ext2fs_inode_has_valid_blocks2(fs, &inode)) {
+				func = LINUX_S_ISDIR(inode.i_mode) ?
+					dirent_block : ignore_block;
+				err = ext2fs_block_iterate3(fs, ino,
+						BLOCK_FLAG_READ_ONLY |
+						BLOCK_FLAG_DATA_ONLY,
+						NULL, func, &di);
+				if (err && abort_on_error)
+					return NULL;
+			}
+
+			blk = ext2fs_file_acl_block(fs, &inode);
+			if (!blk)
+				continue;
+			err = io_channel_read_blk64(fs->io, blk, 1, buf);
+			if (err && abort_on_error)
+				return NULL;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Second prefetcher: collect every block group that has initialised,
+ * in-use inodes, then spread those groups round-robin over a set of worker
+ * threads running super_func() and wait for all of them.
+ *
+ * `data` is the struct ext2fs_prefetch_handle.  The number of workers is
+ * the number of online CPUs, overridable with the PREFETCH_THREADS
+ * environment variable (positive integer), and never more than the number
+ * of groups found.  pd->done is set to 1 on every exit path.  Always
+ * returns NULL; failures just mean less gets prefetched.
+ *
+ * NOTE(review): the per-worker filesystem handles made with
+ * ext2fs_dup_handle() are not released here, and their icache pointer is
+ * cleared without dropping whatever reference the duplicate holds --
+ * confirm the intended ownership before adding an ext2fs_free().
+ */
+static void *super_prefetch(void *data)
+{
+	struct ext2fs_prefetch_handle *pd = data;
+	ext2_filsys fs = pd->fs;
+	int flags = pd->flags;
+	char *b = NULL;
+	void *r;
+	struct super_thread *threads, *t;
+	long ncpus = sysconf(_SC_NPROCESSORS_ONLN);
+	unsigned int num_threads = (ncpus > 0) ? (unsigned int) ncpus : 1;
+	unsigned int started = 0;
+	unsigned int j;
+	struct super_entry *entries = NULL;
+	unsigned int num_entries = 0, max_entries = 0;
+	size_t buf_size = (size_t) fs->blocksize * fs->inode_blocks_per_group;
+	const char *env;
+	dgrp_t i;
+	ext2_ino_t num_inodes, unused;
+	errcode_t err;
+
+	/* Find all non-empty groups */
+	for (i = 0; i < fs->group_desc_count; i++) {
+		if (ext2fs_bg_flags_test(fs, i, EXT2_BG_INODE_UNINIT))
+			continue;
+
+		num_inodes = fs->super->s_inodes_per_group;
+		if (ext2fs_bg_free_inodes_count(fs, i) == num_inodes)
+			continue;
+		if (ext2fs_has_group_desc_csum(fs)) {
+			unused = ext2fs_bg_itable_unused(fs, i);
+			/* Corrupt counts must not underflow. */
+			if (unused >= num_inodes)
+				continue;
+			num_inodes -= unused;
+		}
+		if (num_entries == max_entries) {
+			r = realloc(entries, (max_entries + 32) *
+				    sizeof(*entries));
+			if (r == NULL)
+				goto out;
+			entries = r;
+			max_entries += 32;
+		}
+		entries[num_entries].group = i;
+		entries[num_entries].num_inodes = num_inodes;
+		num_entries++;
+	}
+	if (num_entries == 0)
+		goto out;
+
+	/* Set up the threads */
+	env = getenv("PREFETCH_THREADS");
+	if (env) {
+		int n = atoi(env);
+
+		if (n > 0)
+			num_threads = n;
+	}
+	/* More workers than groups would only start past the array. */
+	if (num_threads > num_entries)
+		num_threads = num_entries;
+
+	/*
+	 * One allocation: num_threads scratch buffers followed by the
+	 * array of thread descriptors.
+	 */
+	err = ext2fs_get_arrayzero(num_threads, sizeof(*threads) + buf_size,
+				   &b);
+	if (err)
+		goto out;
+	threads = (struct super_thread *) (b + buf_size * num_threads);
+
+	for (j = 0; j < num_threads; j++) {
+		t = threads + j;
+		t->flags = flags;
+		t->start = entries + j;
+		t->end = entries + num_entries;
+		t->skip_factor = num_threads;
+		t->buf = b + buf_size * j;
+		err = ext2fs_dup_handle(fs, &t->fs);
+		if (err)
+			break;
+		t->fs->icache = NULL;
+		if (pthread_create(&t->tid, NULL, super_func, t))
+			break;
+		started++;
+	}
+
+	/*
+	 * Wait for threads -- only the ones that really started, and always
+	 * before the buffers they use are freed.
+	 */
+	for (j = 0; j < started; j++)
+		pthread_join(threads[j].tid, NULL);
+
+	ext2fs_free_mem(&b);
+out:
+	pd->done = 1;
+	free(entries);
+	return NULL;
+}
+
+/*
+ * Used by ext2fs_prefetch() to overwrite the `dev` member of the unix io
+ * manager's private data through a cast.
+ * NOTE(review): this only works while the real private structure of the
+ * unix io manager starts with these two members in this order -- confirm
+ * against unix_io.c.
+ */
+struct unix_private_data_hack {
+ int magic;
+ int dev;
+};
+
+/*
+ * Start prefetching metadata of `fs` as selected by the PREFETCH_* bits in
+ * `flags`, and return a handle in *h that must be released with
+ * ext2fs_prefetch_free().
+ *
+ * With PREFETCH_THREADED the work happens in the background: in a thread
+ * when the filesystem uses the mmap io manager, in a forked process when it
+ * uses the unix io manager; with any other io manager nothing is started.
+ * Without PREFETCH_THREADED the prefetch runs to completion before this
+ * function returns.
+ *
+ * Returns 0 on success, otherwise an allocation error or the error from
+ * pthread_create()/fork(); *h is not written on failure.
+ */
+errcode_t ext2fs_prefetch(ext2_filsys fs, int flags,
+			  struct ext2fs_prefetch_handle **h)
+{
+	struct ext2fs_prefetch_handle *pd;
+	errcode_t err;
+
+	err = ext2fs_get_memzero(sizeof(*pd), &pd);
+	if (err)
+		return err;
+	pd->fs = fs;
+	pd->flags = flags;
+
+	/* Load the rest */
+	if (flags & PREFETCH_THREADED) {
+		if (fs->io->manager == mmap_io_manager) {
+#if USE_SUPER
+			struct timespec ts;
+			err = pthread_create(&pd->tid, NULL, super_prefetch,
+					     pd);
+			if (err)
+				goto errout;
+			/* Give the prefetcher a short head start. */
+			ts.tv_sec = 0; ts.tv_nsec = 500000;
+			nanosleep(&ts, NULL);
+#elif USE_THREADS
+			err = pthread_create(&pd->tid, NULL, prefetch_thread,
+					     pd);
+			if (err)
+				goto errout;
+#else
+			goto single_thread;
+#endif
+		} else if (fs->io->manager == unix_io_manager) {
+			pd->pid = fork();
+			if (pd->pid < 0) {
+				err = errno;
+				goto errout;
+			} else if (pd->pid == 0) {
+				struct unix_private_data_hack *m =
+					fs->io->private_data;
+				/*
+				 * Child: read through a descriptor of its
+				 * own.  Leave with _exit() so that stdio
+				 * buffers and atexit handlers inherited
+				 * from the parent are not run a second time.
+				 */
+				m->dev = open(fs->device_name, O_RDONLY);
+				if (m->dev < 0)
+					_exit(1);
+				do_ext2fs_prefetch(fs, flags);
+				_exit(0);
+			}
+		}
+	} else {
+single_thread:
+#if USE_SUPER
+		super_prefetch(pd);
+#else
+		do_ext2fs_prefetch(fs, flags);
+#endif
+		pd->done = 1;
+	}
+	*h = pd;
+
+	return 0;
+errout:
+	ext2fs_free_mem(&pd);
+	return err;
+}
+
+/*
+ * Make sure no background prefetcher is left running for handle `h`.
+ *
+ * A prefetch thread is joined, i.e. this waits until it has finished.  A
+ * forked prefetch process is reaped if it has already exited and killed
+ * with SIGKILL (then reaped) if it has not.  The thread id and pid are
+ * cleared afterwards so that calling this again is harmless.  A NULL
+ * handle is accepted.  Always returns 0.
+ *
+ * The background worker must be gone before the handle is freed, because a
+ * running prefetch thread still writes to it.
+ */
+errcode_t ext2fs_prefetch_wait(struct ext2fs_prefetch_handle *h)
+{
+	if (!h)
+		return 0;
+
+	if (h->flags & PREFETCH_THREADED) {
+		if (h->tid) {
+			pthread_join(h->tid, NULL);
+			memset(&h->tid, 0, sizeof(h->tid));
+		}
+		if (h->pid > 0) {
+			/* 0 from WNOHANG means the child is still running. */
+			if (waitpid(h->pid, NULL, WNOHANG) == 0) {
+				kill(h->pid, SIGKILL);
+				waitpid(h->pid, NULL, 0);
+			}
+			h->pid = 0;
+		}
+	}
+	h->done = 1;
+	return 0;
+}
+
+/*
+ * Wait for any prefetch still attached to *h, then free the handle through
+ * ext2fs_free_mem().  A NULL `h` or NULL `*h` is a no-op that returns 0;
+ * otherwise the result of ext2fs_free_mem() is returned.
+ */
+errcode_t ext2fs_prefetch_free(struct ext2fs_prefetch_handle **h)
+{
+	if (!h || !*h)
+		return 0;
+
+	ext2fs_prefetch_wait(*h);
+	return ext2fs_free_mem(h);
+}
^ permalink raw reply related [flat|nested] 10+ messages in thread