From: ezemtsov@google.com
To: linux-fsdevel@vger.kernel.org
Cc: tytso@mit.edu, Eugene Zemtsov <ezemtsov@google.com>
Subject: [PATCH 2/6] incfs: Backing file format
Date: Wed, 1 May 2019 21:03:27 -0700 [thread overview]
Message-ID: <20190502040331.81196-3-ezemtsov@google.com> (raw)
In-Reply-To: <20190502040331.81196-1-ezemtsov@google.com>
From: Eugene Zemtsov <ezemtsov@google.com>
- Read and write logic for ondisk backing file format (aka incfs image)
- Add format.c and format.h
- Utils in internal.h
Signed-off-by: Eugene Zemtsov <ezemtsov@google.com>
---
fs/incfs/Makefile | 2 +-
fs/incfs/format.c | 554 ++++++++++++++++++++++++++++++++++++++++++++
fs/incfs/format.h | 294 +++++++++++++++++++++++
fs/incfs/internal.h | 31 +++
4 files changed, 880 insertions(+), 1 deletion(-)
create mode 100644 fs/incfs/format.c
create mode 100644 fs/incfs/format.h
create mode 100644 fs/incfs/internal.h
diff --git a/fs/incfs/Makefile b/fs/incfs/Makefile
index 7892196c634f..cdea18c7213e 100644
--- a/fs/incfs/Makefile
+++ b/fs/incfs/Makefile
@@ -1,4 +1,4 @@
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_INCREMENTAL_FS) += incrementalfs.o
-incrementalfs-y := main.o vfs.o
\ No newline at end of file
+incrementalfs-y := main.o vfs.o format.o
diff --git a/fs/incfs/format.c b/fs/incfs/format.c
new file mode 100644
index 000000000000..a0e6ecec09d3
--- /dev/null
+++ b/fs/incfs/format.c
@@ -0,0 +1,554 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2018 Google LLC
+ */
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/mm.h>
+#include <linux/falloc.h>
+#include <linux/slab.h>
+#include <linux/crc32.h>
+
+#include "format.h"
+
+/*
+ * Allocate a zero-initialized backing file context around @backing_file.
+ * The file pointer is stored as is; incfs_free_bfc() drops it with fput().
+ * Returns the new context, or ERR_PTR(-ENOMEM) if the allocation fails.
+ */
+struct backing_file_context *incfs_alloc_bfc(struct file *backing_file)
+{
+	struct backing_file_context *bfc = kzalloc(sizeof(*bfc), GFP_NOFS);
+
+	if (bfc) {
+		bfc->bc_file = backing_file;
+		mutex_init(&bfc->bc_mutex);
+		return bfc;
+	}
+
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Tear down a context created by incfs_alloc_bfc(): drop the backing file
+ * (if one is attached), destroy the mutex and free the memory.
+ * A NULL @bfc is silently ignored.
+ */
+void incfs_free_bfc(struct backing_file_context *bfc)
+{
+	struct file *backing;
+
+	if (bfc == NULL)
+		return;
+
+	backing = bfc->bc_file;
+	if (backing != NULL)
+		fput(backing);
+
+	mutex_destroy(&bfc->bc_mutex);
+	kfree(bfc);
+}
+
+/*
+ * Report the offset at which the next append to @f would land.
+ *
+ * The inode size is used as the end offset. The two are assumed to be
+ * the same, which is not always true.
+ */
+loff_t incfs_get_end_offset(struct file *f)
+{
+	struct inode *backing_inode = file_inode(f);
+
+	return i_size_read(backing_inode);
+}
+
+/*
+ * Shrink the backing file so that it ends at @new_end.
+ * Serves as a rollback for multistep writes that failed halfway.
+ *
+ * bfc->bc_mutex is expected to be held (checked by LOCK_REQUIRED).
+ * Returns 0 on success, including when the file already ends at @new_end,
+ * -EFAULT if @bfc or its file is missing, -EINVAL if @new_end lies beyond
+ * the current end, or the value returned by notify_change().
+ */
+static int truncate_backing_file(struct backing_file_context *bfc,
+				 loff_t new_end)
+{
+	struct iattr size_attr;
+	struct file *backing;
+	struct inode *inode;
+	loff_t cur_end;
+	int err;
+
+	if (!bfc)
+		return -EFAULT;
+
+	LOCK_REQUIRED(bfc->bc_mutex);
+
+	backing = bfc->bc_file;
+	if (!backing)
+		return -EFAULT;
+
+	cur_end = incfs_get_end_offset(backing);
+	if (cur_end < new_end)
+		return -EINVAL;
+	if (cur_end == new_end)
+		return 0;
+
+	/* Only the size is changed; no other attribute is marked valid. */
+	size_attr.ia_valid = ATTR_SIZE;
+	size_attr.ia_size = new_end;
+
+	inode = backing->f_inode;
+	inode_lock(inode);
+	err = notify_change(backing->f_path.dentry, &size_attr, NULL);
+	inode_unlock(inode);
+
+	return err;
+}
+
+/*
+ * Grow the backing file by @len zero bytes.
+ *
+ * bfc->bc_mutex is expected to be held (checked by LOCK_REQUIRED).
+ * Returns 0 on success, -EFAULT for a NULL @bfc, -EINVAL for a zero @len,
+ * or the error from vfs_fallocate() / vfs_fsync_range().
+ */
+static int append_zeros(struct backing_file_context *bfc, size_t len)
+{
+	loff_t old_end;
+	loff_t last_byte;
+	int err;
+
+	if (!bfc)
+		return -EFAULT;
+	if (!len)
+		return -EINVAL;
+
+	LOCK_REQUIRED(bfc->bc_mutex);
+
+	/*
+	 * Only one byte is allocated, at the desired new end of the file.
+	 * That increases the file size and creates a zeroed area of the
+	 * requested length.
+	 */
+	old_end = incfs_get_end_offset(bfc->bc_file);
+	last_byte = old_end + len - 1;
+	err = vfs_fallocate(bfc->bc_file, 0, last_byte, 1);
+	if (err)
+		return err;
+
+	return vfs_fsync_range(bfc->bc_file, old_end, old_end + len, 1);
+}
+
+/*
+ * Write @count bytes from @buf into the backing file at offset @pos and,
+ * when @sync is set, call vfs_fsync_range() on the written range with the
+ * datasync flag set.
+ *
+ * Returns 0 on success, the negative error from kernel_write(), -EIO on a
+ * short write, or the result of vfs_fsync_range().
+ */
+static int write_to_bf(struct backing_file_context *bfc, const void *buf,
+		       size_t count, loff_t pos, bool sync)
+{
+	loff_t cursor = pos;
+	ssize_t written = kernel_write(bfc->bc_file, buf, count, &cursor);
+
+	if (written < 0)
+		return written;
+	if (written != count)
+		return -EIO;
+	if (!sync)
+		return 0;
+
+	return vfs_fsync_range(bfc->bc_file, pos, pos + count, 1);
+}
+
+/*
+ * Compute the CRC32 of a metadata record.
+ *
+ * h_record_crc and h_next_md_offset do not take part in the checksum: they
+ * are zeroed for the duration of the calculation and restored afterwards,
+ * so @record holds its original content on return. The number of bytes
+ * covered is h_record_size, capped at INCFS_MAX_METADATA_RECORD_SIZE.
+ */
+static u32 calc_md_crc(struct incfs_md_header *record)
+{
+	const __le64 orig_next = record->h_next_md_offset;
+	const __le32 orig_crc = record->h_record_crc;
+	size_t len = min_t(size_t, le16_to_cpu(record->h_record_size),
+			   INCFS_MAX_METADATA_RECORD_SIZE);
+	u32 crc;
+
+	/* Blank out the fields that are excluded from the checksum. */
+	record->h_next_md_offset = 0;
+	record->h_record_crc = 0;
+
+	crc = crc32(0, record, len);
+
+	/* Put the original values back. */
+	record->h_next_md_offset = orig_next;
+	record->h_record_crc = orig_crc;
+
+	return crc;
+}
+
+/*
+ * Append a given metadata record to the end of the backing file and link
+ * it into the metadata list.
+ *
+ * The record header is completed here: h_prev_md_offset is pointed at the
+ * last known record, h_next_md_offset is cleared and h_record_crc is
+ * computed. After the record is written, h_next_md_offset of the previous
+ * record (or s_first_md_offset of the superblock when there is no previous
+ * record) is overwritten with the new record's offset. Both writes are
+ * synced.
+ *
+ * bfc->bc_mutex is expected to be held (checked by LOCK_REQUIRED).
+ * Returns 0 on success, -EFAULT for NULL arguments, -EINVAL if the last
+ * known record offset is negative, or an error from write_to_bf().
+ */
+static int append_md_to_backing_file(struct backing_file_context *bfc,
+				     struct incfs_md_header *record)
+{
+	int result = 0;
+	loff_t record_offset;
+	loff_t file_pos;
+	__le64 new_md_offset;
+	size_t record_size;
+
+	if (!bfc || !record)
+		return -EFAULT;
+
+	if (bfc->bc_last_md_record_offset < 0)
+		return -EINVAL;
+
+	LOCK_REQUIRED(bfc->bc_mutex);
+
+	record_size = le16_to_cpu(record->h_record_size);
+	file_pos = incfs_get_end_offset(bfc->bc_file);
+
+	/*
+	 * h_prev_md_offset is an on-disk little-endian field that the reader
+	 * decodes with le64_to_cpu(), so it must be converted before it is
+	 * checksummed and written.
+	 */
+	record->h_prev_md_offset = cpu_to_le64(bfc->bc_last_md_record_offset);
+	record->h_next_md_offset = 0;
+	record->h_record_crc = cpu_to_le32(calc_md_crc(record));
+
+	/* Write the metadata record to the end of the backing file */
+	record_offset = file_pos;
+	new_md_offset = cpu_to_le64(record_offset);
+	result = write_to_bf(bfc, record, record_size, file_pos, true);
+	if (result)
+		return result;
+
+	/* Update next metadata offset in a previous record or a superblock. */
+	if (bfc->bc_last_md_record_offset) {
+		/*
+		 * Find a place in the previous md record where new record's
+		 * offset needs to be saved.
+		 */
+		file_pos = bfc->bc_last_md_record_offset +
+			offsetof(struct incfs_md_header, h_next_md_offset);
+	} else {
+		/* No metadata yet, find a place to update in the superblock. */
+		file_pos = offsetof(struct incfs_super_block,
+				    s_first_md_offset);
+	}
+	result = write_to_bf(bfc, &new_md_offset, sizeof(new_md_offset),
+			     file_pos, true);
+	if (result)
+		return result;
+
+	/* The new record is now the tail of the metadata list. */
+	bfc->bc_last_md_record_offset = record_offset;
+	return result;
+}
+
+/*
+ * Build an incfs_inode metadata record from @ino, @size and @mode and
+ * append it to the backing file's metadata list.
+ *
+ * bfc->bc_mutex is expected to be held (checked by LOCK_REQUIRED).
+ * Returns -EFAULT for a NULL @bfc, otherwise the result of
+ * append_md_to_backing_file().
+ */
+int incfs_write_inode_to_backing_file(struct backing_file_context *bfc, u64 ino,
+				      u64 size, u16 mode)
+{
+	struct incfs_inode rec = {
+		.i_header = {
+			.h_md_entry_type = INCFS_MD_INODE,
+			.h_record_size = cpu_to_le16(sizeof(rec)),
+			.h_next_md_offset = cpu_to_le64(0),
+		},
+		.i_no = cpu_to_le64(ino),
+		.i_size = cpu_to_le64(size),
+		.i_mode = cpu_to_le16(mode),
+		.i_flags = cpu_to_le32(0),
+	};
+
+	if (!bfc)
+		return -EFAULT;
+
+	LOCK_REQUIRED(bfc->bc_mutex);
+
+	return append_md_to_backing_file(bfc, &rec.i_header);
+}
+
+/*
+ * Append an incfs_dir_action metadata record to the backing file.
+ *
+ * @dir_ino:    inode number of the directory being changed
+ * @dentry_ino: inode number of the entry being added or removed
+ * @type:       kind of change, stored as a single byte
+ * @name:       entry name; at most INCFS_MAX_NAME_LEN bytes are stored,
+ *              longer names are cut at that length
+ *
+ * bfc->bc_mutex is expected to be held (checked by LOCK_REQUIRED).
+ * Returns -EFAULT for a NULL @bfc, otherwise the result of
+ * append_md_to_backing_file().
+ */
+int incfs_write_dir_action(struct backing_file_context *bfc, u64 dir_ino,
+			   u64 dentry_ino, enum incfs_dir_action_type type,
+			   struct mem_range name)
+{
+	struct incfs_dir_action action = {};
+	/*
+	 * Clamp in size_t. Doing the comparison in u8 would first reduce
+	 * name.len modulo 256 (256 -> 0, 300 -> 44) instead of limiting it
+	 * to INCFS_MAX_NAME_LEN.
+	 */
+	size_t name_len = min_t(size_t, INCFS_MAX_NAME_LEN, name.len);
+
+	if (!bfc)
+		return -EFAULT;
+
+	LOCK_REQUIRED(bfc->bc_mutex);
+	action.da_header.h_md_entry_type = INCFS_MD_DIR_ACTION;
+	action.da_header.h_record_size = cpu_to_le16(sizeof(action));
+	action.da_header.h_next_md_offset = cpu_to_le64(0);
+	action.da_dir_inode = cpu_to_le64(dir_ino);
+	action.da_entry_inode = cpu_to_le64(dentry_ino);
+	action.da_type = (__u8)type;
+	/* name_len <= INCFS_MAX_NAME_LEN (255), so it fits into one byte. */
+	action.da_name_len = (__u8)name_len;
+	memcpy(action.da_name, name.data, name_len);
+
+	return append_md_to_backing_file(bfc, &action.da_header);
+}
+
+/*
+ * Reserve 0-filled space for the blockmap body, and append
+ * incfs_blockmap metadata record pointing to it.
+ *
+ * @ino:          inode number of the file the map belongs to
+ * @block_count:  number of incfs_blockmap_entry slots to reserve
+ * @map_base_off: optional; on success receives the offset of the reserved
+ *                blockmap area
+ *
+ * bfc->bc_mutex is expected to be held (checked by LOCK_REQUIRED).
+ * Returns 0 on success, -EFAULT for a NULL @bfc, -EOVERFLOW if the map
+ * size does not fit into size_t, or an error from append_zeros() /
+ * append_md_to_backing_file(). If the metadata record cannot be appended,
+ * the file is truncated back to its original end.
+ */
+int incfs_write_blockmap_to_backing_file(struct backing_file_context *bfc,
+				u64 ino, u32 block_count, loff_t *map_base_off)
+{
+	struct incfs_blockmap blockmap = {};
+	int result = 0;
+	loff_t file_end = 0;
+	size_t map_size;
+
+	if (!bfc)
+		return -EFAULT;
+
+	/*
+	 * Where size_t is 32 bits wide the multiplication below can wrap,
+	 * which would reserve a map smaller than m_block_count advertises.
+	 */
+	if (block_count > SIZE_MAX / sizeof(struct incfs_blockmap_entry))
+		return -EOVERFLOW;
+	map_size = block_count * sizeof(struct incfs_blockmap_entry);
+
+	blockmap.m_header.h_md_entry_type = INCFS_MD_BLOCK_MAP;
+	blockmap.m_header.h_record_size = cpu_to_le16(sizeof(blockmap));
+	blockmap.m_header.h_next_md_offset = cpu_to_le64(0);
+	blockmap.m_inode = cpu_to_le64(ino);
+	blockmap.m_block_count = cpu_to_le32(block_count);
+
+	LOCK_REQUIRED(bfc->bc_mutex);
+
+	/* Reserve 0-filled space for the blockmap body in the backing file. */
+	file_end = incfs_get_end_offset(bfc->bc_file);
+	result = append_zeros(bfc, map_size);
+	if (result)
+		return result;
+
+	/* Write blockmap metadata record pointing to the body written above. */
+	blockmap.m_base_offset = cpu_to_le64(file_end);
+	result = append_md_to_backing_file(bfc, &blockmap.m_header);
+	if (result) {
+		/* Error, rollback file changes */
+		truncate_backing_file(bfc, file_end);
+	} else if (map_base_off) {
+		*map_base_off = file_end;
+	}
+
+	return result;
+}
+
+/*
+ * Write the backing file header (superblock) at offset 0.
+ *
+ * Only valid on an empty file: a non-zero end offset yields -EEXIST.
+ * s_first_md_offset is written as 0 and is updated later, when the first
+ * metadata record is appended.
+ *
+ * bfc->bc_mutex is expected to be held (checked by LOCK_REQUIRED).
+ * Returns -EFAULT for a NULL @bfc, -EEXIST for a non-empty file, otherwise
+ * the result of the synced write.
+ */
+int incfs_write_sb_to_backing_file(struct backing_file_context *bfc)
+{
+	struct incfs_super_block sb = {
+		.s_magic = cpu_to_le64(INCFS_MAGIC_NUMBER),
+		.s_version = cpu_to_le64(INCFS_FORMAT_CURRENT_VER),
+		.s_super_block_size = cpu_to_le16(sizeof(sb)),
+		.s_first_md_offset = cpu_to_le64(0),
+		.s_data_block_size = cpu_to_le16(INCFS_DATA_FILE_BLOCK_SIZE),
+	};
+
+	if (!bfc)
+		return -EFAULT;
+
+	LOCK_REQUIRED(bfc->bc_mutex);
+
+	if (incfs_get_end_offset(bfc->bc_file) != 0)
+		return -EEXIST;
+
+	return write_to_bf(bfc, &sb, sizeof(sb), 0, true);
+}
+
+/*
+ * Append a data block to the backing file and record its location in the
+ * blockmap entry number @block_index of the map starting at @bm_base_off.
+ *
+ * @block: data to store; its length must be below 64 KiB
+ * @flags: stored in the entry's me_flags
+ * @crc:   stored in the entry's me_data_crc
+ *
+ * bfc->bc_mutex is expected to be held (checked by LOCK_REQUIRED).
+ * Neither of the two writes is synced.
+ * Returns 0 on success, -EFAULT for a NULL @bfc, -EINVAL for a bad block
+ * length or index or when the entry would lie at or past the end of the
+ * file, or an error from write_to_bf().
+ */
+int incfs_write_data_block_to_backing_file(struct backing_file_context *bfc,
+				struct mem_range block, int block_index,
+				loff_t bm_base_off, u16 flags, u32 crc)
+{
+	struct incfs_blockmap_entry entry = {};
+	loff_t entry_off;
+	loff_t block_off;
+	int err;
+
+	if (!bfc)
+		return -EFAULT;
+
+	if (block_index < 0 || block.len >= (1 << 16))
+		return -EINVAL;
+
+	LOCK_REQUIRED(bfc->bc_mutex);
+
+	entry_off = bm_base_off +
+		    sizeof(struct incfs_blockmap_entry) * block_index;
+	block_off = incfs_get_end_offset(bfc->bc_file);
+
+	/* A blockmap entry beyond the file's end is not normal. */
+	if (entry_off >= block_off)
+		return -EINVAL;
+
+	/* The data goes to the end of the backing file first. */
+	err = write_to_bf(bfc, block.data, block.len, block_off, false);
+	if (err)
+		return err;
+
+	/* Then the blockmap entry is pointed at the freshly written data. */
+	entry.me_data_offset_lo = cpu_to_le32((u32)block_off);
+	entry.me_data_offset_hi = cpu_to_le16((u16)(block_off >> 32));
+	entry.me_data_size = cpu_to_le16((u16)block.len);
+	entry.me_flags = cpu_to_le16(flags);
+	entry.me_data_crc = cpu_to_le32(crc);
+
+	return write_to_bf(bfc, &entry, sizeof(entry), entry_off, false);
+}
+
+/*
+ * Initialize a new image in a given backing file: truncate the file to
+ * zero length and write a fresh superblock.
+ *
+ * Takes bfc->bc_mutex itself (interruptibly).
+ * Returns 0 on success, -EFAULT if @bfc or its file is missing, the error
+ * from mutex_lock_interruptible() if the wait was interrupted, or an error
+ * from the truncate / superblock write.
+ */
+int incfs_make_empty_backing_file(struct backing_file_context *bfc)
+{
+	int result = 0;
+
+	if (!bfc || !bfc->bc_file)
+		return -EFAULT;
+
+	/*
+	 * If the lock could not be taken, return right away: jumping to the
+	 * unlock path would release a mutex this task does not hold.
+	 */
+	result = mutex_lock_interruptible(&bfc->bc_mutex);
+	if (result)
+		return result;
+
+	result = truncate_backing_file(bfc, 0);
+	if (result)
+		goto out;
+
+	result = incfs_write_sb_to_backing_file(bfc);
+out:
+	mutex_unlock(&bfc->bc_mutex);
+	return result;
+}
+
+/*
+ * Read blockmap entry number @block_index of the map that starts at
+ * @bm_base_off into @bm_entry.
+ *
+ * Returns 0 on success, -EFAULT for NULL pointers, -ENODATA for a negative
+ * index or a non-positive base offset, the negative error from
+ * kernel_read(), or -EIO if fewer bytes than one full entry were read.
+ */
+int incfs_read_blockmap_entry(struct backing_file_context *bfc, int block_index,
+			      loff_t bm_base_off,
+			      struct incfs_blockmap_entry *bm_entry)
+{
+	const size_t entry_size = sizeof(struct incfs_blockmap_entry);
+	loff_t entry_off;
+	int nread;
+
+	if (!bfc || !bm_entry)
+		return -EFAULT;
+
+	if (bm_base_off <= 0 || block_index < 0)
+		return -ENODATA;
+
+	entry_off = bm_base_off +
+		    sizeof(struct incfs_blockmap_entry) * block_index;
+	nread = kernel_read(bfc->bc_file, bm_entry, entry_size, &entry_off);
+	if (nread < 0)
+		return nread;
+
+	return nread < entry_size ? -EIO : 0;
+}
+
+/*
+ * Read and validate the superblock at the start of the backing file and
+ * report the offset of the first metadata record through @first_md_off.
+ *
+ * bfc->bc_mutex is expected to be held (checked by LOCK_REQUIRED).
+ * Returns 0 on success, -EFAULT for NULL pointers, the negative error from
+ * kernel_read(), -EBADMSG if the file is shorter than a superblock, or
+ * -EILSEQ if the magic number, format version, data block size or recorded
+ * superblock size is not acceptable.
+ */
+int incfs_read_superblock(struct backing_file_context *bfc,
+			  loff_t *first_md_off)
+{
+	struct incfs_super_block sb = {};
+	loff_t sb_off = 0;
+	ssize_t nread;
+
+	if (!bfc || !first_md_off)
+		return -EFAULT;
+
+	LOCK_REQUIRED(bfc->bc_mutex);
+
+	nread = kernel_read(bfc->bc_file, &sb, sizeof(sb), &sb_off);
+	if (nread < 0)
+		return nread;
+	if (nread < sizeof(sb))
+		return -EBADMSG;
+
+	if (le64_to_cpu(sb.s_magic) != INCFS_MAGIC_NUMBER ||
+	    le64_to_cpu(sb.s_version) > INCFS_FORMAT_CURRENT_VER ||
+	    le16_to_cpu(sb.s_data_block_size) != INCFS_DATA_FILE_BLOCK_SIZE ||
+	    le16_to_cpu(sb.s_super_block_size) > sizeof(sb))
+		return -EILSEQ;
+
+	*first_md_off = le64_to_cpu(sb.s_first_md_offset);
+	return 0;
+}
+
+/*
+ * Read the metadata record at handler->md_record_offset, validate it and
+ * pass it to the matching callback of @handler. Calling this repeatedly
+ * walks through the metadata records one by one.
+ *
+ * A record is rejected with -EBADMSG if it is smaller than a header or
+ * larger than the handler's buffer, if it could not be read in full, if
+ * its next-offset points backwards, if its prev-offset does not match the
+ * previously processed record, or if its CRC does not match. An unknown
+ * record type yields -ENOTSUPP.
+ *
+ * On success (including a missing callback for the record type) the
+ * handler is advanced to the next record. When the record has no
+ * successor, its offset is also remembered in
+ * bfc->bc_last_md_record_offset. If a callback returns non-zero, that
+ * value is returned and the handler is not advanced.
+ *
+ * bfc->bc_mutex is expected to be held (checked by LOCK_REQUIRED).
+ * Returns -EFAULT for NULL arguments and -EPERM if the handler's record
+ * offset is 0, i.e. there is nothing (more) to read.
+ */
+int incfs_read_next_metadata_record(struct backing_file_context *bfc,
+				    struct metadata_handler *handler)
+{
+	loff_t pos = 0;
+	const ssize_t max_md_size = INCFS_MAX_METADATA_RECORD_SIZE;
+	ssize_t bytes_read = 0;
+	size_t md_record_size = 0;
+	loff_t next_record = 0;
+	loff_t prev_record = 0;
+	int res = 0;
+	struct incfs_md_header *md_hdr = NULL;
+
+	if (!bfc || !handler)
+		return -EFAULT;
+
+	LOCK_REQUIRED(bfc->bc_mutex);
+
+	if (handler->md_record_offset == 0)
+		return -EPERM;
+
+	/*
+	 * Read as much as the largest possible record; the real size is
+	 * only known once the header has been decoded.
+	 */
+	memset(&handler->md_buffer, 0, max_md_size);
+	pos = handler->md_record_offset;
+	bytes_read = kernel_read(bfc->bc_file, (u8 *)&handler->md_buffer,
+				 max_md_size, &pos);
+	if (bytes_read < 0)
+		return bytes_read;
+	if (bytes_read < sizeof(*md_hdr))
+		return -EBADMSG;
+
+	md_hdr = &handler->md_buffer.md_header;
+	next_record = le64_to_cpu(md_hdr->h_next_md_offset);
+	prev_record = le64_to_cpu(md_hdr->h_prev_md_offset);
+	md_record_size = le16_to_cpu(md_hdr->h_record_size);
+
+	/*
+	 * The CRC is computed over h_record_size bytes. A record claiming to
+	 * be smaller than its own header would leave header fields outside
+	 * of the checksum.
+	 */
+	if (md_record_size < sizeof(*md_hdr)) {
+		pr_warn("incfs: The record is too small. Size: %zu\n",
+			md_record_size);
+		return -EBADMSG;
+	}
+
+	if (md_record_size > max_md_size) {
+		pr_warn("incfs: The record is too large. Size: %zu\n",
+			md_record_size);
+		return -EBADMSG;
+	}
+
+	if (bytes_read < md_record_size) {
+		pr_warn("incfs: The record hasn't been fully read.\n");
+		return -EBADMSG;
+	}
+
+	if (next_record <= handler->md_record_offset && next_record != 0) {
+		pr_warn("incfs: Next record (%lld) points back in file.\n",
+			next_record);
+		return -EBADMSG;
+	}
+
+	if (prev_record != handler->md_prev_record_offset) {
+		pr_warn("incfs: Metadata chain has been corrupted.\n");
+		return -EBADMSG;
+	}
+
+	if (le32_to_cpu(md_hdr->h_record_crc) != calc_md_crc(md_hdr)) {
+		pr_warn("incfs: Metadata CRC mismatch.\n");
+		return -EBADMSG;
+	}
+
+	switch (md_hdr->h_md_entry_type) {
+	case INCFS_MD_NONE:
+		break;
+	case INCFS_MD_INODE:
+		if (handler->handle_inode)
+			res = handler->handle_inode(&handler->md_buffer.inode,
+						    handler);
+		break;
+	case INCFS_MD_BLOCK_MAP:
+		if (handler->handle_blockmap)
+			res = handler->handle_blockmap(
+				&handler->md_buffer.blockmap, handler);
+		break;
+	case INCFS_MD_DIR_ACTION:
+		if (handler->handle_dir_action)
+			res = handler->handle_dir_action(
+				&handler->md_buffer.dir_action, handler);
+		break;
+	default:
+		res = -ENOTSUPP;
+		break;
+	}
+
+	if (!res) {
+		if (next_record == 0) {
+			/*
+			 * Zero offset for the next record means that the last
+			 * metadata record has just been processed.
+			 */
+			bfc->bc_last_md_record_offset =
+				handler->md_record_offset;
+		}
+		handler->md_prev_record_offset = handler->md_record_offset;
+		handler->md_record_offset = next_record;
+	}
+	return res;
+}
diff --git a/fs/incfs/format.h b/fs/incfs/format.h
new file mode 100644
index 000000000000..2c2114bdd08f
--- /dev/null
+++ b/fs/incfs/format.h
@@ -0,0 +1,294 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2018 Google LLC
+ */
+
+/*
+ * Overview
+ * --------
+ * The backbone of the incremental-fs ondisk format is an append only linked
+ * list of metadata blocks. Each metadata block contains an offset of the next
+ * one. These blocks describe files and directories on the
+ * file system. They also represent actions of adding and removing file names
+ * (hard links).
+ *
+ * Every time incremental-fs instance is mounted, it reads through this list
+ * to recreate filesystem's state in memory. An offset of the first record in
+ * the metadata list is stored in the superblock at the beginning of the backing
+ * file.
+ *
+ * Most of the backing file is taken by data areas and blockmaps.
+ * Since data blocks can be compressed and have different sizes,
+ * single per-file data area can't be pre-allocated. That's why blockmaps are
+ * needed in order to find a location and size of each data block in
+ * the backing file. Each time a file is created, a corresponding block map is
+ * allocated to store future offsets of data blocks.
+ *
+ * Whenever a data block is given by data loader to incremental-fs:
+ * - A data area with the given block is appended to the end of
+ * the backing file.
+ * - A record in the blockmap for the given block index is updated to reflect
+ * its location, size, and compression algorithm.
+ *
+ * Metadata records
+ * ----------------
+ * incfs_inode - metadata record to declare a file or a directory.
+ * incfs_inode.i_mode determines whether it is a file
+ * or a directory.
+ * incfs_blockmap - metadata record that specifies size and location
+ * of a blockmap area for a given file. This area
+ * contains an array of incfs_blockmap_entry-s.
+ * incfs_dir_action - metadata record that specifies changes made to
+ * a directory structure, e.g. add or remove a hardlink.
+ *
+ * Metadata header
+ * ---------------
+ * incfs_md_header - header of a metadata record. It's always a part
+ * of other structures and serves the purpose of metadata
+ * bookkeeping.
+ *
+ * +-----------------------------------------------+ ^
+ * | incfs_md_header | |
+ * | 1. type of body(INODE, BLOCKMAP, DIR ACTION..)| |
+ * | 2. size of the whole record header + body | |
+ * | 3. CRC the whole record header + body | |
+ * | 4. offset of the previous md record |]------+
+ * | 5. offset of the next md record (md link) |]---+
+ * +-----------------------------------------------+ |
+ * | Metadata record body with useful data | |
+ * +-----------------------------------------------+ |
+ * +--->
+ *
+ * Other ondisk structures
+ * -----------------------
+ * incfs_super_block - backing file header
+ * incfs_blockmap_entry - a record in a blockmap area that describes size
+ * and location of a data block.
+ * Data blocks don't have any particular structure, they are written to the
+ * backing file in a raw form as they come from a data loader.
+ *
+ * Backing file layout
+ * -------------------
+ *
+ *
+ * +-------------------------------------------+
+ * | incfs_super_block |]---+
+ * +-------------------------------------------+ |
+ * | metadata |<---+
+ * | incfs_inode |]---+
+ * +-------------------------------------------+ |
+ * ......................... |
+ * +-------------------------------------------+ | metadata
+ * +------->| blockmap area | | list links
+ * | | [incfs_blockmap_entry] | |
+ * | | [incfs_blockmap_entry] | |
+ * | | [incfs_blockmap_entry] | |
+ * | +--[| [incfs_blockmap_entry] | |
+ * | | | [incfs_blockmap_entry] | |
+ * | | | [incfs_blockmap_entry] | |
+ * | | +-------------------------------------------+ |
+ * | | ......................... |
+ * | | +-------------------------------------------+ |
+ * | | | metadata |<---+
+ * +----|--[| incfs_blockmap |]---+
+ * | +-------------------------------------------+ |
+ * | ......................... |
+ * | +-------------------------------------------+ |
+ * +-->| data block | |
+ * +-------------------------------------------+ |
+ * ......................... |
+ * +-------------------------------------------+ |
+ * | metadata |<---+
+ * | incfs_dir_action |
+ * +-------------------------------------------+
+ */
+#ifndef _INCFS_FORMAT_H
+#define _INCFS_FORMAT_H
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <uapi/linux/incrementalfs.h>
+
+#include "internal.h"
+
+#define INCFS_MAX_NAME_LEN 255
+#define INCFS_FORMAT_V1 1
+#define INCFS_FORMAT_CURRENT_VER INCFS_FORMAT_V1
+
+/*
+ * Kind of a metadata record, stored in incfs_md_header.h_md_entry_type.
+ * incfs_read_next_metadata_record() dispatches on these values and
+ * rejects any other value with -ENOTSUPP.
+ */
+enum incfs_metadata_type {
+ /* No callback is invoked for this type; the reader just moves on. */
+ INCFS_MD_NONE = 0,
+ /* Record is a struct incfs_inode. */
+ INCFS_MD_INODE = 1,
+ /* Record is a struct incfs_blockmap. */
+ INCFS_MD_BLOCK_MAP = 2,
+ /* Record is a struct incfs_dir_action. */
+ INCFS_MD_DIR_ACTION = 3
+};
+
+/*
+ * Header included at the beginning of all metadata records on the disk.
+ * Multi-byte fields are little-endian and the structure is packed.
+ */
+struct incfs_md_header {
+ /* Record kind, one of enum incfs_metadata_type. */
+ __u8 h_md_entry_type;
+
+ /*
+ * Size of the metadata record.
+ * (e.g. inode, dir entry etc) not just this struct.
+ */
+ __le16 h_record_size;
+
+ /*
+ * CRC32 of the metadata record.
+ * (e.g. inode, dir entry etc) not just this struct.
+ * calc_md_crc() computes it with this field and h_next_md_offset
+ * zeroed.
+ */
+ __le32 h_record_crc;
+
+ /*
+ * Offset of the next metadata entry if any.
+ * Written as 0 when the record is appended and overwritten in place
+ * once a following record is appended.
+ */
+ __le64 h_next_md_offset;
+
+ /* Offset of the previous metadata entry if any */
+ __le64 h_prev_md_offset;
+
+} __packed;
+
+/*
+ * Backing file header, stored at offset 0 of the backing file.
+ * Multi-byte fields are little-endian and the structure is packed.
+ * incfs_read_superblock() validates every field except s_flags.
+ */
+struct incfs_super_block {
+ __le64 s_magic; /* Magic signature: INCFS_MAGIC_NUMBER */
+ __le64 s_version; /* Format version: INCFS_FORMAT_CURRENT_VER */
+ __le16 s_super_block_size; /* sizeof(incfs_super_block) */
+ __le32 s_flags; /* Reserved for future use. */
+ /* 0 until the first metadata record is appended. */
+ __le64 s_first_md_offset; /* Offset of the first metadata record */
+ __le16 s_data_block_size; /* INCFS_DATA_FILE_BLOCK_SIZE */
+} __packed;
+
+/*
+ * Metadata record for files and directories. Type = INCFS_MD_INODE
+ * Written by incfs_write_inode_to_backing_file(); multi-byte fields are
+ * little-endian and the structure is packed.
+ */
+struct incfs_inode {
+ /* Common metadata header; must stay the first member. */
+ struct incfs_md_header i_header;
+ __le64 i_no; /* inode number */
+ __le64 i_size; /* Full size of the file's content */
+ __le16 i_mode; /* File mode */
+ __le32 i_flags; /* Reserved for future use. */
+} __packed;
+
+/* Bit flags stored in incfs_blockmap_entry.me_flags. */
+enum incfs_block_map_entry_flags {
+ /* NOTE(review): presumably marks a block stored LZ4-compressed; the
+  * code in this patch only stores the flags and never interprets them.
+  */
+ INCFS_BLOCK_COMPRESSED_LZ4 = (1 << 0),
+};
+
+/*
+ * Block map entry pointing to an actual location of the data block.
+ *
+ * This is an element of the blockmap area, not a metadata record: it has
+ * no incfs_md_header. All fields are little-endian and the structure is
+ * packed. The data offset is 48 bits wide, split into a low 32-bit and a
+ * high 16-bit part.
+ */
+struct incfs_blockmap_entry {
+	/* Offset of the actual data block. Lower 32 bits */
+	__le32 me_data_offset_lo;
+
+	/* Offset of the actual data block. Higher 16 bits */
+	__le16 me_data_offset_hi;
+
+	/* How many bytes the data actually occupies in the backing file */
+	__le16 me_data_size;
+
+	/*
+	 * Block flags from incfs_block_map_entry_flags.
+	 * Stored little-endian (the writer uses cpu_to_le16()), hence __le16.
+	 */
+	__le16 me_flags;
+
+	/* CRC32 of the block's data */
+	__le32 me_data_crc;
+} __packed;
+
+/*
+ * Metadata record for locations of file blocks. Type = INCFS_MD_BLOCK_MAP
+ * Written by incfs_write_blockmap_to_backing_file() right after the
+ * zero-filled blockmap area it points to has been reserved.
+ */
+struct incfs_blockmap {
+ /* Common metadata header; must stay the first member. */
+ struct incfs_md_header m_header;
+ /* inode of a file this map belongs to */
+ __le64 m_inode;
+
+ /* Base offset of the array of incfs_blockmap_entry */
+ __le64 m_base_offset;
+
+ /* Size of the map entry array in blocks */
+ __le32 m_block_count;
+} __packed;
+
+/*
+ * Kind of directory change, stored as one byte in
+ * incfs_dir_action.da_type.
+ */
+enum incfs_dir_action_type {
+ INCFS_DIRA_NONE = 0,
+ /* A name was added to a directory. */
+ INCFS_DIRA_ADD_ENTRY = 1,
+ /* A name was removed from a directory. */
+ INCFS_DIRA_REMOVE_ENTRY = 2,
+};
+
+/*
+ * Metadata record of directory content change. Type = INCFS_MD_DIR_ACTION
+ * Written by incfs_write_dir_action(). The name occupies a fixed-size
+ * array; only the first da_name_len bytes are copied from the caller, the
+ * rest stays zero.
+ */
+struct incfs_dir_action {
+ /* Common metadata header; must stay the first member. */
+ struct incfs_md_header da_header;
+ __le64 da_dir_inode; /* Parent directory inode number */
+ __le64 da_entry_inode; /* File/subdirectory inode number */
+ __u8 da_type; /* One of enums incfs_dir_action_type */
+ __u8 da_name_len; /* Name length */
+ char da_name[INCFS_MAX_NAME_LEN]; /* File name */
+} __packed;
+
+/*
+ * State of the backing file.
+ * Created by incfs_alloc_bfc() and released by incfs_free_bfc().
+ */
+struct backing_file_context {
+ /* Protects writes to bc_file */
+ struct mutex bc_mutex;
+
+ /*
+ * File object to read data from.
+ * incfs_free_bfc() calls fput() on it when the context is freed.
+ */
+ struct file *bc_file;
+
+ /*
+ * Offset of the last known metadata record in the backing file.
+ * 0 means there are no metadata records.
+ * Updated when a record is appended and when the reader reaches the
+ * record that has no successor.
+ */
+ loff_t bc_last_md_record_offset;
+};
+
+/*
+ * Cursor and callbacks for walking the metadata list with
+ * incfs_read_next_metadata_record().
+ */
+struct metadata_handler {
+ /*
+ * Offset of the record to read next. The reader returns -EPERM when
+ * it is 0 and advances it after each successfully handled record.
+ */
+ loff_t md_record_offset;
+ /*
+ * Offset of the previously handled record; every record's
+ * h_prev_md_offset must be equal to it.
+ */
+ loff_t md_prev_record_offset;
+ /*
+ * NOTE(review): not touched by the reader; presumably private data
+ * for the callbacks - confirm against the callers.
+ */
+ void *context;
+
+ /* Buffer the current record is read into, viewed by record type. */
+ union {
+ struct incfs_md_header md_header;
+ struct incfs_inode inode;
+ struct incfs_blockmap blockmap;
+ struct incfs_dir_action dir_action;
+ } md_buffer;
+
+ /*
+ * Per-type callbacks; each may be NULL, in which case records of that
+ * type are skipped. They receive a pointer into md_buffer. A non-zero
+ * return value is passed back to the caller of the reader and the
+ * cursor is not advanced.
+ */
+ int (*handle_inode)(struct incfs_inode *inode,
+ struct metadata_handler *handler);
+ int (*handle_blockmap)(struct incfs_blockmap *bm,
+ struct metadata_handler *handler);
+ int (*handle_dir_action)(struct incfs_dir_action *da,
+ struct metadata_handler *handler);
+};
+/*
+ * Largest metadata record the reader can handle: the size of the
+ * metadata_handler.md_buffer union, i.e. of its biggest member.
+ */
+#define INCFS_MAX_METADATA_RECORD_SIZE \
+ FIELD_SIZEOF(struct metadata_handler, md_buffer)
+
+/* Current end offset of a file, taken from its inode size. */
+loff_t incfs_get_end_offset(struct file *f);
+
+/* Backing file context management */
+
+/* Returns a new context or ERR_PTR(-ENOMEM). */
+struct backing_file_context *incfs_alloc_bfc(struct file *backing_file);
+
+/* Calls fput() on the backing file and frees the context; NULL is allowed. */
+void incfs_free_bfc(struct backing_file_context *bfc);
+
+/*
+ * Writing stuff
+ *
+ * Apart from incfs_make_empty_backing_file(), which takes bc_mutex itself,
+ * the functions below are marked LOCK_REQUIRED(bfc->bc_mutex). They return
+ * 0 on success or a negative errno.
+ */
+
+/* Append an incfs_inode metadata record. */
+int incfs_write_inode_to_backing_file(struct backing_file_context *bfc, u64 ino,
+ u64 size, u16 mode);
+
+/* Append an incfs_dir_action metadata record. */
+int incfs_write_dir_action(struct backing_file_context *bfc, u64 dir_ino,
+ u64 dentry_ino, enum incfs_dir_action_type type,
+ struct mem_range name);
+
+/*
+ * Reserve a zero-filled blockmap area and append an incfs_blockmap record
+ * pointing to it; the area's offset is stored in *map_base_off if set.
+ */
+int incfs_write_blockmap_to_backing_file(struct backing_file_context *bfc,
+ u64 ino, u32 block_count,
+ loff_t *map_base_off);
+
+/* Write the superblock; fails with -EEXIST unless the file is empty. */
+int incfs_write_sb_to_backing_file(struct backing_file_context *bfc);
+
+/* Append a data block and update its blockmap entry. */
+int incfs_write_data_block_to_backing_file(struct backing_file_context *bfc,
+ struct mem_range block,
+ int block_index, loff_t bm_base_off,
+ u16 flags, u32 crc);
+
+/* Truncate the backing file to zero length and write a fresh superblock. */
+int incfs_make_empty_backing_file(struct backing_file_context *bfc);
+
+/* Reading stuff */
+
+/* Validate the superblock and report the first metadata record's offset. */
+int incfs_read_superblock(struct backing_file_context *bfc,
+ loff_t *first_md_off);
+
+/* Read one blockmap entry; no LOCK_REQUIRED check is made here. */
+int incfs_read_blockmap_entry(struct backing_file_context *bfc, int block_index,
+ loff_t bm_base_off,
+ struct incfs_blockmap_entry *bm_entry);
+
+/* Read, validate and dispatch the record the handler currently points at. */
+int incfs_read_next_metadata_record(struct backing_file_context *bfc,
+ struct metadata_handler *handler);
+
+#endif /* _INCFS_FORMAT_H */
diff --git a/fs/incfs/internal.h b/fs/incfs/internal.h
new file mode 100644
index 000000000000..de8b6240e347
--- /dev/null
+++ b/fs/incfs/internal.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2018 Google LLC
+ */
+#ifndef _INCFS_INTERNAL_H
+#define _INCFS_INTERNAL_H
+#include <linux/types.h>
+
+/*
+ * A pointer/length pair describing a span of bytes.
+ * The structure only refers to the memory; nothing in this header
+ * allocates or frees it.
+ */
+struct mem_range {
+ u8 *data; /* first byte of the span */
+ size_t len; /* number of bytes in the span */
+};
+
+/* Build a mem_range describing @len bytes starting at @data. */
+static inline struct mem_range range(u8 *data, size_t len)
+{
+	struct mem_range r = {
+		.data = data,
+		.len = len,
+	};
+
+	return r;
+}
+
+/*
+ * Debug-only assertion that a mutex is locked at this point.
+ *
+ * With DEBUG defined, an unlocked mutex is reported with pr_err() and the
+ * kernel panics. Without DEBUG the macro does nothing and does not
+ * evaluate its argument.
+ */
+#ifndef DEBUG
+#define LOCK_REQUIRED(lock) do { } while (0)
+#else
+#define LOCK_REQUIRED(lock)					\
+	do {							\
+		if (mutex_is_locked(&(lock)))			\
+			break;					\
+		pr_err(#lock " must be taken");			\
+		panic("Lock not taken.");			\
+	} while (0)
+#endif
+
+#endif /* _INCFS_INTERNAL_H */
--
2.21.0.593.g511ec345e18-goog
next prev parent reply other threads:[~2019-05-02 4:03 UTC|newest]
Thread overview: 33+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-05-02 4:03 Initial patches for Incremental FS ezemtsov
2019-05-02 4:03 ` [PATCH 1/6] incfs: Add first files of incrementalfs ezemtsov
2019-05-02 19:06 ` Miklos Szeredi
2019-05-02 20:41 ` Randy Dunlap
2019-05-07 15:57 ` Jann Horn
2019-05-07 17:13 ` Greg KH
2019-05-07 17:18 ` Greg KH
2019-05-02 4:03 ` ezemtsov [this message]
2019-05-02 4:03 ` [PATCH 3/6] incfs: Management of in-memory FS data structures ezemtsov
2019-05-02 4:03 ` [PATCH 4/6] incfs: Integration with VFS layer ezemtsov
2019-05-02 4:03 ` [PATCH 6/6] incfs: Integration tests for incremental-fs ezemtsov
2019-05-02 11:19 ` Initial patches for Incremental FS Amir Goldstein
2019-05-02 13:10 ` Theodore Ts'o
2019-05-02 13:26 ` Al Viro
2019-05-03 4:23 ` Eugene Zemtsov
2019-05-03 5:19 ` Amir Goldstein
2019-05-08 20:09 ` Eugene Zemtsov
2019-05-09 8:15 ` Amir Goldstein
[not found] ` <CAK8JDrEQnXTcCtAPkb+S4r4hORiKh_yX=0A0A=LYSVKUo_n4OA@mail.gmail.com>
2019-05-21 1:32 ` Yurii Zubrytskyi
2019-05-22 8:32 ` Miklos Szeredi
2019-05-22 17:25 ` Yurii Zubrytskyi
2019-05-23 4:25 ` Miklos Szeredi
2019-05-29 21:06 ` Yurii Zubrytskyi
2019-05-30 9:22 ` Miklos Szeredi
2019-05-30 22:45 ` Yurii Zubrytskyi
2019-05-31 9:02 ` Miklos Szeredi
2019-05-22 10:54 ` Amir Goldstein
2019-05-03 7:23 ` Richard Weinberger
2019-05-03 10:22 ` Miklos Szeredi
2019-05-02 13:46 ` Amir Goldstein
2019-05-02 18:16 ` Richard Weinberger
2019-05-02 18:33 ` Richard Weinberger
2019-05-02 13:47 ` J. R. Okajima
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20190502040331.81196-3-ezemtsov@google.com \
--to=ezemtsov@google.com \
--cc=linux-fsdevel@vger.kernel.org \
--cc=tytso@mit.edu \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).