From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mx143.netapp.com ([216.240.21.24]:26881 "EHLO mx143.netapp.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751362AbeCMRkk (ORCPT ); Tue, 13 Mar 2018 13:40:40 -0400 Subject: [RFC 6/7] zuf: Filesystem operations To: linux-fsdevel References: <8a4e01bd-0014-1c44-e89f-9d70ccbe0658@netapp.com> CC: Ric Wheeler , Miklos Szeredi , Steve French , Steven Whitehouse , Jefff moyer , Sage Weil , Jan Kara , Amir Goldstein , Andy Rudof , Anna Schumaker , Amit Golander , Sagi Manole , Shachar Sharon From: Boaz Harrosh Message-ID: Date: Tue, 13 Mar 2018 19:39:50 +0200 MIME-Version: 1.0 In-Reply-To: <8a4e01bd-0014-1c44-e89f-9d70ccbe0658@netapp.com> Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 7bit Sender: linux-fsdevel-owner@vger.kernel.org List-ID: The principle for all operations is the same. * The few parameters are given on the despatch IOCTL buffer (up to 4k of parameters) * Any application buffers or other big buffers like readdir are mapped via zuf-core to the the Server VM * The operation is despatched. Return code and few out parameter are returned in the despatch-return buffer. Any data stored/read at mapped application buffers. * zus objects like zus_inode symlinks and so on are returned through a dpp_t (Dual port pointer) - a special kind of zuf construct that enables to have a Kernel pointer and a server pointer to the same memory. If pmem is used this is usually a pointer to pmem. The Kernel's zuf part may even write to this returned pointer but it will then send a despatch, for the FS to persist the change. TODO: This patch is probably too big. how best to split it? Signed-off-by: Boaz Harrosh --- fs/zuf/Makefile | 2 +- fs/zuf/_extern.h | 48 +++++ fs/zuf/directory.c | 156 ++++++++++++++ fs/zuf/file.c | 403 ++++++++++++++++++++++++++++++++++ fs/zuf/inode.c | 617 ++++++++++++++++++++++++++++++++++++++++++++++++++++- fs/zuf/namei.c | 421 ++++++++++++++++++++++++++++++++++++ fs/zuf/symlink.c | 76 +++++++ fs/zuf/zus_api.h | 234 ++++++++++++++++++++ 8 files changed, 1953 insertions(+), 4 deletions(-) create mode 100644 fs/zuf/directory.c create mode 100644 fs/zuf/file.c create mode 100644 fs/zuf/namei.c create mode 100644 fs/zuf/symlink.c diff --git a/fs/zuf/Makefile b/fs/zuf/Makefile index 94ce80b..4c125f7 100644 --- a/fs/zuf/Makefile +++ b/fs/zuf/Makefile @@ -17,5 +17,5 @@ zuf-y += md.o t2.o t1.o zuf-y += zuf-core.o zuf-root.o # Main FS -zuf-y += super.o inode.o +zuf-y += super.o inode.o directory.o file.o namei.o symlink.o zuf-y += module.o diff --git a/fs/zuf/_extern.h b/fs/zuf/_extern.h index 0543fd8..cf2e80f 100644 --- a/fs/zuf/_extern.h +++ b/fs/zuf/_extern.h @@ -19,16 +19,43 @@ * extern functions declarations */ +/* directory.c */ +int zuf_add_dentry(struct inode *dir, struct qstr *str, + struct inode *inode, bool rename); +int zuf_remove_dentry(struct inode *dir, struct qstr *str); + /* inode.c */ +int zuf_evict_dispatch(struct super_block *sb, struct zus_inode_info *zus_ii, + int operation); struct inode *zuf_iget(struct super_block *sb, struct zus_inode_info *zus_ii, zu_dpp_t _zi, bool *exist); void zuf_evict_inode(struct inode *inode); +struct inode *zuf_new_inode(struct inode *dir, umode_t mode, + const struct qstr *qstr, const char *symname, + ulong rdev_or_isize, bool tmpfile); int zuf_write_inode(struct inode *inode, struct writeback_control *wbc); +int zuf_update_time(struct inode *inode, struct timespec *time, int flags); +int zuf_setattr(struct dentry *dentry, struct iattr *attr); +int zuf_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags); +void zuf_set_inode_flags(struct inode *inode, struct zus_inode *zi); +bool zuf_dir_emit(struct super_block *sb, struct dir_context *ctx, + ulong ino, const char *name, int length); + +/* symlink.c */ +uint zuf_prepare_symname(struct zufs_ioc_new_inode *ioc_new_inode, + const char *symname, ulong len, struct page *pages[2]); + +/* file.c */ int zuf_isync(struct inode *inode, loff_t start, loff_t end, int datasync); /* super.c */ int zuf_init_inodecache(void); void zuf_destroy_inodecache(void); + +void zuf_sync_inc(struct inode *inode); +void zuf_sync_dec(struct inode *inode, ulong write_unmapped); + struct dentry *zuf_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data); @@ -56,4 +83,25 @@ int zuf_register_fs(struct super_block *sb, struct zufs_ioc_register_fs *rfs); /* t1.c */ int zuf_pmem_mmap(struct file *file, struct vm_area_struct *vma); +/* + * Inodes and files operations + */ + +/* dir.c */ +extern const struct file_operations zuf_dir_operations; + +/* file.c */ +extern const struct inode_operations zuf_file_inode_operations; +extern const struct file_operations zuf_file_operations; + +/* inode.c */ +extern const struct address_space_operations zuf_aops; + +/* namei.c */ +extern const struct inode_operations zuf_dir_inode_operations; +extern const struct inode_operations zuf_special_inode_operations; + +/* symlink.c */ +extern const struct inode_operations zuf_symlink_inode_operations; + #endif /*ndef __ZUF_EXTERN_H__*/ diff --git a/fs/zuf/directory.c b/fs/zuf/directory.c new file mode 100644 index 0000000..f8f68b8 --- /dev/null +++ b/fs/zuf/directory.c @@ -0,0 +1,156 @@ +/* + * BRIEF DESCRIPTION + * + * File operations for directories. + * + * Copyright (c) 2018 NetApp Inc. All rights reserved. + * + * ZUFS-License: GPL-2.0 OR BSD-3-Clause. See module.c for LICENSE details. + * + * Authors: + * Boaz Harrosh + * Sagi Manole " + */ + +#include +#include +#include "zuf.h" + +static int zuf_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + loff_t i_size = i_size_read(inode); + struct zufs_ioc_readdir ioc_readdir = { + .hdr.in_len = sizeof(ioc_readdir), + .hdr.out_len = sizeof(ioc_readdir), + .hdr.operation = ZUS_OP_READDIR, + .dir_ii = ZUII(inode)->zus_ii, + }; + struct zufs_readdir_iter rdi; + struct page *pages[ZUS_API_MAP_MAX_PAGES]; + struct zufs_dir_entry *zde; + void *addr, *__a; + uint nump, i; + int err; + + if (ctx->pos && i_size <= ctx->pos) + return 0; + if (!i_size) + i_size = PAGE_SIZE; /* Just for the . && .. */ + + ioc_readdir.hdr.len = min_t(loff_t, i_size - ctx->pos, + ZUS_API_MAP_MAX_SIZE); + nump = md_o2p_up(ioc_readdir.hdr.len); + addr = vzalloc(md_p2o(nump)); + if (unlikely(!addr)) + return -ENOMEM; + + WARN_ON((ulong)addr & (PAGE_SIZE - 1)); + + __a = addr; + for (i = 0; i < nump; ++i) { + pages[i] = vmalloc_to_page(__a); + __a += PAGE_SIZE; + } + +more: + ioc_readdir.pos = ctx->pos; + + err = zufs_dispatch(ZUF_ROOT(SBI(sb)), &ioc_readdir.hdr, pages, nump); + if (unlikely(err)) { + zuf_err("zufs_dispatch failed => %d\n", err); + goto out; + } + + zufs_readdir_iter_init(&rdi, &ioc_readdir, addr); + while ((zde = zufs_next_zde(&rdi)) != NULL) { + zuf_dbg_verbose("%s pos=0x%lx\n", + zde->zstr.name, (ulong)zde->pos); + ctx->pos = zde->pos; + if (!dir_emit(ctx, zde->zstr.name, zde->zstr.len, zde->ino, + zde->type)) + goto out; + } + ctx->pos = ioc_readdir.pos; + if (ioc_readdir.more) { + zuf_dbg_err("more\n"); + goto more; + } +out: + vfree(addr); + return err; +} + +/* + *FIXME comment to full git diff + */ + +static int _dentry_dispatch(struct inode *dir, struct inode *inode, + struct qstr *str, int operation) +{ + struct zufs_ioc_dentry ioc_dentry = { + .hdr.operation = operation, + .hdr.in_len = sizeof(ioc_dentry), + .hdr.out_len = sizeof(ioc_dentry), + .zus_ii = inode ? ZUII(inode)->zus_ii : NULL, + .zus_dir_ii = ZUII(dir)->zus_ii, + .str.len = str->len, + }; + int err; + + memcpy(&ioc_dentry.str.name, str->name, str->len); + + err = zufs_dispatch(ZUF_ROOT(SBI(dir->i_sb)), &ioc_dentry.hdr, NULL, 0); + if (unlikely(err)) { + zuf_err("op=%d zufs_dispatch failed => %d\n", operation, err); + return err; + } + + return 0; +} + +/* return pointer to added de on success, err-code on failure */ +int zuf_add_dentry(struct inode *dir, struct qstr *str, struct inode *inode, + bool rename) +{ + struct zuf_inode_info *zii = ZUII(dir); + int err; + + if (!str->len || !zii->zi) + return -EINVAL; + + zus_inode_cmtime_now(dir, zii->zi); + err = _dentry_dispatch(dir, inode, str, ZUS_OP_ADD_DENTRY); + if (unlikely(err)) { + zuf_err("_dentry_dispatch failed => %d\n", err); + return err; + } + i_size_write(dir, le64_to_cpu(zii->zi->i_size)); + + return 0; +} + +int zuf_remove_dentry(struct inode *dir, struct qstr *str) +{ + struct zuf_inode_info *zii = ZUII(dir); + int err; + + if (!str->len) + return -EINVAL; + + zus_inode_cmtime_now(dir, zii->zi); + err = _dentry_dispatch(dir, NULL, str, ZUS_OP_REMOVE_DENTRY); + if (unlikely(err)) + return err; + + i_size_write(dir, le64_to_cpu(zii->zi->i_size)); + return 0; +} + +const struct file_operations zuf_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate_shared = zuf_readdir, + .fsync = noop_fsync, +}; diff --git a/fs/zuf/file.c b/fs/zuf/file.c new file mode 100644 index 0000000..3b37d9f --- /dev/null +++ b/fs/zuf/file.c @@ -0,0 +1,403 @@ +/* + * BRIEF DESCRIPTION + * + * File operations for files. + * + * Copyright (c) 2018 NetApp Inc. All rights reserved. + * + * ZUFS-License: GPL-2.0 OR BSD-3-Clause. See module.c for LICENSE details. + * + * Authors: + * Boaz Harrosh + * Sagi Manole " + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zuf.h" + +static long zuf_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct zuf_inode_info *zii = ZUII(inode); + struct zufs_ioc_range ioc_range = { + .hdr.in_len = sizeof(ioc_range), + .hdr.operation = ZUS_OP_FALLOCATE, + .zus_ii = ZUII(inode)->zus_ii, + .offset = offset, + .length = len, + .opflags = mode, + }; + int err; + + zuf_dbg_vfs("[%ld] mode=0x%x offset=0x%llx len=0x%llx\n", + inode->i_ino, mode, offset, len); + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + zuf_w_lock(zii); + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + (i_size_read(inode) < offset + len)) { + err = inode_newsize_ok(inode, offset + len); + if (unlikely(err)) + goto out; + } + + zus_inode_cmtime_now(inode, zii->zi); + + err = zufs_dispatch(ZUF_ROOT(SBI(inode->i_sb)), &ioc_range.hdr, + NULL, 0); + if (unlikely(err)) + zuf_err("zufs_dispatch failed => %d\n", err); + + i_size_write(inode, le64_to_cpu(zii->zi->i_size)); + inode->i_blocks = le64_to_cpu(zii->zi->i_blocks); + +out: + zuf_w_unlock(zii); + + return err; +} + +static loff_t zuf_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct zuf_inode_info *zii = ZUII(inode); + struct zufs_ioc_seek ioc_seek = { + .hdr.in_len = sizeof(ioc_seek), + .hdr.out_len = sizeof(ioc_seek), + .hdr.operation = ZUS_OP_LLSEEK, + .zus_ii = zii->zus_ii, + .offset_in = offset, + .whence = whence, + }; + int err = 0; + + zuf_dbg_vfs("[%ld] offset=0x%llx whence=%d\n", + inode->i_ino, offset, whence); + + if (whence != SEEK_DATA && whence != SEEK_HOLE) + return generic_file_llseek(file, offset, whence); + + zuf_r_lock(zii); + + if ((offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) || + offset > inode->i_sb->s_maxbytes) { + err = -EINVAL; + goto out; + } else if (inode->i_size <= offset) { + err = -ENXIO; + goto out; + } else if (!inode->i_blocks) { + if (whence == SEEK_HOLE) + ioc_seek.offset_out = i_size_read(inode); + else + err = -ENXIO; + goto out; + } + + err = zufs_dispatch(ZUF_ROOT(SBI(inode->i_sb)), &ioc_seek.hdr, NULL, 0); + if (unlikely(err)) { + zuf_err("zufs_dispatch failed => %d\n", err); + goto out; + } + + if (ioc_seek.offset_out != file->f_pos) { + file->f_pos = ioc_seek.offset_out; + file->f_version = 0; + } + +out: + zuf_r_unlock(zii); + + return err ?: ioc_seek.offset_out; +} + +/* This function is called by both msync() and fsync(). */ +int zuf_isync(struct inode *inode, loff_t start, loff_t end, int datasync) +{ + struct zuf_inode_info *zii = ZUII(inode); + struct zufs_ioc_range ioc_range = { + .hdr.in_len = sizeof(ioc_range), + .hdr.operation = ZUS_OP_SYNC, + .zus_ii = zii->zus_ii, + .offset = start, + .opflags = datasync, + }; + loff_t isize; + ulong uend = end + 1; + int err = 0; + + zuf_dbg_vfs( + "[%ld] start=0x%llx end=0x%llx datasync=%d write_mapped=%d\n", + inode->i_ino, start, end, datasync, + atomic_read(&zii->write_mapped)); + + /* We want to serialize the syncs so they don't fight with each other + * and is though more efficient, but we do not want to lock out + * read/writes and page-faults so we have a special sync semaphore + */ + zuf_smw_lock(zii); + + isize = i_size_read(inode); + if (!isize) { + zuf_dbg_mmap("[%ld] file is empty\n", inode->i_ino); + goto out; + } + if (isize < uend) + uend = isize; + if (uend < start) { + zuf_dbg_mmap("[%ld] isize=0x%llx start=0x%llx end=0x%lx\n", + inode->i_ino, isize, start, uend); + err = -ENODATA; + goto out; + } + + if (!atomic_read(&zii->write_mapped)) + goto out; /* Nothing to do on this inode */ + + ioc_range.length = uend - start; + unmap_mapping_range(inode->i_mapping, start, ioc_range.length, 0); + + err = zufs_dispatch(ZUF_ROOT(SBI(inode->i_sb)), &ioc_range.hdr, + NULL, 0); + if (unlikely(err)) + zuf_err("zufs_dispatch failed => %d\n", err); + + zuf_sync_dec(inode, ioc_range.write_unmapped); + +out: + zuf_smw_unlock(zii); + return err; +} + +static int zuf_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + return zuf_isync(file_inode(file), start, end, datasync); +} + +/* This callback is called when a file is closed */ +static int zuf_flush(struct file *file, fl_owner_t id) +{ + zuf_dbg_vfs("[%ld]\n", file->f_inode->i_ino); + + return 0; +} + +static int tozu_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 offset, u64 len) +{ + int err = 0; + ulong start_index = md_o2p(offset); + ulong end_index = md_o2p_up(offset + len); + struct zuf_inode_info *zii = ZUII(inode); + + zuf_dbg_vfs( + "[%ld] offset=0x%llx len=0x%llx i-start=0x%lx i-end=0x%lx\n", + inode->i_ino, offset, len, start_index, end_index); + + if (fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC)) + return -EBADR; + + zuf_r_lock(zii); + + /* TODO: ZUS fiemap (&msi)*/ + + zuf_r_unlock(zii); + return err; +} + +static void _lock_two_ziis(struct zuf_inode_info *zii1, + struct zuf_inode_info *zii2) +{ + if (zii1 > zii2) + swap(zii2, zii2); + + zuf_w_lock(zii1); + if (zii1 != zii2) + zuf_w_lock_nested(zii2); +} + +static void _unlock_two_ziis(struct zuf_inode_info *zii1, + struct zuf_inode_info *zii2) +{ + if (zii1 > zii2) + swap(zii2, zii2); + + if (zii1 != zii2) + zuf_w_unlock(zii2); + zuf_w_unlock(zii1); +} + +static int _clone_file_range(struct inode *src_inode, loff_t pos_in, + struct inode *dst_inode, loff_t pos_out, + u64 len, int operation) +{ + struct zuf_inode_info *src_zii = ZUII(src_inode); + struct zuf_inode_info *dst_zii = ZUII(dst_inode); + struct zus_inode *dst_zi = dst_zii->zi; + struct super_block *sb = src_inode->i_sb; + struct zufs_ioc_clone ioc_clone = { + .hdr.in_len = sizeof(ioc_clone), + .hdr.out_len = sizeof(ioc_clone), + .hdr.operation = operation, + .src_zus_ii = src_zii->zus_ii, + .dst_zus_ii = dst_zii->zus_ii, + .pos_in = pos_in, + .pos_out = pos_out, + .len = len, + }; + int err; + + _lock_two_ziis(src_zii, dst_zii); + + /* NOTE: len==0 means to-end-of-file which is what we want */ + unmap_mapping_range(src_inode->i_mapping, pos_in, len, 0); + unmap_mapping_range(dst_inode->i_mapping, pos_out, len, 0); + + zus_inode_cmtime_now(dst_inode, dst_zi); + err = zufs_dispatch(ZUF_ROOT(SBI(sb)), &ioc_clone.hdr, NULL, 0); + if (unlikely(err)) { + zuf_err("failed to clone %ld -> %ld ; err=%d\n", + src_inode->i_ino, dst_inode->i_ino, err); + goto out; + } + + dst_inode->i_blocks = le64_to_cpu(dst_zi->i_blocks); + i_size_write(dst_inode, dst_zi->i_size); + +out: + _unlock_two_ziis(src_zii, dst_zii); + + return err; +} + +static int zuf_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, u64 len) +{ + struct inode *src_inode = file_inode(file_in); + struct inode *dst_inode = file_inode(file_out); + ulong src_size = i_size_read(src_inode); + ulong dst_size = i_size_read(dst_inode); + struct super_block *sb = src_inode->i_sb; + ulong len_up = len; + int err; + + zuf_dbg_vfs( + "ino-in=%ld ino-out=%ld pos_in=0x%llx pos_out=0x%llx length=0x%llx\n", + src_inode->i_ino, dst_inode->i_ino, pos_in, pos_out, len); + + if (src_inode == dst_inode) { + if (pos_in == pos_out) { + zuf_dbg_err("[%ld] Clone nothing!!\n", + src_inode->i_ino); + return 0; + } + if (pos_in < pos_out) { + if (pos_in + len > pos_out) { + zuf_dbg_err( + "[%ld] overlapping pos_in < pos_out?? => EINVAL\n", + src_inode->i_ino); + return -EINVAL; + } + } else { + if (pos_out + len > pos_in) { + zuf_dbg_err("[%ld] overlapping pos_out < pos_in?? => EINVAL\n", + src_inode->i_ino); + return -EINVAL; + } + } + } + + if ((pos_in & (sb->s_blocksize - 1)) || + (pos_out & (sb->s_blocksize - 1))) { + zuf_err("[%ld] Not aligned len=0x%llx pos_in=0x%llx " + "pos_out=0x%llx src-size=0x%llx dst-size=0x%llx\n", + src_inode->i_ino, len, pos_in, pos_out, + i_size_read(src_inode), i_size_read(dst_inode)); + return -EINVAL; + } + + /* STD says that len==0 means up to end of SRC */ + if (!len) + len_up = len = src_size - pos_in; + + if (!pos_in && !pos_out && (src_size <= pos_in + len) && + (dst_size <= src_size)) { + len_up = 0; + } else if (len & (sb->s_blocksize - 1)) { + /* un-aligned len, see if it is beyond EOF */ + if ((src_size > pos_in + len) || + (dst_size > pos_out + len)) { + zuf_err("[%ld] Not aligned len=0x%llx pos_in=0x%llx " + "pos_out=0x%llx src-size=0x%lx dst-size=0x%lx\n", + src_inode->i_ino, len, pos_in, pos_out, + src_size, dst_size); + return -EINVAL; + } + len_up = md_p2o(md_o2p_up(len)); + } + + err = _clone_file_range(src_inode, pos_in, dst_inode, pos_out, len_up, + ZUS_OP_CLONE); + if (unlikely(err)) + zuf_err("_clone_file_range failed => %d\n", err); + + return err; +} + +static ssize_t zuf_copy_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + size_t len, uint flags) +{ + struct inode *src_inode = file_inode(file_in); + struct inode *dst_inode = file_inode(file_out); + ssize_t ret; + + zuf_dbg_vfs("ino-in=%ld ino-out=%ld pos_in=0x%llx pos_out=0x%llx length=0x%lx\n", + src_inode->i_ino, dst_inode->i_ino, pos_in, pos_out, len); + + ret = _clone_file_range(src_inode, pos_in, dst_inode, pos_out, len, + ZUS_OP_COPY); + + return ret ?: len; +} + +/* ZUFS: + * make sure we clean up the resources consumed by zufs_init() + */ +static int zuf_file_release(struct inode *inode, struct file *filp) +{ + if (unlikely(filp->private_data)) + zuf_err("not yet\n"); + + return 0; +} + +const struct file_operations zuf_file_operations = { + .llseek = zuf_llseek, + .open = generic_file_open, + .fsync = zuf_fsync, + .flush = zuf_flush, + .release = zuf_file_release, + .fallocate = zuf_fallocate, + .copy_file_range = zuf_copy_file_range, + .clone_file_range = zuf_clone_file_range, +}; + +const struct inode_operations zuf_file_inode_operations = { + .setattr = zuf_setattr, + .getattr = zuf_getattr, + .update_time = zuf_update_time, + .fiemap = tozu_fiemap, +}; diff --git a/fs/zuf/inode.c b/fs/zuf/inode.c index 7aa8c9e..1129aae 100644 --- a/fs/zuf/inode.c +++ b/fs/zuf/inode.c @@ -12,16 +12,424 @@ * Sagi Manole " */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include "zuf.h" +/* Flags that should be inherited by new inodes from their parent. */ +#define ZUFS_FL_INHERITED (FS_SECRM_FL | FS_UNRM_FL | FS_COMPR_FL | \ + FS_SYNC_FL | FS_NODUMP_FL | FS_NOATIME_FL | \ + FS_COMPRBLK_FL | FS_NOCOMP_FL | \ + FS_JOURNAL_DATA_FL | FS_NOTAIL_FL | FS_DIRSYNC_FL) +/* Flags that are appropriate for regular files (all but dir-specific ones). */ +#define ZUFS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) +/* Flags that are appropriate for non-directories/regular files. */ +#define ZUFS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) + + +static bool _zi_valid(struct zus_inode *zi) +{ + if (!_zi_active(zi)) + return false; + + switch (le16_to_cpu(zi->i_mode) & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + case S_IFBLK: + case S_IFCHR: + case S_IFIFO: + case S_IFSOCK: + return true; + default: + zuf_err("unknown file type ino=%lld mode=%d\n", zi->i_ino, + zi->i_mode); + return false; + } +} + +static void _set_inode_from_zi(struct inode *inode, struct zus_inode *zi) +{ + inode->i_mode = le16_to_cpu(zi->i_mode); + inode->i_uid = KUIDT_INIT(le32_to_cpu(zi->i_uid)); + inode->i_gid = KGIDT_INIT(le32_to_cpu(zi->i_gid)); + set_nlink(inode, le16_to_cpu(zi->i_nlink)); + inode->i_size = le64_to_cpu(zi->i_size); + inode->i_size = le64_to_cpu(zi->i_blocks); + mt_to_timespec(&inode->i_atime, &zi->i_atime); + mt_to_timespec(&inode->i_ctime, &zi->i_ctime); + mt_to_timespec(&inode->i_mtime, &zi->i_mtime); + inode->i_generation = le64_to_cpu(zi->i_generation); + zuf_set_inode_flags(inode, zi); + + inode->i_blocks = le64_to_cpu(zi->i_blocks); + inode->i_mapping->a_ops = &zuf_aops; + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_op = &zuf_file_inode_operations; + inode->i_fop = &zuf_file_operations; + break; + case S_IFDIR: + inode->i_op = &zuf_dir_inode_operations; + inode->i_fop = &zuf_dir_operations; + break; + case S_IFLNK: + inode->i_op = &zuf_symlink_inode_operations; + break; + case S_IFBLK: + case S_IFCHR: + case S_IFIFO: + case S_IFSOCK: + inode->i_size = 0; + inode->i_op = &zuf_special_inode_operations; + init_special_inode(inode, inode->i_mode, + le32_to_cpu(zi->i_rdev)); + break; + default: + zuf_err("unknown file type ino=%lld mode=%d\n", zi->i_ino, + zi->i_mode); + break; + } + + inode->i_ino = le64_to_cpu(zi->i_ino); +} + +static void tozu_get_inode_flags(struct inode *inode, struct zus_inode *zi) +{ + unsigned int flags = inode->i_flags; + unsigned int tozu_flags = le32_to_cpu(zi->i_flags); + + tozu_flags &= ~(FS_SYNC_FL | FS_APPEND_FL | FS_IMMUTABLE_FL | + FS_NOATIME_FL | FS_DIRSYNC_FL); + if (flags & S_SYNC) + tozu_flags |= FS_SYNC_FL; + if (flags & S_APPEND) + tozu_flags |= FS_APPEND_FL; + if (flags & S_IMMUTABLE) + tozu_flags |= FS_IMMUTABLE_FL; + if (flags & S_NOATIME) + tozu_flags |= FS_NOATIME_FL; + if (flags & S_DIRSYNC) + tozu_flags |= FS_DIRSYNC_FL; + + zi->i_flags = cpu_to_le32(tozu_flags); +} + +/* Mask out flags that are inappropriate for the given type of inode. */ +static __le32 _mask_flags(umode_t mode, __le32 flags) +{ + flags &= cpu_to_le32(ZUFS_FL_INHERITED); + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & cpu_to_le32(ZUFS_REG_FLMASK); + else + return flags & cpu_to_le32(ZUFS_OTHER_FLMASK); +} + +static int _set_zi_from_inode(struct inode *dir, struct zus_inode *zi, + struct inode *inode) +{ + struct zus_inode *zidir = zus_zi(dir); + + if (unlikely(!zidir)) + return -EACCES; + + zi->i_flags = _mask_flags(inode->i_mode, zidir->i_flags); + zi->i_mode = cpu_to_le16(inode->i_mode); + zi->i_uid = cpu_to_le32(__kuid_val(inode->i_uid)); + zi->i_gid = cpu_to_le32(__kgid_val(inode->i_gid)); + /* NOTE: zus is boss of i_nlink (but let it know what we think) */ + zi->i_nlink = cpu_to_le16(inode->i_nlink); + zi->i_size = cpu_to_le64(inode->i_size); + zi->i_blocks = cpu_to_le64(inode->i_blocks); + timespec_to_mt(&zi->i_atime, &inode->i_atime); + timespec_to_mt(&zi->i_mtime, &inode->i_mtime); + timespec_to_mt(&zi->i_ctime, &inode->i_ctime); + zi->i_generation = cpu_to_le32(inode->i_generation); + tozu_get_inode_flags(inode, zi); + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + zi->i_rdev = cpu_to_le32(inode->i_rdev); + + return 0; +} + +static bool _times_equal(struct timespec *t, __le64 *mt) +{ + __le64 time; + + timespec_to_mt(&time, t); + return time == *mt; +} + +/* This function checks if VFS's inode and zus_inode are in sync */ +static void _warn_inode_dirty(struct inode *inode, struct zus_inode *zi) +{ +#define __MISMACH_INT(inode, X, Y) \ + if (X != Y) \ + zuf_warn("[%ld] " #X"=0x%lx " #Y"=0x%lx""\n", \ + inode->i_ino, (ulong)(X), (ulong)(Y)) +#define __MISMACH_TIME(inode, X, Y) \ + if (!_times_equal(X, Y)) { \ + struct timespec t; \ + mt_to_timespec(&t, (Y));\ + zuf_warn("[%ld] " #X"=%ld:%ld " #Y"=%ld:%ld""\n", \ + inode->i_ino, (X)->tv_sec, (X)->tv_nsec, \ + t.tv_sec, t.tv_nsec); \ + } + + if (!_times_equal(&inode->i_ctime, &zi->i_ctime) || + !_times_equal(&inode->i_mtime, &zi->i_mtime) || + !_times_equal(&inode->i_atime, &zi->i_atime) || + inode->i_size != le64_to_cpu(zi->i_size) || + inode->i_mode != le16_to_cpu(zi->i_mode) || + __kuid_val(inode->i_uid) != le32_to_cpu(zi->i_uid) || + __kgid_val(inode->i_gid) != le32_to_cpu(zi->i_gid) || + inode->i_nlink != le16_to_cpu(zi->i_nlink) || + inode->i_ino != _zi_ino(zi) || + inode->i_blocks != le64_to_cpu(zi->i_blocks)) { + __MISMACH_TIME(inode, &inode->i_ctime, &zi->i_ctime); + __MISMACH_TIME(inode, &inode->i_mtime, &zi->i_mtime); + __MISMACH_TIME(inode, &inode->i_atime, &zi->i_atime); + __MISMACH_INT(inode, inode->i_size, le64_to_cpu(zi->i_size)); + __MISMACH_INT(inode, inode->i_mode, le16_to_cpu(zi->i_mode)); + __MISMACH_INT(inode, __kuid_val(inode->i_uid), + le32_to_cpu(zi->i_uid)); + __MISMACH_INT(inode, __kgid_val(inode->i_gid), + le32_to_cpu(zi->i_gid)); + __MISMACH_INT(inode, inode->i_nlink, le16_to_cpu(zi->i_nlink)); + __MISMACH_INT(inode, inode->i_ino, _zi_ino(zi)); + __MISMACH_INT(inode, inode->i_blocks, + le64_to_cpu(zi->i_blocks)); + } +} + +static void _zii_connect(struct inode *inode, struct zus_inode *zi, + struct zus_inode_info *zus_ii) +{ + struct zuf_inode_info *zii = ZUII(inode); + + zii->zi = zi; + zii->zus_ii = zus_ii; +} + struct inode *zuf_iget(struct super_block *sb, struct zus_inode_info *zus_ii, zu_dpp_t _zi, bool *exist) { - return ERR_PTR(-ENOMEM); + struct zuf_sb_info *sbi = SBI(sb); + struct zus_inode *zi = md_addr_verify(sbi->md, _zi); + struct inode *inode; + + if (unlikely(!zi)) { + /* Don't trust ZUS pointers */ + zuf_err("Bad zus_inode 0x%llx\n", _zi); + return ERR_PTR(-EIO); + } + if (unlikely(!zus_ii)) { + zuf_err("zus_ii NULL\n"); + return ERR_PTR(-EIO); + } + + if (!_zi_valid(zi)) { + zuf_err("inactive node ino=%lld links=%d mode=%d\n", zi->i_ino, + zi->i_nlink, zi->i_mode); + return ERR_PTR(-ESTALE); + } + + zuf_dbg_zus("[%lld] size=0x%llx, blocks=0x%llx ct=0x%llx mt=0x%llx link=0x%x mode=0x%x xattr=0x%llx\n", + zi->i_ino, zi->i_size, zi->i_blocks, zi->i_ctime, + zi->i_mtime, zi->i_nlink, zi->i_mode, zi->i_xattr); + + inode = iget_locked(sb, _zi_ino(zi)); + if (unlikely(!inode)) + return ERR_PTR(-ENOMEM); + + if (!(inode->i_state & I_NEW)) { + *exist = true; + return inode; + } + + *exist = false; + _set_inode_from_zi(inode, zi); + _zii_connect(inode, zi, zus_ii); + + unlock_new_inode(inode); + return inode; +} + +int zuf_evict_dispatch(struct super_block *sb, struct zus_inode_info *zus_ii, + int operation) +{ + struct zufs_ioc_evict_inode ioc_evict_inode = { + .hdr.in_len = sizeof(ioc_evict_inode), + .hdr.out_len = sizeof(ioc_evict_inode), + .hdr.operation = operation, + .zus_ii = zus_ii, + }; + int err; + + err = zufs_dispatch(ZUF_ROOT(SBI(sb)), &ioc_evict_inode.hdr, NULL, 0); + if (unlikely(err)) + zuf_err("zufs_dispatch failed op=%d => %d\n", + operation, err); + return err; } void zuf_evict_inode(struct inode *inode) { + struct super_block *sb = inode->i_sb; + struct zuf_inode_info *zii = ZUII(inode); + int operation; + int write_mapped; + + if (!inode->i_nlink) { + if (unlikely(!zii->zi)) { + zuf_dbg_err("[%ld] inode without zi mode=0x%x size=0x%llx\n", + inode->i_ino, inode->i_mode, inode->i_size); + goto out; + } + + if (unlikely(is_bad_inode(inode))) + zuf_warn("[%ld] inode is bad mode=0x%x zi=%p\n", + inode->i_ino, inode->i_mode, zii->zi); + else + _warn_inode_dirty(inode, zii->zi); + + operation = ZUS_OP_FREE_INODE; + } else { + zuf_dbg_verbose("[%ld] inode is going down?\n", inode->i_ino); + + if (unlikely(!inode || !sb || !sb->s_root || + !sb->s_root->d_inode || + !sb->s_root->d_inode->i_mapping)) + goto out; + + operation = ZUS_OP_EVICT_INODE; + } + + zuf_evict_dispatch(sb, zii->zus_ii, operation); + +out: + zii->zus_ii = NULL; + zii->zi = NULL; + + if (zii && zii->zero_page) { + zii->zero_page->mapping = NULL; + __free_pages(zii->zero_page, 0); + zii->zero_page = NULL; + } + + /* ZUS on evict has synced all mmap dirty pages, YES? */ + write_mapped = atomic_read(&zii->write_mapped); + if (unlikely(write_mapped || !list_empty(&zii->i_mmap_dirty))) { + zuf_dbg_mmap("[%ld] !!!! write_mapped=%d list_empty=%d\n", + inode->i_ino, write_mapped, + list_empty(&zii->i_mmap_dirty)); + zuf_sync_dec(inode, write_mapped); + } + + clear_inode(inode); +} + +/* @rdev_or_isize is i_size in the case of a symlink + * and rdev in the case of special-files + */ +struct inode *zuf_new_inode(struct inode *dir, umode_t mode, + const struct qstr *qstr, const char *symname, + ulong rdev_or_isize, bool tmpfile) +{ + struct super_block *sb = dir->i_sb; + struct zuf_sb_info *sbi = SBI(sb); + struct zufs_ioc_new_inode ioc_new_inode = { + .hdr.in_len = sizeof(ioc_new_inode), + .hdr.out_len = sizeof(ioc_new_inode), + .hdr.operation = ZUS_OP_NEW_INODE, + .dir_ii = ZUII(dir)->zus_ii, + .flags = tmpfile ? ZI_TMPFILE : 0, + .str.len = qstr->len, + }; + struct inode *inode; + struct zus_inode *zi; + struct page *pages[2]; + uint nump = 0; + int err; + + memcpy(&ioc_new_inode.str.name, qstr->name, qstr->len); + + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + inode_init_owner(inode, dir, mode); + inode->i_blocks = inode->i_size = 0; + inode->i_ctime = inode->i_mtime = current_time(dir); + inode->i_atime = inode->i_ctime; + + zuf_dbg_verbose("inode=%p name=%s\n", inode, qstr->name); + + zuf_set_inode_flags(inode, &ioc_new_inode.zi); + + err = _set_zi_from_inode(dir, &ioc_new_inode.zi, inode); + if (unlikely(err)) + goto fail; + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + init_special_inode(inode, mode, rdev_or_isize); + } else if (symname) { + inode->i_size = rdev_or_isize; + nump = zuf_prepare_symname(&ioc_new_inode, symname, + rdev_or_isize, pages); + } + + err = zufs_dispatch(ZUF_ROOT(sbi), &ioc_new_inode.hdr, pages, nump); + if (unlikely(err)) { + zuf_err("zufs_dispatch failed => %d\n", err); + goto fail; + } + zi = md_addr(sbi->md, ioc_new_inode._zi); + + _zii_connect(inode, zi, ioc_new_inode.zus_ii); + + /* update inode fields from filesystem inode */ + inode->i_ino = le64_to_cpu(zi->i_ino); + inode->i_size = le64_to_cpu(zi->i_size); + inode->i_generation = le64_to_cpu(zi->i_generation); + inode->i_blocks = le64_to_cpu(zi->i_blocks); + set_nlink(inode, le16_to_cpu(zi->i_nlink)); + i_size_write(dir, le64_to_cpu(zus_zi(dir)->i_size)); + + zuf_dbg_zus("[%lld] size=0x%llx, blocks=0x%llx ct=0x%llx mt=0x%llx link=0x%x mode=0x%x xattr=0x%llx\n", + zi->i_ino, zi->i_size, zi->i_blocks, zi->i_ctime, + zi->i_mtime, zi->i_nlink, zi->i_mode, zi->i_xattr); + + zuf_dbg_verbose("allocating inode %ld (zi=%p)\n", _zi_ino(zi), zi); + + err = insert_inode_locked(inode); + if (unlikely(err)) { + zuf_err("[%ld:%s] generation=%lld insert_inode_locked => %d\n", + inode->i_ino, qstr->name, zi->i_generation, err); + goto fail; + } + + return inode; + +fail: + clear_nlink(inode); + make_bad_inode(inode); + iput(inode); + return ERR_PTR(err); } int zuf_write_inode(struct inode *inode, struct writeback_control *wbc) @@ -38,8 +446,211 @@ int zuf_write_inode(struct inode *inode, struct writeback_control *wbc) return 0; } -/* This function is called by msync(), fsync() && sync_fs(). */ -int zuf_isync(struct inode *inode, loff_t start, loff_t end, int datasync) +/* + * Mostly supporting file_accessed() for now. Which is the only one we use. + * + * But also file_update_time is used by fifo code. + */ +int zuf_update_time(struct inode *inode, struct timespec *time, int flags) +{ + struct zus_inode *zi = zus_zi(inode); + struct zufs_ioc_attr ioc_attr = { + .hdr.in_len = sizeof(ioc_attr), + .hdr.out_len = sizeof(ioc_attr), + .hdr.operation = ZUS_OP_UPDATE_TIME, + .zus_ii = ZUII(inode)->zus_ii, + }; + int err; + + if (flags & S_ATIME) { + ioc_attr.zuf_attr |= STATX_ATIME; + inode->i_atime = *time; + timespec_to_mt(&zi->i_atime, &inode->i_atime); + } + + /* for Support of file_update_time() */ + if ((flags & S_CTIME) || (flags & S_MTIME) || (flags & S_VERSION)) { + if (flags & S_VERSION) { + ioc_attr.zuf_attr |= ZUFS_STATX_VERSION; + inode_inc_iversion(inode); + zi->i_generation = cpu_to_le64(inode->i_version); + } + if (flags & S_CTIME) { + ioc_attr.zuf_attr |= STATX_CTIME; + inode->i_ctime = *time; + timespec_to_mt(&zi->i_ctime, &inode->i_ctime); + } + if (flags & S_MTIME) { + ioc_attr.zuf_attr |= STATX_MTIME; + inode->i_mtime = *time; + timespec_to_mt(&zi->i_mtime, &inode->i_mtime); + } + } + + if (ioc_attr.zuf_attr == 0) + return 0; + + err = zufs_dispatch(ZUF_ROOT(SBI(inode->i_sb)), &ioc_attr.hdr, NULL, 0); + if (unlikely(err)) + zuf_err("zufs_dispatch failed => %d\n", err); + + return err; +} + +int zuf_getattr(const struct path *path, struct kstat *stat, u32 request_mask, + unsigned int flags) +{ + struct dentry *dentry = path->dentry; + struct inode *inode = d_inode(dentry); + + if (inode->i_flags & S_APPEND) + stat->attributes |= STATX_ATTR_APPEND; + if (inode->i_flags & S_IMMUTABLE) + stat->attributes |= STATX_ATTR_IMMUTABLE; + + stat->attributes_mask |= (STATX_ATTR_APPEND | + STATX_ATTR_IMMUTABLE); + generic_fillattr(inode, stat); + /* stat->blocks should be the number of 512B blocks */ + stat->blocks = inode->i_blocks << (inode->i_sb->s_blocksize_bits - 9); + + return 0; +} + +int zuf_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + struct zuf_inode_info *zii = ZUII(inode); + struct zus_inode *zi = zii->zi; + struct zufs_ioc_attr ioc_attr = { + .hdr.in_len = sizeof(ioc_attr), + .hdr.out_len = sizeof(ioc_attr), + .hdr.operation = ZUS_OP_SETATTR, + .zus_ii = zii->zus_ii, + }; + int err; + + if (!zi) + return -EACCES; + + err = setattr_prepare(dentry, attr); + if (unlikely(err)) + return err; + + if (attr->ia_valid & ATTR_MODE) { + zuf_dbg_vfs("[%ld] ATTR_MODE=0x%x\n", + inode->i_ino, attr->ia_mode); + ioc_attr.zuf_attr |= STATX_MODE; + inode->i_mode = attr->ia_mode; + zi->i_mode = cpu_to_le16(inode->i_mode); + if (test_opt(SBI(inode->i_sb), POSIXACL)) { + err = posix_acl_chmod(inode, inode->i_mode); + if (unlikely(err)) + return err; + } + } + + if (attr->ia_valid & ATTR_UID) { + zuf_dbg_vfs("[%ld] ATTR_UID=0x%x\n", + inode->i_ino, __kuid_val(attr->ia_uid)); + ioc_attr.zuf_attr |= STATX_UID; + inode->i_uid = attr->ia_uid; + zi->i_uid = cpu_to_le32(__kuid_val(inode->i_uid)); + } + if (attr->ia_valid & ATTR_GID) { + zuf_dbg_vfs("[%ld] ATTR_GID=0x%x\n", + inode->i_ino, __kgid_val(attr->ia_gid)); + ioc_attr.zuf_attr |= STATX_GID; + inode->i_gid = attr->ia_gid; + zi->i_gid = cpu_to_le32(__kgid_val(inode->i_gid)); + } + + if ((attr->ia_valid & ATTR_SIZE)) { + zuf_dbg_vfs("[%ld] ATTR_SIZE=0x%llx\n", + inode->i_ino, attr->ia_size); + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) { + zuf_err("[%ld] wrong file mode=%x\n", + inode->i_ino, inode->i_mode); + return -EINVAL; + } + ioc_attr.zuf_attr |= STATX_SIZE; + + ZUF_CHECK_I_W_LOCK(inode); + zuf_smw_lock(zii); + + /* Make all mmap() users FAULT for truncated pages */ + unmap_mapping_range(inode->i_mapping, + attr->ia_size + PAGE_SIZE - 1, 0, 1); + + ioc_attr.truncate_size = attr->ia_size; + /* on attr_size we want to update times as well */ + attr->ia_valid |= ATTR_CTIME | ATTR_MTIME; + } + + if (attr->ia_valid & ATTR_ATIME) { + ioc_attr.zuf_attr |= STATX_ATIME; + inode->i_atime = attr->ia_atime; + timespec_to_mt(&zi->i_atime, &inode->i_atime); + zuf_dbg_vfs("[%ld] ATTR_ATIME=0x%llx\n", + inode->i_ino, zi->i_atime); + } + if (attr->ia_valid & ATTR_CTIME) { + ioc_attr.zuf_attr |= STATX_CTIME; + inode->i_ctime = attr->ia_ctime; + timespec_to_mt(&zi->i_ctime, &inode->i_ctime); + zuf_dbg_vfs("[%ld] ATTR_CTIME=0x%llx\n", + inode->i_ino, zi->i_ctime); + } + if (attr->ia_valid & ATTR_MTIME) { + ioc_attr.zuf_attr |= STATX_MTIME; + inode->i_mtime = attr->ia_mtime; + timespec_to_mt(&zi->i_mtime, &inode->i_mtime); + zuf_dbg_vfs("[%ld] ATTR_MTIME=0x%llx\n", + inode->i_ino, zi->i_mtime); + } + + err = zufs_dispatch(ZUF_ROOT(SBI(inode->i_sb)), &ioc_attr.hdr, NULL, 0); + if (unlikely(err)) + zuf_err("zufs_dispatch failed => %d\n", err); + + if ((attr->ia_valid & ATTR_SIZE)) { + i_size_write(inode, le64_to_cpu(zi->i_size)); + inode->i_blocks = le64_to_cpu(zi->i_blocks); + + zuf_smw_unlock(zii); + } + + return err; +} + +void zuf_set_inode_flags(struct inode *inode, struct zus_inode *zi) +{ + unsigned int flags = le32_to_cpu(zi->i_flags); + + inode->i_flags &= + ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC); + if (flags & FS_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & FS_APPEND_FL) + inode->i_flags |= S_APPEND; + if (flags & FS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + if (flags & FS_NOATIME_FL) + inode->i_flags |= S_NOATIME; + if (flags & FS_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; + if (!zi->i_xattr) + inode_has_no_xattr(inode); +} + +/* direct_IO is not called. We set an empty one so open(O_DIRECT) will be happy + */ +static ssize_t zuf_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { + WARN_ON(1); return 0; } +const struct address_space_operations zuf_aops = { + .direct_IO = zuf_direct_IO, +}; diff --git a/fs/zuf/namei.c b/fs/zuf/namei.c new file mode 100644 index 0000000..179069b --- /dev/null +++ b/fs/zuf/namei.c @@ -0,0 +1,421 @@ +/* + * BRIEF DESCRIPTION + * + * Inode operations for directories. + * + * Copyright (c) 2018 NetApp Inc. All rights reserved. + * + * ZUFS-License: GPL-2.0 OR BSD-3-Clause. See module.c for LICENSE details. + * + * Authors: + * Boaz Harrosh + * Sagi Manole " + */ +#include +#include "zuf.h" + + +static struct inode *d_parent(struct dentry *dentry) +{ + return dentry->d_parent->d_inode; +} + +static void _instantiate_unlock(struct dentry *dentry, struct inode *inode) +{ + d_instantiate(dentry, inode); + unlock_new_inode(inode); +} + +static struct dentry *zuf_lookup(struct inode *dir, struct dentry *dentry, + uint flags) +{ + struct super_block *sb = dir->i_sb; + struct qstr *str = &dentry->d_name; + uint in_len = offsetof(struct zufs_ioc_lookup, _zi); + struct zufs_ioc_lookup ioc_lu = { + .hdr.in_len = in_len, + .hdr.out_start = in_len, + .hdr.out_len = sizeof(ioc_lu) - in_len, + .hdr.operation = ZUS_OP_LOOKUP, + .dir_ii = ZUII(dir)->zus_ii, + .str.len = str->len, + }; + struct inode *inode = NULL; + bool exist; + int err; + + zuf_dbg_vfs("[%ld] dentry-name=%s\n", dir->i_ino, dentry->d_name.name); + + if (dentry->d_name.len > ZUFS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + memcpy(&ioc_lu.str.name, str->name, str->len); + + err = zufs_dispatch(ZUF_ROOT(SBI(sb)), &ioc_lu.hdr, NULL, 0); + if (unlikely(err)) { + zuf_dbg_err("zufs_dispatch failed => %d\n", err); + goto out; + } + + inode = zuf_iget(dir->i_sb, ioc_lu.zus_ii, ioc_lu._zi, &exist); + if (exist) { + zuf_dbg_err("race in lookup\n"); + zuf_evict_dispatch(sb, ioc_lu.zus_ii, ZUS_OP_EVICT_INODE); + } + +out: + return d_splice_alias(inode, dentry); +} + +/* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it + * is so far negative - it has no inode. + * + * If the create succeeds, we fill in the inode information + * with d_instantiate(). + */ +static int zuf_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) +{ + struct inode *inode; + + zuf_dbg_vfs("[%ld] dentry-name=%s mode=0x%x\n", + dir->i_ino, dentry->d_name.name, mode); + + inode = zuf_new_inode(dir, mode, &dentry->d_name, NULL, 0, false); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &zuf_file_inode_operations; + inode->i_mapping->a_ops = &zuf_aops; + inode->i_fop = &zuf_file_operations; + + _instantiate_unlock(dentry, inode); + + return 0; +} + +static int zuf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, + dev_t rdev) +{ + struct inode *inode; + + zuf_dbg_vfs("[%ld] mode=0x%x rdev=0x%x\n", dir->i_ino, mode, rdev); + + inode = zuf_new_inode(dir, mode, &dentry->d_name, NULL, rdev, false); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &zuf_special_inode_operations; + + _instantiate_unlock(dentry, inode); + + return 0; +} + +static int zuf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct inode *inode; + + inode = zuf_new_inode(dir, mode, &dentry->d_name, NULL, 0, true); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + /* TODO: See about more ephemeral operations on this file, around + * mmap and such. + * Must see about that tmpfile mode that is later link_at + * (probably the !O_EXCL flag) + */ + inode->i_op = &zuf_file_inode_operations; + inode->i_mapping->a_ops = &zuf_aops; + inode->i_fop = &zuf_file_operations; + + set_nlink(inode, 1); /* user_mode knows nothing */ + d_tmpfile(dentry, inode); + /* tmpfile operate on nlink=0. Since this is a tmp file we do not care + * about cl_flushing. If later this file will be linked to a dir. the + * add_dentry will flush the zi. + */ + zus_zi(inode)->i_nlink = inode->i_nlink; + + unlock_new_inode(inode); + return 0; +} + +static int zuf_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct inode *inode; + ulong len; + + zuf_dbg_vfs("[%ld] de->name=%s symname=%s\n", + dir->i_ino, dentry->d_name.name, symname); + + len = strlen(symname); + if (len + 1 > ZUFS_MAX_SYMLINK) + return -ENAMETOOLONG; + + inode = zuf_new_inode(dir, S_IFLNK|S_IRWXUGO, &dentry->d_name, + symname, len, false); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &zuf_symlink_inode_operations; + inode->i_mapping->a_ops = &zuf_aops; + + _instantiate_unlock(dentry, inode); + + return 0; +} + +static int zuf_link(struct dentry *dest_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = dest_dentry->d_inode; + int err; + + zuf_dbg_vfs("[%ld] dentry-ino=%ld dentry-name=%s dentry-parent=%ld dest_d-ino=%ld dest_d-name=%s\n", + dir->i_ino, inode->i_ino, dentry->d_name.name, + d_parent(dentry)->i_ino, + dest_dentry->d_inode->i_ino, dest_dentry->d_name.name); + + if (inode->i_nlink >= ZUFS_LINK_MAX) + return -EMLINK; + + ihold(inode); + + err = zuf_add_dentry(dir, &dentry->d_name, inode, false); + if (unlikely(err)) { + iput(inode); + return err; + } + + inode->i_ctime = current_time(dir); + + set_nlink(inode, le16_to_cpu(zus_zi(inode)->i_nlink)); + + d_instantiate(dentry, inode); + + return 0; +} + +static int zuf_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + int err; + + zuf_dbg_vfs("[%ld] dentry-ino=%ld dentry-name=%s dentry-parent=%ld\n", + dir->i_ino, inode->i_ino, dentry->d_name.name, + d_parent(dentry)->i_ino); + + err = zuf_remove_dentry(dir, &dentry->d_name); + if (unlikely(err)) + return err; + + inode->i_ctime = dir->i_ctime; + + set_nlink(inode, le16_to_cpu(ZUII(inode)->zi->i_nlink)); + + return 0; +} + +static int zuf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct inode *inode; + + zuf_dbg_vfs("[%ld] dentry-name=%s dentry-parent=%ld mode=0x%x\n", + dir->i_ino, dentry->d_name.name, d_parent(dentry)->i_ino, + mode); + + if (dir->i_nlink >= ZUFS_LINK_MAX) + return -EMLINK; + + inode = zuf_new_inode(dir, S_IFDIR | mode, &dentry->d_name, NULL, 0, + false); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &zuf_dir_inode_operations; + inode->i_fop = &zuf_dir_operations; + inode->i_mapping->a_ops = &zuf_aops; + + set_nlink(dir, le16_to_cpu(ZUII(inode)->zi->i_nlink)); + + _instantiate_unlock(dentry, inode); + + return 0; +} + +static bool _empty_dir(struct inode *dir) +{ + if (dir->i_nlink != 2) { + zuf_warn("[%ld] directory has nlink(%d) != 2\n", + dir->i_ino, dir->i_nlink); + return false; + } + /* NOTE: Above is not the only -ENOTEMPTY the zus-fs will need to check + * for the "only-files" no subdirs case. And return -ENOTEMPTY below + */ + return true; +} + +static int zuf_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + int err; + + zuf_dbg_vfs("[%ld] dentry-ino=%ld dentry-name=%s dentry-parent=%ld\n", + dir->i_ino, inode->i_ino, dentry->d_name.name, + d_parent(dentry)->i_ino); + + if (!inode) + return -ENOENT; + + if (!_empty_dir(inode)) + return -ENOTEMPTY; + + err = zuf_remove_dentry(dir, &dentry->d_name); + if (unlikely(err)) + return err; + + inode->i_ctime = dir->i_ctime; + + set_nlink(inode, le16_to_cpu(zus_zi(inode)->i_nlink)); + set_nlink(dir, le16_to_cpu(zus_zi(dir)->i_nlink)); + + return 0; +} + +/* Structure of a directory element; */ +struct zuf_dir_element { + __le64 ino; + char name[254]; +}; + +static int _rename_exchange(struct inode *old_inode, struct inode *new_inode, + struct inode *old_dir, struct inode *new_dir) +{ + /* A subdir holds a ref on parent, see if we need to exchange refs */ + if ((S_ISDIR(old_inode->i_mode) != S_ISDIR(new_inode->i_mode)) && + (old_dir != new_dir)) { + if (S_ISDIR(old_inode->i_mode)) { + if (ZUFS_LINK_MAX <= new_dir->i_nlink) + return -EMLINK; + } else { + if (ZUFS_LINK_MAX <= old_dir->i_nlink) + return -EMLINK; + } + } + + set_nlink(old_dir, le16_to_cpu(zus_zi(old_dir)->i_nlink)); + set_nlink(new_dir, le16_to_cpu(zus_zi(new_dir)->i_nlink)); + + /* Update Directory times */ + mt_to_timespec(&old_dir->i_mtime, &zus_zi(old_dir)->i_mtime); + mt_to_timespec(&old_dir->i_ctime, &zus_zi(old_dir)->i_ctime); + if (old_dir != new_dir) { + mt_to_timespec(&new_dir->i_mtime, &zus_zi(new_dir)->i_mtime); + mt_to_timespec(&new_dir->i_ctime, &zus_zi(new_dir)->i_ctime); + } + return 0; +} + +static int zuf_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + uint flags) +{ + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); + struct zuf_sb_info *sbi = SBI(old_inode->i_sb); + struct zufs_ioc_rename ioc_rename = { + .hdr.in_len = sizeof(ioc_rename), + .hdr.out_len = sizeof(ioc_rename), + .hdr.operation = ZUS_OP_RENAME, + .old_dir_ii = ZUII(old_dir)->zus_ii, + .new_dir_ii = ZUII(new_dir)->zus_ii, + .old_zus_ii = old_inode ? ZUII(old_inode)->zus_ii : NULL, + .new_zus_ii = new_inode ? ZUII(new_inode)->zus_ii : NULL, + .old_d_str.len = old_dentry->d_name.len, + .new_d_str.len = new_dentry->d_name.len, + }; + struct timespec time = current_time(old_dir); + int err; + + zuf_dbg_vfs("old_inode=%ld new_inode=%ld old_name=%s new_name=%s f=0x%x\n", + old_inode ? old_inode->i_ino : 0, + new_inode ? new_inode->i_ino : 0, old_dentry->d_name.name, + new_dentry->d_name.name, flags); + + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE /*| RENAME_WHITEOUT*/)) + return -EINVAL; + + if (!(flags & RENAME_EXCHANGE) && S_ISDIR(old_inode->i_mode)) { + if (new_inode) { + if (!_empty_dir(new_inode)) + return -ENOTEMPTY; + } else if (ZUFS_LINK_MAX <= new_dir->i_nlink) { + return -EMLINK; + } + } + + memcpy(&ioc_rename.old_d_str.name, old_dentry->d_name.name, + old_dentry->d_name.len); + memcpy(&ioc_rename.new_d_str.name, new_dentry->d_name.name, + new_dentry->d_name.len); + timespec_to_mt(&ioc_rename.time, &time); + + err = zufs_dispatch(ZUF_ROOT(sbi), &ioc_rename.hdr, NULL, 0); + if (unlikely(err)) { + zuf_err("zufs_dispatch failed => %d\n", err); + return err; + } + + if (flags & RENAME_EXCHANGE) + return _rename_exchange(old_inode, new_inode, old_dir, new_dir); + + mt_to_timespec(&new_dir->i_mtime, &zus_zi(new_dir)->i_mtime); + mt_to_timespec(&new_dir->i_ctime, &zus_zi(new_dir)->i_ctime); + + if (new_inode) { + struct zus_inode *new_zi = zus_zi(new_inode); + + set_nlink(new_inode, le16_to_cpu(new_zi->i_nlink)); + mt_to_timespec(&new_inode->i_ctime, &new_zi->i_ctime); + } else { + struct zus_inode *old_zi = zus_zi(old_inode); + + mt_to_timespec(&old_inode->i_ctime, &old_zi->i_ctime); + } + + if (S_ISDIR(old_inode->i_mode)) { + set_nlink(old_dir, le16_to_cpu(zus_zi(old_dir)->i_nlink)); + if (!new_inode) + set_nlink(new_dir, + le16_to_cpu(zus_zi(new_dir)->i_nlink)); + } + + return 0; +} + +const struct inode_operations zuf_dir_inode_operations = { + .create = zuf_create, + .lookup = zuf_lookup, + .link = zuf_link, + .unlink = zuf_unlink, + .symlink = zuf_symlink, + .mkdir = zuf_mkdir, + .rmdir = zuf_rmdir, + .mknod = zuf_mknod, + .tmpfile = zuf_tmpfile, + .rename = zuf_rename, + .setattr = zuf_setattr, + .getattr = zuf_getattr, + .update_time = zuf_update_time, +}; + +const struct inode_operations zuf_special_inode_operations = { + .setattr = zuf_setattr, + .getattr = zuf_getattr, + .update_time = zuf_update_time, +}; diff --git a/fs/zuf/symlink.c b/fs/zuf/symlink.c new file mode 100644 index 0000000..8188225 --- /dev/null +++ b/fs/zuf/symlink.c @@ -0,0 +1,76 @@ +/* + * BRIEF DESCRIPTION + * + * Symlink operations + * + * Copyright (c) 2018 NetApp Inc. All rights reserved. + * + * ZUFS-License: GPL-2.0 OR BSD-3-Clause. See module.c for LICENSE details. + * + * Authors: + * Boaz Harrosh + * Sagi Manole " + */ + +#include "zuf.h" + +/* Can never fail all checks already made before. + * Returns: The number of pages stored @pages + */ +uint zuf_prepare_symname(struct zufs_ioc_new_inode *ioc_new_inode, + const char *symname, ulong len, + struct page *pages[2]) +{ + uint nump; + + ioc_new_inode->zi.i_size = cpu_to_le64(len); + if (len < sizeof(ioc_new_inode->zi.i_symlink)) { + memcpy(&ioc_new_inode->zi.i_symlink, symname, len); + return 0; + } + + pages[0] = virt_to_page(symname); + nump = 1; + + ioc_new_inode->hdr.len = len; + ioc_new_inode->hdr.offset = (ulong)symname & (PAGE_SIZE - 1); + + if (PAGE_SIZE < ioc_new_inode->hdr.offset + len) { + pages[1] = virt_to_page(symname + PAGE_SIZE); + ++nump; + } + + return nump; +} + +static const char *zuf_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *notused) +{ + struct zuf_inode_info *zii = ZUII(inode); + struct zufs_ioc_get_link ioc_get_link = { + .hdr.in_len = sizeof(ioc_get_link), + .hdr.out_len = sizeof(ioc_get_link), + .hdr.operation = ZUS_OP_GET_SYMLINK, + .zus_ii = zii->zus_ii, + }; + int err; + + if (inode->i_size < sizeof(zii->zi->i_symlink)) + return zii->zi->i_symlink; + + err = zufs_dispatch(ZUF_ROOT(SBI(inode->i_sb)), &ioc_get_link.hdr, + NULL, 0); + if (unlikely(err)) { + zuf_err("zufs_dispatch failed => %d\n", err); + return ERR_PTR(err); + } + + return md_addr(SBI(inode->i_sb)->md, ioc_get_link._link); +} + +const struct inode_operations zuf_symlink_inode_operations = { + .get_link = zuf_get_link, + .update_time = zuf_update_time, + .setattr = zuf_setattr, + .getattr = zuf_getattr, +}; diff --git a/fs/zuf/zus_api.h b/fs/zuf/zus_api.h index d461782..5870d63 100644 --- a/fs/zuf/zus_api.h +++ b/fs/zuf/zus_api.h @@ -20,6 +20,10 @@ #include #include +/* TODO: Someone forgot i_version for STATX_ attrs should send a patch to add it + */ +#define ZUFS_STATX_VERSION 0x40000000U + /* * Version rules: * This is the zus-to-zuf API version. And not the Filesystem @@ -337,6 +341,28 @@ enum e_zufs_operation { ZUS_OP_NULL = 0, ZUS_OP_STATFS, + ZUS_OP_NEW_INODE, + ZUS_OP_FREE_INODE, + ZUS_OP_EVICT_INODE, + + ZUS_OP_LOOKUP, + ZUS_OP_ADD_DENTRY, + ZUS_OP_REMOVE_DENTRY, + ZUS_OP_RENAME, + ZUS_OP_READDIR, + ZUS_OP_CLONE, + ZUS_OP_COPY, + + ZUS_OP_READ, + ZUS_OP_WRITE, + ZUS_OP_GET_BLOCK, + ZUS_OP_GET_SYMLINK, + ZUS_OP_SETATTR, + ZUS_OP_UPDATE_TIME, + ZUS_OP_SYNC, + ZUS_OP_FALLOCATE, + ZUS_OP_LLSEEK, + ZUS_OP_BREAK, /* Kernel telling Server to exit */ ZUS_OP_MAX_OPT, }; @@ -351,4 +377,212 @@ struct zufs_ioc_statfs { struct statfs64 statfs_out; }; +/* zufs_ioc_new_inode flags: */ +enum zi_flags { + ZI_TMPFILE = 1, /* for new_inode */ + ZI_LOOKUP_RACE = 1, /* for evict */ +}; + +struct zufs_str { + __u8 len; + char name[ZUFS_NAME_LEN]; +}; + +/* ZUS_OP_NEW_INODE */ +struct zufs_ioc_new_inode { + struct zufs_ioc_hdr hdr; + /* IN */ + struct zus_inode zi; + struct zus_inode_info *dir_ii; /* If mktmp this is the root */ + struct zufs_str str; + __u64 flags; + + /* OUT */ + zu_dpp_t _zi; + struct zus_inode_info *zus_ii; +}; + +/* ZUS_OP_FREE_INODE, ZUS_OP_EVICT_INODE */ +struct zufs_ioc_evict_inode { + struct zufs_ioc_hdr hdr; + /* IN */ + struct zus_inode_info *zus_ii; + __u64 flags; +}; + +/* ZUS_OP_LOOKUP */ +struct zufs_ioc_lookup { + struct zufs_ioc_hdr hdr; + /* IN */ + struct zus_inode_info *dir_ii; + struct zufs_str str; + + /* OUT */ + zu_dpp_t _zi; + struct zus_inode_info *zus_ii; +}; + +/* ZUS_OP_ADD_DENTRY, ZUS_OP_REMOVE_DENTRY */ +struct zufs_ioc_dentry { + struct zufs_ioc_hdr hdr; + struct zus_inode_info *zus_ii; /* IN */ + struct zus_inode_info *zus_dir_ii; /* IN */ + struct zufs_str str; /* IN */ + __u64 ino; /* OUT - only for lookup */ +}; + +/* ZUS_OP_RENAME */ +struct zufs_ioc_rename { + struct zufs_ioc_hdr hdr; + /* IN */ + struct zus_inode_info *old_dir_ii; + struct zus_inode_info *new_dir_ii; + struct zus_inode_info *old_zus_ii; + struct zus_inode_info *new_zus_ii; + struct zufs_str old_d_str; + struct zufs_str new_d_str; + __le64 time; +}; + +/* ZUS_OP_READDIR */ +struct zufs_ioc_readdir { + struct zufs_ioc_hdr hdr; + /* IN */ + struct zus_inode_info *dir_ii; + loff_t pos; + + /* OUT */ + __u8 more; +}; + +struct zufs_dir_entry { + __le64 ino; + struct { + unsigned type : 8; + ulong pos : 56; + }; + struct zufs_str zstr; +}; + +struct zufs_readdir_iter { + void *__zde, *last; + struct zufs_ioc_readdir *ioc_readdir; +}; + +enum {E_ZDE_HDR_SIZE = + offsetof(struct zufs_dir_entry, zstr) + offsetof(struct zufs_str, name), +}; + +static inline void zufs_readdir_iter_init(struct zufs_readdir_iter *rdi, + struct zufs_ioc_readdir *ioc_readdir, + void *app_ptr) +{ + rdi->__zde = app_ptr; + rdi->last = app_ptr + ioc_readdir->hdr.len; + rdi->ioc_readdir = ioc_readdir; + ioc_readdir->more = false; +} + +static inline uint zufs_dir_entry_len(__u8 name_len) +{ + return ALIGN(E_ZDE_HDR_SIZE + name_len, sizeof(__u64)); +} + +static inline +struct zufs_dir_entry *zufs_next_zde(struct zufs_readdir_iter *rdi) +{ + struct zufs_dir_entry *zde = rdi->__zde; + uint len; + + if (rdi->last <= rdi->__zde + E_ZDE_HDR_SIZE) + return NULL; + if (zde->zstr.len == 0) + return NULL; + len = zufs_dir_entry_len(zde->zstr.len); + if (rdi->last <= rdi->__zde + len) + return NULL; + + rdi->__zde += len; + return zde; +} + +static inline bool zufs_zde_emit(struct zufs_readdir_iter *rdi, __u64 ino, + __u8 type, __u64 pos, const char *name, + __u8 len) +{ + struct zufs_dir_entry *zde = rdi->__zde; + + if (rdi->last <= rdi->__zde + zufs_dir_entry_len(len)) { + rdi->ioc_readdir->more = true; + return false; + } + + rdi->ioc_readdir->more = 0; + zde->ino = ino; + zde->type = type; + /*ASSERT(0 == (pos && (1 << 56 - 1)));*/ + zde->pos = pos; + strncpy(zde->zstr.name, name, len); + zde->zstr.len = len; + zufs_next_zde(rdi); + + return true; +} + +/* ZUS_OP_GET_SYMLINK */ +struct zufs_ioc_get_link { + struct zufs_ioc_hdr hdr; + /* IN */ + struct zus_inode_info *zus_ii; + + /* OUT */ + zu_dpp_t _link; +}; + +/* ZUS_OP_SETATTR */ +struct zufs_ioc_attr { + struct zufs_ioc_hdr hdr; + /* IN */ + struct zus_inode_info *zus_ii; + __u64 truncate_size; + __u32 zuf_attr; + __u32 pad; +}; + +/* ZUS_OP_ISYNC, ZUS_OP_FALLOCATE */ +struct zufs_ioc_range { + struct zufs_ioc_hdr hdr; + /* IN */ + struct zus_inode_info *zus_ii; + __u64 offset, length; + __u32 opflags; + __u32 pad; + + /* OUT */ + __u64 write_unmapped; +}; + +/* ZUS_OP_CLONE */ +struct zufs_ioc_clone { + struct zufs_ioc_hdr hdr; + /* IN */ + struct zus_inode_info *src_zus_ii; + struct zus_inode_info *dst_zus_ii; + __u64 pos_in, pos_out; + __u64 len; +}; + +/* ZUS_OP_LLSEEK */ +struct zufs_ioc_seek { + struct zufs_ioc_hdr hdr; + /* IN */ + struct zus_inode_info *zus_ii; + __u64 offset_in; + __u32 whence; + __u32 pad; + + /* OUT */ + __u64 offset_out; +}; + #endif /* _LINUX_ZUFS_API_H */ -- 2.5.5