linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: yulei.kernel@gmail.com
To: akpm@linux-foundation.org, naoya.horiguchi@nec.com,
	viro@zeniv.linux.org.uk, pbonzini@redhat.com
Cc: linux-fsdevel@vger.kernel.org, kvm@vger.kernel.org,
	linux-kernel@vger.kernel.org, xiaoguangrong.eric@gmail.com,
	kernellwp@gmail.com, lihaiwei.kernel@gmail.com,
	Yulei Zhang <yuleixzhang@tencent.com>,
	Xiao Guangrong <gloryxiao@tencent.com>
Subject: [PATCH 01/35] fs: introduce dmemfs module
Date: Thu,  8 Oct 2020 15:53:51 +0800	[thread overview]
Message-ID: <aa553faf9e97ee9306ecd5a67d3324a34f9ed4be.1602093760.git.yuleixzhang@tencent.com> (raw)
In-Reply-To: <cover.1602093760.git.yuleixzhang@tencent.com>
In-Reply-To: <cover.1602093760.git.yuleixzhang@tencent.com>

From: Yulei Zhang <yuleixzhang@tencent.com>

dmemfs (Direct Memory filesystem) is a filesystem based on device
memory or reserved memory. This kind of memory is special in that
it is not managed by the kernel and has no 'struct page'.

The original purpose of dmemfs is to drop the usage of
'struct page' to save extra system memory.

This patch introduces the basic framework of dmemfs; only mkdir
and the creation of regular files are supported.

Signed-off-by: Xiao Guangrong  <gloryxiao@tencent.com>
Signed-off-by: Yulei Zhang <yuleixzhang@tencent.com>
---
 fs/Kconfig                 |   1 +
 fs/Makefile                |   1 +
 fs/dmemfs/Kconfig          |  13 ++
 fs/dmemfs/Makefile         |   7 +
 fs/dmemfs/inode.c          | 275 +++++++++++++++++++++++++++++++++++++
 include/uapi/linux/magic.h |   1 +
 6 files changed, 298 insertions(+)
 create mode 100644 fs/dmemfs/Kconfig
 create mode 100644 fs/dmemfs/Makefile
 create mode 100644 fs/dmemfs/inode.c

diff --git a/fs/Kconfig b/fs/Kconfig
index aa4c12282301..18e72089426f 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -41,6 +41,7 @@ source "fs/btrfs/Kconfig"
 source "fs/nilfs2/Kconfig"
 source "fs/f2fs/Kconfig"
 source "fs/zonefs/Kconfig"
+source "fs/dmemfs/Kconfig"
 
 config FS_DAX
 	bool "Direct Access (DAX) support"
diff --git a/fs/Makefile b/fs/Makefile
index 1c7b0e3f6daa..10e0302c5902 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -136,3 +136,4 @@ obj-$(CONFIG_EFIVAR_FS)		+= efivarfs/
 obj-$(CONFIG_EROFS_FS)		+= erofs/
 obj-$(CONFIG_VBOXSF_FS)		+= vboxsf/
 obj-$(CONFIG_ZONEFS_FS)		+= zonefs/
+obj-$(CONFIG_DMEM_FS)		+= dmemfs/
diff --git a/fs/dmemfs/Kconfig b/fs/dmemfs/Kconfig
new file mode 100644
index 000000000000..d2894a513de0
--- /dev/null
+++ b/fs/dmemfs/Kconfig
@@ -0,0 +1,13 @@
+config DMEM_FS
+	tristate "Direct Memory filesystem support"
+	help
+	  dmemfs (Direct Memory filesystem) is a filesystem based on device
+	  memory or reserved memory. This kind of memory is special in that
+	  it is not managed by the kernel and has no 'struct page'.
+
+	  The original purpose of dmemfs is to avoid the memory consumed
+	  by 'struct page', which reduces the total cost of ownership
+	  (TCO) for cloud providers.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called dmemfs.
diff --git a/fs/dmemfs/Makefile b/fs/dmemfs/Makefile
new file mode 100644
index 000000000000..73bdc9cbc87e
--- /dev/null
+++ b/fs/dmemfs/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the linux dmem-filesystem routines.
+#
+obj-$(CONFIG_DMEM_FS) += dmemfs.o
+
+dmemfs-y += inode.o
diff --git a/fs/dmemfs/inode.c b/fs/dmemfs/inode.c
new file mode 100644
index 000000000000..6a8a2d9f94e9
--- /dev/null
+++ b/fs/dmemfs/inode.c
@@ -0,0 +1,275 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *  linux/fs/dmemfs/inode.c
+ *
+ * Authors:
+ *   Xiao Guangrong  <gloryxiao@tencent.com>
+ *   Chen Zhuo	     <sagazchen@tencent.com>
+ *   Haiwei Li	     <gerryhwli@tencent.com>
+ *   Yulei Zhang     <yuleixzhang@tencent.com>
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/file.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/capability.h>
+#include <linux/magic.h>
+#include <linux/mman.h>
+#include <linux/statfs.h>
+#include <linux/pagemap.h>
+#include <linux/parser.h>
+#include <linux/pfn_t.h>
+#include <linux/pagevec.h>
+#include <linux/fs_parser.h>
+#include <linux/seq_file.h>
+
+MODULE_AUTHOR("Tencent Corporation");
+MODULE_LICENSE("GPL v2");	/* goes with the GPL-2.0-only SPDX tag at the top of this file */
+
+struct dmemfs_mount_opts {	/* options parsed from the mount parameters */
+	unsigned long dpage_size;	/* "pagesize" value in bytes; must pass check_dpage_size() */
+};
+
+struct dmemfs_fs_info {	/* per-mount private data, reached through s_fs_info */
+	struct dmemfs_mount_opts mount_opts;
+};
+
+enum dmemfs_param {	/* option ids referenced by dmemfs_fs_parameters[] */
+	Opt_dpagesize,	/* the "pagesize" mount option */
+};
+
+/*
+ * Mount parameters understood by dmemfs.  "pagesize" is accepted as a
+ * string and converted with memparse() in dmemfs_parse_param().
+ *
+ * The table is only referenced from this file, so it has internal linkage.
+ */
+static const struct fs_parameter_spec dmemfs_fs_parameters[] = {
+	fsparam_string("pagesize", Opt_dpagesize),
+	{}
+};
+
+/*
+ * Validate a dmemfs page size.
+ *
+ * Only PAGE_SIZE, PMD_SIZE and PUD_SIZE are accepted.
+ *
+ * Returns 0 if @dpage_size is one of those values, -EINVAL otherwise.
+ */
+static int check_dpage_size(unsigned long dpage_size)
+{
+	if (dpage_size == PAGE_SIZE || dpage_size == PMD_SIZE ||
+	    dpage_size == PUD_SIZE)
+		return 0;
+
+	return -EINVAL;
+}
+
+static struct inode *
+dmemfs_get_inode(struct super_block *sb, const struct inode *dir, umode_t mode,
+		 dev_t dev);	/* forward declaration: dmemfs_mknod() calls it before the definition */
+
+/*
+ * Create a new inode of type @mode and attach it to @dentry in @dir.
+ *
+ * On success the dentry is instantiated with the new inode, an extra
+ * reference is taken on it, and the directory's mtime/ctime are updated.
+ *
+ * Returns 0 on success, -ENOSPC if no inode could be obtained.
+ */
+static int
+dmemfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+{
+	struct inode *inode;
+
+	inode = dmemfs_get_inode(dir->i_sb, dir, mode, dev);
+	if (!inode)
+		return -ENOSPC;
+
+	d_instantiate(dentry, inode);
+	/* Extra count - pin the dentry in core */
+	dget(dentry);
+	dir->i_mtime = dir->i_ctime = current_time(inode);
+
+	return 0;
+}
+
+/*
+ * ->create: make a regular file named by @dentry in @dir.
+ *
+ * @excl is not consulted.  Returns whatever dmemfs_mknod() returns.
+ */
+static int dmemfs_create(struct inode *dir, struct dentry *dentry,
+			 umode_t mode, bool excl)
+{
+	umode_t file_mode = mode | S_IFREG;
+
+	return dmemfs_mknod(dir, dentry, file_mode, 0);
+}
+
+/*
+ * ->mkdir: make a directory named by @dentry in @dir.
+ *
+ * Returns 0 on success or the error reported by dmemfs_mknod().
+ */
+static int dmemfs_mkdir(struct inode *dir, struct dentry *dentry,
+			umode_t mode)
+{
+	int err;
+
+	err = dmemfs_mknod(dir, dentry, mode | S_IFDIR, 0);
+	if (err)
+		return err;
+
+	/* account for the new subdirectory in the parent's link count */
+	inc_nlink(dir);
+	return 0;
+}
+
+static const struct inode_operations dmemfs_dir_inode_operations = {	/* i_op for directories, set in dmemfs_get_inode() */
+	.create		= dmemfs_create,	/* regular files only */
+	.lookup		= simple_lookup,
+	.unlink		= simple_unlink,
+	.mkdir		= dmemfs_mkdir,
+	.rmdir		= simple_rmdir,
+	.rename		= simple_rename,
+};
+
+static const struct inode_operations dmemfs_file_inode_operations = {	/* i_op for regular files, set in dmemfs_get_inode() */
+	.setattr = simple_setattr,
+	.getattr = simple_getattr,
+};
+
+/*
+ * ->mmap handler for dmemfs regular files.
+ *
+ * Placeholder: reports success without touching @file or @vma.
+ *
+ * Only referenced from dmemfs_file_operations in this file, so it is
+ * given internal linkage.
+ */
+static int dmemfs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	return 0;
+}
+
+static const struct file_operations dmemfs_file_operations = {	/* i_fop for regular files, set in dmemfs_get_inode() */
+	.mmap = dmemfs_file_mmap,	/* the only file operation provided */
+};
+
+/*
+ * ->parse_param: handle a single mount parameter.
+ *
+ * "pagesize" is converted with memparse() and must be accepted by
+ * check_dpage_size().  The value is stored in the per-mount info only
+ * after it has been validated, so a rejected option leaves the previous
+ * setting untouched.
+ *
+ * Returns 0 on success, the negative value from fs_parse() if the
+ * parameter could not be parsed, or -EINVAL for an unsupported page size
+ * or an unexpected option id.
+ */
+static int dmemfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+	struct dmemfs_fs_info *fsi = fc->s_fs_info;
+	struct fs_parse_result result;
+	unsigned long dpage_size;
+	int opt, ret;
+
+	opt = fs_parse(fc, dmemfs_fs_parameters, param, &result);
+	if (opt < 0)
+		return opt;
+
+	switch (opt) {
+	case Opt_dpagesize:
+		dpage_size = memparse(param->string, NULL);
+		ret = check_dpage_size(dpage_size);
+		if (ret) {
+			/*
+			 * The option is declared with fsparam_string(), so
+			 * report the string the user supplied rather than
+			 * a numeric member of the parse result.
+			 */
+			pr_warn("dmemfs: unknown pagesize %s.\n",
+				param->string);
+			return ret;
+		}
+		fsi->mount_opts.dpage_size = dpage_size;
+		break;
+	default:
+		pr_warn("dmemfs: unknown mount option [%x].\n",
+			opt);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Allocate and initialise a new inode on @sb.
+ *
+ * Ownership and mode are set up with inode_init_owner() from @dir and
+ * @mode, the mapping gets empty_aops and is marked unevictable, and all
+ * three timestamps are set to the current time.  The inode/file operations
+ * are then chosen by file type; types other than regular file, directory
+ * and symlink go through init_special_inode() with @dev.
+ *
+ * Returns the new inode, or NULL if new_inode() failed.
+ */
+struct inode *dmemfs_get_inode(struct super_block *sb,
+			       const struct inode *dir, umode_t mode, dev_t dev)
+{
+	struct address_space *mapping;
+	struct inode *inode;
+
+	inode = new_inode(sb);
+	if (!inode)
+		return NULL;
+
+	mapping = inode->i_mapping;
+
+	inode->i_ino = get_next_ino();
+	inode_init_owner(inode, dir, mode);
+	mapping->a_ops = &empty_aops;
+	mapping_set_gfp_mask(mapping, GFP_HIGHUSER);
+	mapping_set_unevictable(mapping);
+	inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+
+	switch (mode & S_IFMT) {
+	case S_IFREG:
+		inode->i_op = &dmemfs_file_inode_operations;
+		inode->i_fop = &dmemfs_file_operations;
+		break;
+	case S_IFDIR:
+		inode->i_op = &dmemfs_dir_inode_operations;
+		inode->i_fop = &simple_dir_operations;
+
+		/*
+		 * directory inodes start off with i_nlink == 2
+		 * (for "." entry)
+		 */
+		inc_nlink(inode);
+		break;
+	case S_IFLNK:
+		inode->i_op = &page_symlink_inode_operations;
+		break;
+	default:
+		init_special_inode(inode, mode, dev);
+		break;
+	}
+
+	return inode;
+}
+
+/*
+ * ->statfs: fill @buf via simple_statfs(), then report the superblock's
+ * block size as f_bsize.  Always returns 0.
+ */
+static int dmemfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+
+	/* the return value of simple_statfs() is not checked */
+	simple_statfs(dentry, buf);
+	buf->f_bsize = sb->s_blocksize;
+
+	return 0;
+}
+
+static const struct super_operations dmemfs_ops = {	/* installed as sb->s_op in dmemfs_fill_super() */
+	.statfs	= dmemfs_statfs,
+	.drop_inode = generic_delete_inode,
+};
+
+/*
+ * Fill in a new dmemfs superblock.
+ *
+ * The block size and block size bits come from the page size stored in
+ * the per-mount info at sb->s_fs_info.  The root directory inode is
+ * created with mode S_IFDIR and no parent.  @fc is not used here.
+ *
+ * Returns 0 on success, -ENOMEM if no root dentry could be set up.
+ */
+static int
+dmemfs_fill_super(struct super_block *sb, struct fs_context *fc)
+{
+	struct dmemfs_fs_info *fsi = sb->s_fs_info;
+	unsigned long dpage_size = fsi->mount_opts.dpage_size;
+	struct inode *root_inode;
+
+	sb->s_magic = DMEMFS_MAGIC;
+	sb->s_op = &dmemfs_ops;
+	sb->s_time_gran = 1;
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	sb->s_blocksize = dpage_size;
+	sb->s_blocksize_bits = ilog2(dpage_size);
+
+	/* the inode is handed to d_make_root() without a NULL check here */
+	root_inode = dmemfs_get_inode(sb, NULL, S_IFDIR, 0);
+	sb->s_root = d_make_root(root_inode);
+
+	return sb->s_root ? 0 : -ENOMEM;
+}
+
+static int dmemfs_get_tree(struct fs_context *fc)
+{	/* ->get_tree: delegate to get_tree_nodev() with dmemfs_fill_super() as the fill callback */
+	return get_tree_nodev(fc, dmemfs_fill_super);
+}
+
+static void dmemfs_free_fc(struct fs_context *fc)
+{	/* ->free: release whatever per-mount info is still attached to the context */
+	kfree(fc->s_fs_info);
+}
+
+static const struct fs_context_operations dmemfs_context_ops = {	/* installed as fc->ops in dmemfs_init_fs_context() */
+	.free		= dmemfs_free_fc,
+	.parse_param	= dmemfs_parse_param,
+	.get_tree	= dmemfs_get_tree,
+};
+
+/*
+ * ->init_fs_context: allocate the per-mount info and attach it to @fc.
+ *
+ * The page size defaults to PAGE_SIZE until a "pagesize" parameter
+ * overrides it in dmemfs_parse_param().
+ *
+ * Only referenced from dmemfs_fs_type in this file, so it is given
+ * internal linkage.
+ *
+ * Returns 0 on success, -ENOMEM if the allocation fails.
+ */
+static int dmemfs_init_fs_context(struct fs_context *fc)
+{
+	struct dmemfs_fs_info *fsi;
+
+	fsi = kzalloc(sizeof(*fsi), GFP_KERNEL);
+	if (!fsi)
+		return -ENOMEM;
+
+	fsi->mount_opts.dpage_size = PAGE_SIZE;
+	fc->s_fs_info = fsi;
+	fc->ops = &dmemfs_context_ops;
+	return 0;
+}
+
+/*
+ * ->kill_sb: tear down a dmemfs superblock.
+ *
+ * The dmemfs_fs_info allocated in dmemfs_init_fs_context() ends up in
+ * sb->s_fs_info (dmemfs_fill_super() reads it from there), and nothing
+ * else in this file releases it, so it has to be freed here or it leaks
+ * on every unmount.  The pointer is fetched first and freed only after
+ * kill_litter_super() has finished with the superblock.
+ */
+static void dmemfs_kill_sb(struct super_block *sb)
+{
+	struct dmemfs_fs_info *fsi = sb->s_fs_info;
+
+	kill_litter_super(sb);
+	kfree(fsi);
+}
+
+static struct file_system_type dmemfs_fs_type = {	/* registered in dmemfs_init(), unregistered in dmemfs_uninit() */
+	.owner		= THIS_MODULE,
+	.name		= "dmemfs",	/* filesystem type name */
+	.init_fs_context = dmemfs_init_fs_context,
+	.kill_sb	= dmemfs_kill_sb,
+};
+
+/*
+ * Module init: register the dmemfs filesystem type.
+ *
+ * Returns the result of register_filesystem().
+ */
+static int __init dmemfs_init(void)
+{
+	return register_filesystem(&dmemfs_fs_type);
+}
+
+static void __exit dmemfs_uninit(void)
+{	/* module exit: undo the registration done in dmemfs_init() */
+	unregister_filesystem(&dmemfs_fs_type);
+}
+
+module_init(dmemfs_init)	/* run dmemfs_init() when the module is initialised */
+module_exit(dmemfs_uninit)	/* run dmemfs_uninit() when the module is removed */
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index f3956fc11de6..3fbd06661c8c 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -97,5 +97,6 @@
 #define DEVMEM_MAGIC		0x454d444d	/* "DMEM" */
 #define Z3FOLD_MAGIC		0x33
 #define PPC_CMM_MAGIC		0xc7571590
+#define DMEMFS_MAGIC		0x2ace90c6
 
 #endif /* __LINUX_MAGIC_H__ */
-- 
2.28.0


  reply	other threads:[~2020-10-08  7:53 UTC|newest]

Thread overview: 61+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-10-08  7:53 [PATCH 00/35] Enhance memory utilization with DMEMFS yulei.kernel
2020-10-08  7:53 ` yulei.kernel [this message]
2020-11-10 20:04   ` [PATCH 01/35] fs: introduce dmemfs module Al Viro
2020-11-11  8:53     ` yulei zhang
2020-11-11 23:09       ` Al Viro
2020-11-12 10:03         ` yulei zhang
2020-10-08  7:53 ` [PATCH 02/35] mm: support direct memory reservation yulei.kernel
2020-10-08 20:27   ` Randy Dunlap
2020-10-08 20:34   ` Randy Dunlap
2020-10-08  7:53 ` [PATCH 03/35] dmem: implement dmem memory management yulei.kernel
2020-10-08  7:53 ` [PATCH 04/35] dmem: let pat recognize dmem yulei.kernel
2020-10-13  7:27   ` Paolo Bonzini
2020-10-13  9:53     ` yulei zhang
2020-10-08  7:53 ` [PATCH 05/35] dmemfs: support mmap yulei.kernel
2020-10-08  7:53 ` [PATCH 06/35] dmemfs: support truncating inode down yulei.kernel
2020-10-08  7:53 ` [PATCH 07/35] dmem: trace core functions yulei.kernel
2020-10-08  7:53 ` [PATCH 08/35] dmem: show some statistic in debugfs yulei.kernel
2020-10-08 20:23   ` Randy Dunlap
2020-10-09 11:49     ` yulei zhang
2020-10-08  7:53 ` [PATCH 09/35] dmemfs: support remote access yulei.kernel
2020-10-08  7:54 ` [PATCH 10/35] dmemfs: introduce max_alloc_try_dpages parameter yulei.kernel
2020-10-08  7:54 ` [PATCH 11/35] mm: export mempolicy interfaces to serve dmem allocator yulei.kernel
2020-10-08  7:54 ` [PATCH 12/35] dmem: introduce mempolicy support yulei.kernel
2020-10-08  7:54 ` [PATCH 13/35] mm, dmem: introduce PFN_DMEM and pfn_t_dmem yulei.kernel
2020-10-08  7:54 ` [PATCH 14/35] mm, dmem: dmem-pmd vs thp-pmd yulei.kernel
2020-10-08  7:54 ` [PATCH 15/35] mm: add pmd_special() check for pmd_trans_huge_lock() yulei.kernel
2020-10-08  7:54 ` [PATCH 16/35] dmemfs: introduce ->split() to dmemfs_vm_ops yulei.kernel
2020-10-08  7:54 ` [PATCH 17/35] mm, dmemfs: support unmap_page_range() for dmemfs pmd yulei.kernel
2020-10-08  7:54 ` [PATCH 18/35] mm: follow_pmd_mask() for dmem huge pmd yulei.kernel
2020-10-08  7:54 ` [PATCH 19/35] mm: gup_huge_pmd() " yulei.kernel
2020-10-08  7:54 ` [PATCH 20/35] mm: support dmem huge pmd for vmf_insert_pfn_pmd() yulei.kernel
2020-10-08  7:54 ` [PATCH 21/35] mm: support dmem huge pmd for follow_pfn() yulei.kernel
2020-10-08  7:54 ` [PATCH 22/35] kvm, x86: Distinguish dmemfs page from mmio page yulei.kernel
2020-10-09  0:58   ` Sean Christopherson
2020-10-09 10:28     ` Joao Martins
2020-10-09 11:42       ` yulei zhang
2020-10-08  7:54 ` [PATCH 23/35] kvm, x86: introduce VM_DMEM yulei.kernel
2020-10-08  7:54 ` [PATCH 24/35] dmemfs: support hugepage for dmemfs yulei.kernel
2020-10-08  7:54 ` [PATCH 25/35] mm, x86, dmem: fix estimation of reserved page for vaddr_get_pfn() yulei.kernel
2020-10-08  7:54 ` [PATCH 26/35] mm, dmem: introduce pud_special() yulei.kernel
2020-10-08  7:54 ` [PATCH 27/35] mm: add pud_special() to support dmem huge pud yulei.kernel
2020-10-08  7:54 ` [PATCH 28/35] mm, dmemfs: support huge_fault() for dmemfs yulei.kernel
2020-10-08  7:54 ` [PATCH 29/35] mm: add follow_pte_pud() yulei.kernel
2020-10-08  7:54 ` [PATCH 30/35] dmem: introduce dmem_bitmap_alloc() and dmem_bitmap_free() yulei.kernel
2020-10-08  7:54 ` [PATCH 31/35] dmem: introduce mce handler yulei.kernel
2020-10-08  7:54 ` [PATCH 32/35] mm, dmemfs: register and handle the dmem mce yulei.kernel
2020-10-08  7:54 ` [PATCH 33/35] kvm, x86: temporary disable record_steal_time for dmem yulei.kernel
2020-10-08  7:54 ` [PATCH 34/35] dmem: add dmem unit tests yulei.kernel
2020-10-08  7:54 ` [PATCH 35/35] Add documentation for dmemfs yulei.kernel
2020-10-09  1:26   ` Randy Dunlap
2020-10-08 19:01 ` [PATCH 00/35] Enhance memory utilization with DMEMFS Joao Martins
2020-10-09 11:39   ` yulei zhang
2020-10-09 11:53     ` Joao Martins
2020-10-10  8:15       ` yulei zhang
2020-10-12 10:59         ` Joao Martins
2020-10-14 22:25           ` Dan Williams
2020-10-19 13:37             ` Paolo Bonzini
2020-10-19 19:03               ` Joao Martins
2020-10-20 15:22                 ` yulei zhang
2020-10-12 11:57 ` Zengtao (B)
2020-10-13  2:45   ` yulei zhang

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aa553faf9e97ee9306ecd5a67d3324a34f9ed4be.1602093760.git.yuleixzhang@tencent.com \
    --to=yulei.kernel@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=gloryxiao@tencent.com \
    --cc=kernellwp@gmail.com \
    --cc=kvm@vger.kernel.org \
    --cc=lihaiwei.kernel@gmail.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=naoya.horiguchi@nec.com \
    --cc=pbonzini@redhat.com \
    --cc=viro@zeniv.linux.org.uk \
    --cc=xiaoguangrong.eric@gmail.com \
    --cc=yuleixzhang@tencent.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).