All of lore.kernel.org
 help / color / mirror / Atom feed
From: Khalid Aziz <khalid.aziz@oracle.com>
To: akpm@linux-foundation.org, willy@infradead.org,
	longpeng2@huawei.com, arnd@arndb.de, dave.hansen@linux.intel.com,
	david@redhat.com, rppt@kernel.org, surenb@google.com,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org
Cc: Khalid Aziz <khalid.aziz@oracle.com>
Subject: [RFC PATCH 2/6] mm: Add msharefs filesystem
Date: Tue, 18 Jan 2022 14:19:14 -0700	[thread overview]
Message-ID: <32784ee26d895bae2484e15fef205d5590720c4b.1642526745.git.khalid.aziz@oracle.com> (raw)
In-Reply-To: <cover.1642526745.git.khalid.aziz@oracle.com>

Add a ram-based filesystem that contains the files created for each
shared address range. This patch adds just the filesystem and creation
of files. Page table entries for these shared ranges created by mshare
syscall are still not shared.

Signed-off-by: Khalid Aziz <khalid.aziz@oracle.com>
---
 Documentation/filesystems/msharefs.rst |  19 +++
 include/uapi/linux/magic.h             |   1 +
 mm/mshare.c                            | 191 +++++++++++++++++++++++--
 3 files changed, 197 insertions(+), 14 deletions(-)
 create mode 100644 Documentation/filesystems/msharefs.rst

diff --git a/Documentation/filesystems/msharefs.rst b/Documentation/filesystems/msharefs.rst
new file mode 100644
index 000000000000..fd161f67045d
--- /dev/null
+++ b/Documentation/filesystems/msharefs.rst
@@ -0,0 +1,19 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=====================================================
+msharefs - a filesystem to support shared page tables
+=====================================================
+
+msharefs is a ram-based filesystem that allows multiple processes to
+share page table entries for shared pages.
+
+msharefs is typically mounted like this::
+
+	mount -t msharefs none /sys/fs/mshare
+
+When a process calls mshare syscall with a name for the shared address
+range, a file with the same name is created under msharefs with that
+name. This file can be opened by another process, if permissions
+allow, to query the addresses shared under this range. These files are
+removed by mshare_unlink syscall and can not be deleted directly.
+Hence these files are created as immutable files.
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index 35687dcb1a42..26a12e33a3c1 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -98,5 +98,6 @@
 #define Z3FOLD_MAGIC		0x33
 #define PPC_CMM_MAGIC		0xc7571590
 #define SECRETMEM_MAGIC		0x5345434d	/* "SECM" */
+#define MSHARE_MAGIC		0x4d534852	/* "MSHR" */
 
 #endif /* __LINUX_MAGIC_H__ */
diff --git a/mm/mshare.c b/mm/mshare.c
index c723f8369f06..e48d0f615f9f 100644
--- a/mm/mshare.c
+++ b/mm/mshare.c
@@ -10,20 +10,117 @@
  *		Khalid Aziz
  */
 
-#include <linux/anon_inodes.h>
 #include <linux/fs.h>
+#include <linux/mount.h>
 #include <linux/syscalls.h>
+#include <linux/uaccess.h>
+#include <linux/pseudo_fs.h>
+#include <linux/fileattr.h>
+#include <uapi/linux/magic.h>
+#include <uapi/linux/limits.h>
 
-static const struct file_operations mshare_fops = {
+static struct super_block *msharefs_sb;
+
+static const struct file_operations msharefs_file_operations = {
+	.open	= simple_open,
+	.llseek	= no_llseek,
 };
 
+static int
+msharefs_d_hash(const struct dentry *dentry, struct qstr *qstr)
+{
+	unsigned long hash = init_name_hash(dentry);
+	const unsigned char *s = qstr->name;
+	unsigned int len = qstr->len;
+
+	while (len--)
+		hash = partial_name_hash(*s++, hash);
+	qstr->hash = end_name_hash(hash);
+	return 0;
+}
+
+static struct dentry
+*msharefs_alloc_dentry(struct dentry *parent, const char *name)
+{
+	struct dentry *d;
+	struct qstr q;
+	int err;
+
+	q.name = name;
+	q.len = strlen(name);
+
+	err = msharefs_d_hash(parent, &q);
+	if (err)
+		return ERR_PTR(err);
+
+	d = d_alloc(parent, &q);
+	if (d)
+		return d;
+
+	return ERR_PTR(-ENOMEM);
+}
+
+static struct inode
+*msharefs_get_inode(struct super_block *sb, int mode)
+{
+	struct inode *inode = new_inode(sb);
+
+	if (inode) {
+		inode->i_ino = get_next_ino();
+		inode->i_mode = mode;
+
+		/*
+		 * msharefs are not meant to be manipulated from userspace.
+		 * Reading from the file is the only allowed operation
+		 */
+		inode->i_flags = S_IMMUTABLE;
+
+		inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+		inode->i_fop = &msharefs_file_operations;
+		inode->i_size = 0;
+		inode->i_uid = current_fsuid();
+		inode->i_gid = current_fsgid();
+	}
+
+	return inode;
+}
+
+static int
+mshare_file_create(const char *name, unsigned long flags)
+{
+	struct inode *inode;
+	struct dentry *root, *dentry;
+	int err = 0;
+
+	root = msharefs_sb->s_root;
+
+	inode = msharefs_get_inode(msharefs_sb, S_IFREG | 0400);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	dentry = msharefs_alloc_dentry(root, name);
+	if (IS_ERR(dentry)) {
+		err = PTR_ERR(dentry);
+		goto fail_inode;
+	}
+
+	d_add(dentry, inode);
+
+	return err;
+
+fail_inode:
+	iput(inode);
+	return err;
+}
+
 /*
- * mshare syscall. Returns a file descriptor
+ * mshare syscall
  */
-SYSCALL_DEFINE5(mshare, const char *, name, unsigned long, addr,
+SYSCALL_DEFINE5(mshare, const char __user *, name, unsigned long, addr,
 		unsigned long, len, int, oflag, mode_t, mode)
 {
-	int fd;
+	char mshare_name[NAME_MAX];
+	int err;
 
 	/*
 	 * Address range being shared must be aligned to pgdir
@@ -32,15 +129,14 @@ SYSCALL_DEFINE5(mshare, const char *, name, unsigned long, addr,
 	if ((addr | len) & (PGDIR_SIZE - 1))
 		return -EINVAL;
 
-	/*
-	 * Allocate a file descriptor to return
-	 *
-	 * TODO: This code ignores the object name completely. Add
-	 * support for that
-	 */
-	fd = anon_inode_getfd("mshare", &mshare_fops, NULL, O_RDWR);
+	err = copy_from_user(mshare_name, name, NAME_MAX);
+	if (err)
+		goto err_out;
 
-	return fd;
+	err = mshare_file_create(mshare_name, oflag);
+
+err_out:
+	return err;
 }
 
 /*
@@ -48,7 +144,8 @@ SYSCALL_DEFINE5(mshare, const char *, name, unsigned long, addr,
  */
 SYSCALL_DEFINE1(mshare_unlink, const char *, name)
 {
-	int fd;
+	char mshare_name[NAME_MAX];
+	int err;
 
 	/*
 	 * Delete the named object
@@ -56,5 +153,71 @@ SYSCALL_DEFINE1(mshare_unlink, const char *, name)
 	 * TODO: Mark mshare'd range for deletion
 	 *
 	 */
+	err = copy_from_user(mshare_name, name, NAME_MAX);
+	if (err)
+		goto err_out;
+	return 0;
+
+err_out:
+	return err;
+}
+
+static const struct dentry_operations msharefs_d_ops = {
+	.d_hash = msharefs_d_hash,
+};
+
+static int
+msharefs_fill_super(struct super_block *sb, struct fs_context *fc)
+{
+	static const struct tree_descr empty_descr = {""};
+	int err;
+
+	sb->s_d_op = &msharefs_d_ops;
+	err = simple_fill_super(sb, MSHARE_MAGIC, &empty_descr);
+	if (err)
+		return err;
+
+	msharefs_sb = sb;
+	return 0;
+}
+
+static int
+msharefs_get_tree(struct fs_context *fc)
+{
+	return get_tree_single(fc, msharefs_fill_super);
+}
+
+static const struct fs_context_operations msharefs_context_ops = {
+	.get_tree	= msharefs_get_tree,
+};
+
+static int
+mshare_init_fs_context(struct fs_context *fc)
+{
+	fc->ops = &msharefs_context_ops;
 	return 0;
 }
+
+static struct file_system_type mshare_fs = {
+	.name			= "msharefs",
+	.init_fs_context	= mshare_init_fs_context,
+	.kill_sb		= kill_litter_super,
+};
+
+static int
+mshare_init(void)
+{
+	int ret = 0;
+
+	ret = sysfs_create_mount_point(fs_kobj, "mshare");
+	if (ret)
+		return ret;
+
+	ret = register_filesystem(&mshare_fs);
+	if (ret)
+		sysfs_remove_mount_point(fs_kobj, "mshare");
+
+	return ret;
+}
+
+fs_initcall(mshare_init);
-- 
2.32.0


  parent reply	other threads:[~2022-01-18 21:21 UTC|newest]

Thread overview: 54+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-01-18 21:19 [RFC PATCH 0/6] Add support for shared PTEs across processes Khalid Aziz
2022-01-18 21:19 ` [RFC PATCH 1/6] mm: Add new system calls mshare, mshare_unlink Khalid Aziz
2022-01-18 21:19 ` Khalid Aziz [this message]
2022-01-18 21:19 ` [RFC PATCH 3/6] mm: Add read for msharefs Khalid Aziz
2022-01-18 21:19 ` [RFC PATCH 4/6] mm: implement mshare_unlink syscall Khalid Aziz
2022-01-18 21:19 ` [RFC PATCH 5/6] mm: Add locking to msharefs syscalls Khalid Aziz
2022-01-18 21:19 ` [RFC PATCH 6/6] mm: Add basic page table sharing using mshare Khalid Aziz
2022-01-18 21:41 ` [RFC PATCH 0/6] Add support for shared PTEs across processes Dave Hansen
2022-01-18 21:46   ` Matthew Wilcox
2022-01-18 22:47     ` Khalid Aziz
2022-01-18 22:06 ` Dave Hansen
2022-01-18 22:52   ` Khalid Aziz
2022-01-19 11:38 ` Mark Hemment
2022-01-19 17:02   ` Khalid Aziz
2022-01-20 12:49     ` Mark Hemment
2022-01-20 19:15       ` Khalid Aziz
2022-01-24 15:15         ` Mark Hemment
2022-01-24 15:27           ` Matthew Wilcox
2022-01-24 22:20           ` Khalid Aziz
2022-01-21  1:08 ` Barry Song
2022-01-21  2:13   ` Matthew Wilcox
2022-01-21  7:35     ` Barry Song
2022-01-21 14:47       ` Matthew Wilcox
2022-01-21 16:41         ` Khalid Aziz
2022-01-22  1:39           ` Longpeng (Mike, Cloud Infrastructure Service Product Dept.)
2022-01-22  1:41             ` Matthew Wilcox
2022-01-22 10:18               ` Thomas Schoebel-Theuer
2022-01-22 16:09                 ` Matthew Wilcox
2022-01-22 11:31 ` Mike Rapoport
2022-01-22 18:29   ` Andy Lutomirski
2022-01-24 18:48   ` Khalid Aziz
2022-01-24 19:45     ` Andy Lutomirski
2022-01-24 22:30       ` Khalid Aziz
2022-01-24 23:16         ` Andy Lutomirski
2022-01-24 23:44           ` Khalid Aziz
2022-01-25 11:42 ` Kirill A. Shutemov
2022-01-25 12:09   ` William Kucharski
2022-01-25 13:18     ` David Hildenbrand
2022-01-25 14:01       ` Kirill A. Shutemov
2022-01-25 13:23   ` Matthew Wilcox
2022-01-25 13:59     ` Kirill A. Shutemov
2022-01-25 14:09       ` Matthew Wilcox
2022-01-25 18:57         ` Kirill A. Shutemov
2022-01-25 18:59           ` Matthew Wilcox
2022-01-26  4:04             ` Matthew Wilcox
2022-01-26 10:16               ` David Hildenbrand
2022-01-26 13:38                 ` Matthew Wilcox
2022-01-26 13:55                   ` David Hildenbrand
2022-01-26 14:12                     ` Matthew Wilcox
2022-01-26 14:30                       ` David Hildenbrand
2022-01-26 14:12                   ` Mike Rapoport
2022-01-26 13:42               ` Kirill A. Shutemov
2022-01-26 14:18                 ` Mike Rapoport
2022-01-26 17:33                   ` Khalid Aziz

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=32784ee26d895bae2484e15fef205d5590720c4b.1642526745.git.khalid.aziz@oracle.com \
    --to=khalid.aziz@oracle.com \
    --cc=akpm@linux-foundation.org \
    --cc=arnd@arndb.de \
    --cc=dave.hansen@linux.intel.com \
    --cc=david@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=longpeng2@huawei.com \
    --cc=rppt@kernel.org \
    --cc=surenb@google.com \
    --cc=willy@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.