linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Khalid Aziz <khalid.aziz@oracle.com>
To: akpm@linux-foundation.org, willy@infradead.org
Cc: Khalid Aziz <khalid.aziz@oracle.com>,
	aneesh.kumar@linux.ibm.com, arnd@arndb.de, 21cnbao@gmail.com,
	corbet@lwn.net, dave.hansen@linux.intel.com, david@redhat.com,
	ebiederm@xmission.com, hagen@jauu.net, jack@suse.cz,
	keescook@chromium.org, kirill@shutemov.name, kucharsk@gmail.com,
	linkinjeon@kernel.org, linux-fsdevel@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	longpeng2@huawei.com, luto@kernel.org, markhemm@googlemail.com,
	pcc@google.com, rppt@kernel.org, sieberf@amazon.com,
	sjpark@amazon.de, surenb@google.com, tst@schoebel-theuer.de,
	yzaikin@google.com
Subject: [PATCH v1 08/14] mm/mshare: Add basic page table sharing using mshare
Date: Mon, 11 Apr 2022 10:05:52 -0600	[thread overview]
Message-ID: <5c96dd165d7ec3da14306b8fd857c7eb95a8c3e6.1649370874.git.khalid.aziz@oracle.com> (raw)
In-Reply-To: <cover.1649370874.git.khalid.aziz@oracle.com>

This patch adds basic page table sharing across tasks by making
mshare syscall. It does this by creating a new mm_struct which
hosts the shared vmas and page tables. This mm_struct is
maintained as long as there is at least one task using the mshare'd
range. It is cleaned up by the last mshare_unlink syscall.

Signed-off-by: Khalid Aziz <khalid.aziz@oracle.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 mm/internal.h |   2 +
 mm/memory.c   |  35 ++++++++++
 mm/mshare.c   | 190 ++++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 214 insertions(+), 13 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index cf50a471384e..68f82f0f8b66 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -718,6 +718,8 @@ void vunmap_range_noflush(unsigned long start, unsigned long end);
 int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
 		      unsigned long addr, int page_nid, int *flags);
 
+extern vm_fault_t find_shared_vma(struct vm_area_struct **vma,
+			unsigned long *addrp);
 static inline bool vma_is_shared(const struct vm_area_struct *vma)
 {
 	return vma->vm_flags & VM_SHARED_PT;
diff --git a/mm/memory.c b/mm/memory.c
index c125c4969913..c77c0d643ea8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4776,6 +4776,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 			   unsigned int flags, struct pt_regs *regs)
 {
 	vm_fault_t ret;
+	bool shared = false;
 
 	__set_current_state(TASK_RUNNING);
 
@@ -4785,6 +4786,15 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 	/* do counter updates before entering really critical section. */
 	check_sync_rss_stat(current);
 
+	if (unlikely(vma_is_shared(vma))) {
+		ret = find_shared_vma(&vma, &address);
+		if (ret)
+			return ret;
+		if (!vma)
+			return VM_FAULT_SIGSEGV;
+		shared = true;
+	}
+
 	if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
 					    flags & FAULT_FLAG_INSTRUCTION,
 					    flags & FAULT_FLAG_REMOTE))
@@ -4802,6 +4812,31 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 	else
 		ret = __handle_mm_fault(vma, address, flags);
 
+	/*
+	 * Release the read lock on shared VMA's parent mm unless
+	 * __handle_mm_fault released the lock already.
+	 * __handle_mm_fault sets VM_FAULT_RETRY in return value if
+	 * it released mmap lock. If lock was released, that implies
+	 * the lock would have been released on task's original mm if
+	 * this were not a shared PTE vma. To keep lock state consistent,
+	 * make sure to release the lock on task's original mm
+	 */
+	if (shared) {
+		int release_mmlock = 1;
+
+		if (!(ret & VM_FAULT_RETRY)) {
+			mmap_read_unlock(vma->vm_mm);
+			release_mmlock = 0;
+		} else if ((flags & FAULT_FLAG_ALLOW_RETRY) &&
+			(flags & FAULT_FLAG_RETRY_NOWAIT)) {
+			mmap_read_unlock(vma->vm_mm);
+			release_mmlock = 0;
+		}
+
+		if (release_mmlock)
+			mmap_read_unlock(current->mm);
+	}
+
 	if (flags & FAULT_FLAG_USER) {
 		mem_cgroup_exit_user_fault();
 		/*
diff --git a/mm/mshare.c b/mm/mshare.c
index cd2f7ad24d9d..d1896adcb00f 100644
--- a/mm/mshare.c
+++ b/mm/mshare.c
@@ -17,18 +17,49 @@
 #include <linux/pseudo_fs.h>
 #include <linux/fileattr.h>
 #include <linux/refcount.h>
+#include <linux/mman.h>
 #include <linux/sched/mm.h>
 #include <uapi/linux/magic.h>
 #include <uapi/linux/limits.h>
 
 struct mshare_data {
-	struct mm_struct *mm;
+	struct mm_struct *mm, *host_mm;
 	mode_t mode;
 	refcount_t refcnt;
 };
 
 static struct super_block *msharefs_sb;
 
+/* Returns holding the host mm's lock for read.  Caller must release. */
+vm_fault_t
+find_shared_vma(struct vm_area_struct **vmap, unsigned long *addrp)
+{
+	struct vm_area_struct *vma, *guest = *vmap;
+	struct mshare_data *info = guest->vm_private_data;
+	struct mm_struct *host_mm = info->mm;
+	unsigned long host_addr;
+	pgd_t *pgd, *guest_pgd;
+
+	host_addr = *addrp - guest->vm_start + host_mm->mmap_base;
+	pgd = pgd_offset(host_mm, host_addr);
+	guest_pgd = pgd_offset(current->mm, *addrp);
+	if (!pgd_same(*guest_pgd, *pgd)) {
+		set_pgd(guest_pgd, *pgd);
+		return VM_FAULT_NOPAGE;
+	}
+
+	*addrp = host_addr;
+	mmap_read_lock(host_mm);
+	vma = find_vma(host_mm, host_addr);
+
+	/* XXX: expand stack? */
+	if (vma && vma->vm_start > host_addr)
+		vma = NULL;
+
+	*vmap = vma;
+	return 0;
+}
+
 static void
 msharefs_evict_inode(struct inode *inode)
 {
@@ -169,11 +200,13 @@ SYSCALL_DEFINE5(mshare, const char __user *, name, unsigned long, addr,
 		unsigned long, len, int, oflag, mode_t, mode)
 {
 	struct mshare_data *info;
-	struct mm_struct *mm;
 	struct filename *fname = getname(name);
 	struct dentry *dentry;
 	struct inode *inode;
 	struct qstr namestr;
+	struct vm_area_struct *vma, *next, *new_vma;
+	struct mm_struct *new_mm;
+	unsigned long end;
 	int err = PTR_ERR(fname);
 
 	/*
@@ -193,6 +226,8 @@ SYSCALL_DEFINE5(mshare, const char __user *, name, unsigned long, addr,
 	if (IS_ERR(fname))
 		goto err_out;
 
+	end = addr + len;
+
 	/*
 	 * Does this mshare entry exist already? If it does, calling
 	 * mshare with O_EXCL|O_CREAT is an error
@@ -205,49 +240,165 @@ SYSCALL_DEFINE5(mshare, const char __user *, name, unsigned long, addr,
 	inode_lock(d_inode(msharefs_sb->s_root));
 	dentry = d_lookup(msharefs_sb->s_root, &namestr);
 	if (dentry && (oflag & (O_EXCL|O_CREAT))) {
+		inode = d_inode(dentry);
 		err = -EEXIST;
 		dput(dentry);
 		goto err_unlock_inode;
 	}
 
 	if (dentry) {
+		unsigned long mapaddr, prot = PROT_NONE;
+
 		inode = d_inode(dentry);
 		if (inode == NULL) {
+			mmap_write_unlock(current->mm);
 			err = -EINVAL;
 			goto err_out;
 		}
 		info = inode->i_private;
-		refcount_inc(&info->refcnt);
 		dput(dentry);
+
+		/*
+		 * Map in the address range as anonymous mappings
+		 */
+		oflag &= (O_RDONLY | O_WRONLY | O_RDWR);
+		if (oflag & O_RDONLY)
+			prot |= PROT_READ;
+		else if (oflag & O_WRONLY)
+			prot |= PROT_WRITE;
+		else if (oflag & O_RDWR)
+			prot |= (PROT_READ | PROT_WRITE);
+		mapaddr = vm_mmap(NULL, addr, len, prot,
+				MAP_FIXED | MAP_SHARED | MAP_ANONYMOUS, 0);
+		if (IS_ERR((void *)mapaddr)) {
+			err = -EINVAL;
+			goto err_out;
+		}
+
+		refcount_inc(&info->refcnt);
+
+		/*
+		 * Now that we have mmap'd the mshare'd range, update vma
+		 * flags and vm_mm pointer for this mshare'd range.
+		 */
+		mmap_write_lock(current->mm);
+		vma = find_vma(current->mm, addr);
+		if (vma && vma->vm_start < addr) {
+			mmap_write_unlock(current->mm);
+			err = -EINVAL;
+			goto err_out;
+		}
+
+		while (vma && vma->vm_start < (addr + len)) {
+			vma->vm_private_data = info;
+			vma->vm_mm = info->mm;
+			vma->vm_flags |= VM_SHARED_PT;
+			next = vma->vm_next;
+			vma = next;
+		}
 	} else {
-		mm = mm_alloc();
-		if (!mm)
+		unsigned long myaddr;
+		struct mm_struct *old_mm;
+
+		old_mm = current->mm;
+		new_mm = mm_alloc();
+		if (!new_mm)
 			return -ENOMEM;
 		info = kzalloc(sizeof(*info), GFP_KERNEL);
 		if (!info) {
 			err = -ENOMEM;
 			goto err_relmm;
 		}
-		mm->mmap_base = addr;
-		mm->task_size = addr + len;
-		if (!mm->task_size)
-			mm->task_size--;
-		info->mm = mm;
+		new_mm->mmap_base = addr;
+		new_mm->task_size = addr + len;
+		if (!new_mm->task_size)
+			new_mm->task_size--;
+		info->mm = new_mm;
+		info->host_mm = old_mm;
 		info->mode = mode;
 		refcount_set(&info->refcnt, 1);
+
+		/*
+		 * VMAs for this address range may or may not exist.
+		 * If VMAs exist, they should be marked as shared at
+		 * this point and page table info should be copied
+		 * over to newly created mm_struct. TODO: If VMAs do not
+		 * exist, create them and mark them as shared.
+		 */
+		mmap_write_lock(old_mm);
+		vma = find_vma_intersection(old_mm, addr, end);
+		if (!vma) {
+			err = -EINVAL;
+			goto unlock;
+		}
+		/*
+		 * TODO: If the currently allocated VMA goes beyond the
+		 * mshare'd range, this VMA needs to be split.
+		 *
+		 * Double check that source VMAs do not extend outside
+		 * the range
+		 */
+		vma = find_vma(old_mm, addr + len);
+		if (vma && vma->vm_start < (addr + len)) {
+			err = -EINVAL;
+			goto unlock;
+		}
+
+		vma = find_vma(old_mm, addr);
+		if (vma && vma->vm_start < addr) {
+			err = -EINVAL;
+			goto unlock;
+		}
+
+		mmap_write_lock(new_mm);
+		while (vma && vma->vm_start < (addr + len)) {
+			/*
+			 * Copy this vma over to host mm
+			 */
+			vma->vm_private_data = info;
+			vma->vm_mm = new_mm;
+			vma->vm_flags |= VM_SHARED_PT;
+			new_vma = vm_area_dup(vma);
+			if (!new_vma) {
+				err = -ENOMEM;
+				goto unlock;
+			}
+			err = insert_vm_struct(new_mm, new_vma);
+			if (err)
+				goto unlock;
+
+			vma = vma->vm_next;
+		}
+		mmap_write_unlock(new_mm);
+
 		err = mshare_file_create(fname, oflag, info);
 		if (err)
-			goto err_relinfo;
+			goto unlock;
+
+		/*
+		 * Copy over current PTEs
+		 */
+		myaddr = addr;
+		while (myaddr < new_mm->task_size) {
+			*pgd_offset(new_mm, myaddr) = *pgd_offset(old_mm, myaddr);
+			myaddr += PGDIR_SIZE;
+		}
+		/*
+		 * TODO: Free the corresponding page table in calling
+		 * process
+		 */
 	}
 
+	mmap_write_unlock(current->mm);
 	inode_unlock(d_inode(msharefs_sb->s_root));
 	putname(fname);
 	return 0;
 
-err_relinfo:
+unlock:
+	mmap_write_unlock(current->mm);
 	kfree(info);
 err_relmm:
-	mmput(mm);
+	mmput(new_mm);
 err_unlock_inode:
 	inode_unlock(d_inode(msharefs_sb->s_root));
 err_out:
@@ -294,11 +445,24 @@ SYSCALL_DEFINE1(mshare_unlink, const char *, name)
 
 	/*
 	 * Is this the last reference?
+	 * TODO: permission checks are needed before proceeding
 	 */
 	if (refcount_dec_and_test(&info->refcnt)) {
 		simple_unlink(d_inode(msharefs_sb->s_root), dentry);
 		d_drop(dentry);
 		d_delete(dentry);
+		/*
+		 * TODO: Release all physical pages allocated for this
+		 * mshare range and release associated page table. If
+		 * the final unlink happens from the process that created
+		 * mshare'd range, do we return page tables and pages to
+		 * that process so the creating process can continue using
+		 * the address range it had chosen to mshare at some
+		 * point?
+		 *
+		 * TODO: unmap shared vmas from every task that is using
+		 * this mshare'd range.
+		 */
 		mmput(info->mm);
 		kfree(info);
 	} else {
-- 
2.32.0



  parent reply	other threads:[~2022-04-11 16:08 UTC|newest]

Thread overview: 37+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-04-11 16:05 [PATCH v1 00/14] Add support for shared PTEs across processes Khalid Aziz
2022-04-11 16:05 ` [PATCH v1 01/14] mm: Add new system calls mshare, mshare_unlink Khalid Aziz
2022-04-11 16:05 ` [PATCH v1 02/14] mm/mshare: Add msharefs filesystem Khalid Aziz
2022-04-11 16:05 ` [PATCH v1 03/14] mm/mshare: Add read for msharefs Khalid Aziz
2022-04-11 16:05 ` [PATCH v1 04/14] mm/mshare: implement mshare_unlink syscall Khalid Aziz
2022-04-11 16:05 ` [PATCH v1 05/14] mm/mshare: Add locking to msharefs syscalls Khalid Aziz
2022-04-11 16:05 ` [PATCH v1 06/14] mm/mshare: Check for mounted filesystem Khalid Aziz
2022-04-11 16:05 ` [PATCH v1 07/14] mm/mshare: Add vm flag for shared PTE Khalid Aziz
2022-04-11 16:05 ` Khalid Aziz [this message]
2022-04-11 18:48   ` [PATCH v1 08/14] mm/mshare: Add basic page table sharing using mshare Dave Hansen
2022-04-11 20:39     ` Khalid Aziz
2022-05-30 11:11   ` Barry Song
2022-06-28 20:11     ` Khalid Aziz
2022-05-31  3:46   ` Barry Song
2022-06-28 20:16     ` Khalid Aziz
2022-04-11 16:05 ` [PATCH v1 09/14] mm/mshare: Do not free PTEs for mshare'd PTEs Khalid Aziz
2022-05-31  4:24   ` Barry Song
2022-06-29 17:38     ` Khalid Aziz
2022-07-03 20:54       ` Andy Lutomirski
2022-07-06 20:33         ` Khalid Aziz
2022-04-11 16:05 ` [PATCH v1 10/14] mm/mshare: Check for mapped vma when mshare'ing existing mshare'd range Khalid Aziz
2022-04-11 16:05 ` [PATCH v1 11/14] mm/mshare: unmap vmas in mshare_unlink Khalid Aziz
2022-04-11 16:05 ` [PATCH v1 12/14] mm/mshare: Add a proc file with mshare alignment/size information Khalid Aziz
2022-04-11 16:05 ` [PATCH v1 13/14] mm/mshare: Enforce mshare'd region permissions Khalid Aziz
2022-04-11 16:05 ` [PATCH v1 14/14] mm/mshare: Copy PTEs to host mm Khalid Aziz
2022-04-11 17:37 ` [PATCH v1 00/14] Add support for shared PTEs across processes Matthew Wilcox
2022-04-11 18:51   ` Dave Hansen
2022-04-11 19:08     ` Matthew Wilcox
2022-04-11 19:52   ` Khalid Aziz
2022-04-11 18:47 ` Dave Hansen
2022-04-11 20:10 ` Eric W. Biederman
2022-04-11 22:21   ` Khalid Aziz
2022-05-30 10:48 ` Barry Song
2022-05-30 11:18   ` David Hildenbrand
2022-05-30 11:49     ` Barry Song
2022-06-29 17:48     ` Khalid Aziz
2022-06-29 17:40   ` Khalid Aziz

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=5c96dd165d7ec3da14306b8fd857c7eb95a8c3e6.1649370874.git.khalid.aziz@oracle.com \
    --to=khalid.aziz@oracle.com \
    --cc=21cnbao@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=aneesh.kumar@linux.ibm.com \
    --cc=arnd@arndb.de \
    --cc=corbet@lwn.net \
    --cc=dave.hansen@linux.intel.com \
    --cc=david@redhat.com \
    --cc=ebiederm@xmission.com \
    --cc=hagen@jauu.net \
    --cc=jack@suse.cz \
    --cc=keescook@chromium.org \
    --cc=kirill@shutemov.name \
    --cc=kucharsk@gmail.com \
    --cc=linkinjeon@kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=longpeng2@huawei.com \
    --cc=luto@kernel.org \
    --cc=markhemm@googlemail.com \
    --cc=pcc@google.com \
    --cc=rppt@kernel.org \
    --cc=sieberf@amazon.com \
    --cc=sjpark@amazon.de \
    --cc=surenb@google.com \
    --cc=tst@schoebel-theuer.de \
    --cc=willy@infradead.org \
    --cc=yzaikin@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).