All of lore.kernel.org
 help / color / mirror / Atom feed
From: Khalid Aziz <khalid.aziz@oracle.com>
To: akpm@linux-foundation.org, willy@infradead.org,
	markhemm@googlemail.com, viro@zeniv.linux.org.uk,
	david@redhat.com, mike.kravetz@oracle.com
Cc: Khalid Aziz <khalid.aziz@oracle.com>,
	andreyknvl@gmail.com, dave.hansen@intel.com, luto@kernel.org,
	brauner@kernel.org, arnd@arndb.de, ebiederm@xmission.com,
	catalin.marinas@arm.com, linux-arch@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	mhiramat@kernel.org, rostedt@goodmis.org,
	vasily.averin@linux.dev, xhao@linux.alibaba.com, pcc@google.com,
	neilb@suse.de, maz@kernel.org
Subject: [PATCH RFC v2 3/4] mm/ptshare: Create new mm struct for page table sharing
Date: Wed, 26 Apr 2023 10:49:50 -0600	[thread overview]
Message-ID: <1fd52581f4e4960a4d07cb9784d56659ec139d3c.1682453344.git.khalid.aziz@oracle.com> (raw)
In-Reply-To: <cover.1682453344.git.khalid.aziz@oracle.com>

When a process passes MAP_SHARED_PT flag to mmap(), create a new mm
struct to hold the shareable page table entries for the newly mapped
region.  This new mm is not associated with a task.  Its lifetime is
until the last shared mapping is deleted.  This patch also adds a
new pointer "ptshare_data" to struct address_space which points to
the data structure that will contain pointer to this newly created
mm along with properties of the shared mapping. ptshare_data
maintains a refcount for the shared mapping so that it can be
cleaned up upon last unmap.

Signed-off-by: Khalid Aziz <khalid.aziz@oracle.com>
Suggested-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 include/linux/fs.h |   2 +
 mm/Makefile        |   2 +-
 mm/internal.h      |  14 +++++
 mm/mmap.c          |  72 ++++++++++++++++++++++++++
 mm/ptshare.c       | 126 +++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 215 insertions(+), 1 deletion(-)
 create mode 100644 mm/ptshare.c

diff --git a/include/linux/fs.h b/include/linux/fs.h
index c85916e9f7db..db8d3257c712 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -422,6 +422,7 @@ extern const struct address_space_operations empty_aops;
  * @private_lock: For use by the owner of the address_space.
  * @private_list: For use by the owner of the address_space.
  * @private_data: For use by the owner of the address_space.
+ * @ptshare_data: For shared page table use
  */
 struct address_space {
 	struct inode		*host;
@@ -443,6 +444,7 @@ struct address_space {
 	spinlock_t		private_lock;
 	struct list_head	private_list;
 	void			*private_data;
+	void			*ptshare_data;
 } __attribute__((aligned(sizeof(long)))) __randomize_layout;
 	/*
 	 * On most architectures that alignment is already the case; but
diff --git a/mm/Makefile b/mm/Makefile
index 8e105e5b3e29..d9bb14fdf220 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -40,7 +40,7 @@ mmu-y			:= nommu.o
 mmu-$(CONFIG_MMU)	:= highmem.o memory.o mincore.o \
 			   mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
 			   msync.o page_vma_mapped.o pagewalk.o \
-			   pgtable-generic.o rmap.o vmalloc.o
+			   pgtable-generic.o rmap.o vmalloc.o ptshare.o
 
 
 ifdef CONFIG_CROSS_MEMORY_ATTACH
diff --git a/mm/internal.h b/mm/internal.h
index 4d60d2d5fe19..3efb8738e26f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1047,4 +1047,18 @@ static inline bool vma_is_shared(const struct vm_area_struct *vma)
 {
 	return vma->vm_flags & VM_SHARED_PT;
 }
+
+/*
+ * mm/ptshare.c
+ */
+struct ptshare_data {
+	struct mm_struct *mm;
+	refcount_t refcnt;
+	unsigned long start;
+	unsigned long size;
+	unsigned long mode;
+};
+int ptshare_new_mm(struct file *file, struct vm_area_struct *vma);
+void ptshare_del_mm(struct vm_area_struct *vm);
+int ptshare_insert_vma(struct mm_struct *mm, struct vm_area_struct *vma);
 #endif	/* __MM_INTERNAL_H */
diff --git a/mm/mmap.c b/mm/mmap.c
index 8b46d465f8d4..c5e9b7f6de90 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1382,6 +1382,60 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 	    ((vm_flags & VM_LOCKED) ||
 	     (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
 		*populate = len;
+
+#if VM_SHARED_PT
+	/*
+	 * Check if this mapping is a candidate for page table sharing
+	 * at PMD level. It is if following conditions hold:
+	 *	- It is not anonymous mapping
+	 *	- It is not hugetlbfs mapping (for now)
+	 *	- flags conatins MAP_SHARED or MAP_SHARED_VALIDATE and
+	 *	  MAP_SHARED_PT
+	 *	- Start address is aligned to PMD size
+	 *	- Mapping size is a multiple of PMD size
+	 */
+	if (ptshare && file && !is_file_hugepages(file)) {
+		struct vm_area_struct *vma;
+
+		vma = find_vma(mm, addr);
+		if (!((vma->vm_start | vma->vm_end) & (PMD_SIZE - 1))) {
+			struct ptshare_data *info = file->f_mapping->ptshare_data;
+			/*
+			 * If this mapping has not been set up for page table
+			 * sharing yet, do so by creating a new mm to hold the
+			 * shared page tables for this mapping
+			 */
+			if (info == NULL) {
+				int ret;
+
+				ret = ptshare_new_mm(file, vma);
+				if (ret < 0)
+					return ret;
+
+				info = file->f_mapping->ptshare_data;
+				ret = ptshare_insert_vma(info->mm, vma);
+				if (ret < 0)
+					addr = ret;
+				else
+					vm_flags_set(vma, VM_SHARED_PT);
+			} else {
+				/*
+				 * Page tables will be shared only if the
+				 * file is mapped in with the same permissions
+				 * across all mappers with same starting
+				 * address and size
+				 */
+				if (((prot & info->mode) == info->mode) &&
+					(addr == info->start) &&
+					(len == info->size)) {
+					vm_flags_set(vma, VM_SHARED_PT);
+					refcount_inc(&info->refcnt);
+				}
+			}
+		}
+	}
+#endif
+
 	return addr;
 }
 
@@ -2495,6 +2549,22 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
 	if (end == start)
 		return -EINVAL;
 
+	/*
+	 * Check if this vma uses shared page tables
+	 */
+	vma = find_vma_intersection(mm, start, end);
+	if (vma && unlikely(vma_is_shared(vma))) {
+		struct ptshare_data *info = NULL;
+
+		if (vma->vm_file && vma->vm_file->f_mapping)
+			info = vma->vm_file->f_mapping->ptshare_data;
+		/* Don't allow partial munmaps */
+		if (info && ((start != info->start) || (len != info->size)))
+			return -EINVAL;
+		ptshare_del_mm(vma);
+	}
+
+
 	 /* arch_unmap() might do unmaps itself.  */
 	arch_unmap(mm, start, end);
 
@@ -2664,6 +2734,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 			}
 		}
 
+		if (vm_flags & VM_SHARED_PT)
+			vm_flags_set(vma, VM_SHARED_PT);
 		vm_flags = vma->vm_flags;
 	} else if (vm_flags & VM_SHARED) {
 		error = shmem_zero_setup(vma);
diff --git a/mm/ptshare.c b/mm/ptshare.c
new file mode 100644
index 000000000000..f6784268958c
--- /dev/null
+++ b/mm/ptshare.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Share page table entries when possible to reduce the amount of extra
+ * memory consumed by page tables
+ *
+ * Copyright (C) 2022 Oracle Corp. All rights reserved.
+ * Authors:	Khalid Aziz <khalid.aziz@oracle.com>
+ *		Matthew Wilcox <willy@infradead.org>
+ */
+
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <asm/pgalloc.h>
+#include "internal.h"
+
+/*
+ * Create a new mm struct that will hold the shared PTEs. Pointer to
+ * this new mm is stored in the data structure ptshare_data which also
+ * includes a refcount for any current references to PTEs in this new
+ * mm. This refcount is used to determine when the mm struct for shared
+ * PTEs can be deleted.
+ */
+int
+ptshare_new_mm(struct file *file, struct vm_area_struct *vma)
+{
+	struct mm_struct *new_mm;
+	struct ptshare_data *info = NULL;
+	int retval = 0;
+	unsigned long start = vma->vm_start;
+	unsigned long len = vma->vm_end - vma->vm_start;
+
+	new_mm = mm_alloc();
+	if (!new_mm) {
+		retval = -ENOMEM;
+		goto err_free;
+	}
+	new_mm->mmap_base = start;
+	new_mm->task_size = len;
+	if (!new_mm->task_size)
+		new_mm->task_size--;
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info) {
+		retval = -ENOMEM;
+		goto err_free;
+	}
+	info->mm = new_mm;
+	info->start = start;
+	info->size = len;
+	refcount_set(&info->refcnt, 1);
+	file->f_mapping->ptshare_data = info;
+
+	return retval;
+
+err_free:
+	if (new_mm)
+		mmput(new_mm);
+	kfree(info);
+	return retval;
+}
+
+/*
+ * insert vma into mm holding shared page tables
+ */
+int
+ptshare_insert_vma(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+	struct vm_area_struct *new_vma;
+	int err = 0;
+
+	new_vma = vm_area_dup(vma);
+	if (!new_vma)
+		return -ENOMEM;
+
+	new_vma->vm_file = NULL;
+	/*
+	 * This new vma belongs to host mm, so clear the VM_SHARED_PT
+	 * flag on this so we know this is the host vma when we clean
+	 * up page tables. Do not use THP for page table shared regions
+	 */
+	vm_flags_clear(new_vma, (VM_SHARED | VM_SHARED_PT));
+	vm_flags_set(new_vma, VM_NOHUGEPAGE);
+	new_vma->vm_mm = mm;
+
+	err = insert_vm_struct(mm, new_vma);
+	if (err)
+		return -ENOMEM;
+
+	return err;
+}
+
+/*
+ * Free the mm struct created to hold shared PTEs and associated data
+ * structures
+ */
+static inline void
+free_ptshare_mm(struct ptshare_data *info)
+{
+	mmput(info->mm);
+	kfree(info);
+}
+
+/*
+ * This function is called when a reference to the shared PTEs in mm
+ * struct is dropped. It updates refcount and checks to see if last
+ * reference to the mm struct holding shared PTEs has been dropped. If
+ * so, it cleans up the mm struct and associated data structures
+ */
+void
+ptshare_del_mm(struct vm_area_struct *vma)
+{
+	struct ptshare_data *info;
+	struct file *file = vma->vm_file;
+
+	if (!file || (!file->f_mapping))
+		return;
+	info = file->f_mapping->ptshare_data;
+	WARN_ON(!info);
+	if (!info)
+		return;
+
+	if (refcount_dec_and_test(&info->refcnt)) {
+		free_ptshare_mm(info);
+		file->f_mapping->ptshare_data = NULL;
+	}
+}
-- 
2.37.2


  parent reply	other threads:[~2023-04-26 16:51 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-04-26 16:49 [PATCH RFC v2 0/4] Add support for sharing page tables across processes (Previously mshare) Khalid Aziz
2023-04-26 16:49 ` [PATCH RFC v2 1/4] mm/ptshare: Add vm flag for shared PTE Khalid Aziz
2023-04-26 16:49 ` [PATCH RFC v2 2/4] mm/ptshare: Add flag MAP_SHARED_PT to mmap() Khalid Aziz
2023-04-27 11:17   ` kernel test robot
2023-04-29  4:41   ` kernel test robot
2023-04-26 16:49 ` Khalid Aziz [this message]
2023-06-26  8:08   ` [PATCH RFC v2 3/4] mm/ptshare: Create new mm struct for page table sharing Karim Manaouil
2023-04-26 16:49 ` [PATCH RFC v2 4/4] mm/ptshare: Add page fault handling for page table shared regions Khalid Aziz
2023-04-27  0:24   ` kernel test robot
2023-04-29 14:07   ` kernel test robot
2023-04-26 21:27 ` [PATCH RFC v2 0/4] Add support for sharing page tables across processes (Previously mshare) Mike Kravetz
2023-04-27 16:40   ` Khalid Aziz
2023-06-12 16:25 ` Peter Xu
2023-06-30 11:29 ` Rongwei Wang
2023-07-31  4:35 ` Rongwei Wang
2023-07-31 12:25   ` Matthew Wilcox
2023-07-31 12:50     ` David Hildenbrand
2023-07-31 16:19       ` Rongwei Wang
2023-07-31 16:30         ` David Hildenbrand
2023-07-31 16:38           ` Matthew Wilcox
2023-07-31 16:48             ` David Hildenbrand
2023-07-31 16:54               ` Matthew Wilcox
2023-07-31 17:06                 ` David Hildenbrand
2023-08-01  6:53             ` Rongwei Wang
2023-08-01 19:28               ` Matthew Wilcox
2023-05-07 14:13 [PATCH RFC v2 3/4] mm/ptshare: Create new mm struct for page table sharing kernel test robot

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1fd52581f4e4960a4d07cb9784d56659ec139d3c.1682453344.git.khalid.aziz@oracle.com \
    --to=khalid.aziz@oracle.com \
    --cc=akpm@linux-foundation.org \
    --cc=andreyknvl@gmail.com \
    --cc=arnd@arndb.de \
    --cc=brauner@kernel.org \
    --cc=catalin.marinas@arm.com \
    --cc=dave.hansen@intel.com \
    --cc=david@redhat.com \
    --cc=ebiederm@xmission.com \
    --cc=linux-arch@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=luto@kernel.org \
    --cc=markhemm@googlemail.com \
    --cc=maz@kernel.org \
    --cc=mhiramat@kernel.org \
    --cc=mike.kravetz@oracle.com \
    --cc=neilb@suse.de \
    --cc=pcc@google.com \
    --cc=rostedt@goodmis.org \
    --cc=vasily.averin@linux.dev \
    --cc=viro@zeniv.linux.org.uk \
    --cc=willy@infradead.org \
    --cc=xhao@linux.alibaba.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.