From: Khalid Aziz <khalid.aziz@oracle.com>
To: akpm@linux-foundation.org, willy@infradead.org,
	markhemm@googlemail.com, viro@zeniv.linux.org.uk,
	david@redhat.com, mike.kravetz@oracle.com
Cc: Khalid Aziz <khalid.aziz@oracle.com>,
	andreyknvl@gmail.com, dave.hansen@intel.com, luto@kernel.org,
	brauner@kernel.org, arnd@arndb.de, ebiederm@xmission.com,
	catalin.marinas@arm.com, linux-arch@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	mhiramat@kernel.org, rostedt@goodmis.org,
	vasily.averin@linux.dev, xhao@linux.alibaba.com, pcc@google.com,
	neilb@suse.de, maz@kernel.org
Subject: [PATCH RFC v2 4/4] mm/ptshare: Add page fault handling for page table shared regions
Date: Wed, 26 Apr 2023 10:49:51 -0600	[thread overview]
Message-ID: <9edffd2a12a049a42d9a2c216e3f999aae7e65a4.1682453344.git.khalid.aziz@oracle.com> (raw)
In-Reply-To: <cover.1682453344.git.khalid.aziz@oracle.com>

Add support for creating a new set of shared page tables in a new
mm_struct upon mmap of a region that can potentially share page
tables. Add page fault handling for this now-shared region. Modify
the free_pgtables path to make sure page tables in shared regions
are kept intact when a process using the shared region exits while
other mappers of the region remain. Clean up the mm_struct holding
the shared page tables when the last process sharing the region
exits.
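
For illustration only (not part of this patch): a minimal userspace
sketch of how a process might create a mapping that shares page tables,
assuming the MAP_SHARED_PT mmap() flag added in patch 2 of this series.
The backing file, mapping size and fallback flag value below are
placeholders, not values defined by this series.

	/* Hypothetical example; the MAP_SHARED_PT value is a placeholder. */
	#include <stdlib.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/mman.h>

	#ifndef MAP_SHARED_PT
	#define MAP_SHARED_PT	0x400000	/* placeholder value */
	#endif

	int main(void)
	{
		size_t len = 512UL << 20;	/* large, PMD-sized region */
		int fd = open("/dev/shm/ptshare_test", O_RDWR | O_CREAT, 0600);
		char *p;

		if (fd < 0 || ftruncate(fd, len))
			exit(1);

		p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_SHARED | MAP_SHARED_PT, fd, 0);
		if (p == MAP_FAILED)
			exit(1);

		/*
		 * This write faults; with this patch the fault is routed
		 * through find_shared_vma() to the host mm's page tables.
		 */
		p[0] = 1;

		munmap(p, len);
		close(fd);
		return 0;
	}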

Signed-off-by: Khalid Aziz <khalid.aziz@oracle.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 mm/internal.h |   2 +
 mm/memory.c   | 105 ++++++++++++++++++++++++++++++------
 mm/ptshare.c  | 143 ++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 232 insertions(+), 18 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index 3efb8738e26f..924065f721fe 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1061,4 +1061,6 @@ struct ptshare_data {
 int ptshare_new_mm(struct file *file, struct vm_area_struct *vma);
 void ptshare_del_mm(struct vm_area_struct *vm);
 int ptshare_insert_vma(struct mm_struct *mm, struct vm_area_struct *vma);
+extern vm_fault_t find_shared_vma(struct vm_area_struct **vmap,
+				unsigned long *addrp, unsigned int flags);
 #endif	/* __MM_INTERNAL_H */
diff --git a/mm/memory.c b/mm/memory.c
index 01a23ad48a04..c67318ffd001 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -172,17 +172,28 @@ void mm_trace_rss_stat(struct mm_struct *mm, int member)
  * has been handled earlier when unmapping all the memory regions.
  */
 static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
-			   unsigned long addr)
+			   unsigned long addr, bool shared_pte)
 {
 	pgtable_t token = pmd_pgtable(*pmd);
 	pmd_clear(pmd);
+	/*
+	 * If this address range shares page tables with other processes,
+	 * do not release the pte pages. Those pages will be released when
+	 * the host mm that holds them is released.
+	 */
+	if (shared_pte) {
+		tlb_flush_pmd_range(tlb, addr, PAGE_SIZE);
+		tlb->freed_tables = 1;
+		return;
+	}
 	pte_free_tlb(tlb, token, addr);
 	mm_dec_nr_ptes(tlb->mm);
 }
 
 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 				unsigned long addr, unsigned long end,
-				unsigned long floor, unsigned long ceiling)
+				unsigned long floor, unsigned long ceiling,
+				bool shared_pte)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -194,7 +205,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		free_pte_range(tlb, pmd, addr);
+		free_pte_range(tlb, pmd, addr, shared_pte);
 	} while (pmd++, addr = next, addr != end);
 
 	start &= PUD_MASK;
@@ -210,13 +221,19 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 
 	pmd = pmd_offset(pud, start);
 	pud_clear(pud);
-	pmd_free_tlb(tlb, pmd, start);
-	mm_dec_nr_pmds(tlb->mm);
+	if (shared_pte) {
+		tlb_flush_pud_range(tlb, start, PAGE_SIZE);
+		tlb->freed_tables = 1;
+	} else {
+		pmd_free_tlb(tlb, pmd, start);
+		mm_dec_nr_pmds(tlb->mm);
+	}
 }
 
 static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
 				unsigned long addr, unsigned long end,
-				unsigned long floor, unsigned long ceiling)
+				unsigned long floor, unsigned long ceiling,
+				bool shared_pte)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -228,7 +245,8 @@ static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
+		free_pmd_range(tlb, pud, addr, next, floor, ceiling,
+				shared_pte);
 	} while (pud++, addr = next, addr != end);
 
 	start &= P4D_MASK;
@@ -250,7 +268,8 @@ static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
 
 static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
 				unsigned long addr, unsigned long end,
-				unsigned long floor, unsigned long ceiling)
+				unsigned long floor, unsigned long ceiling,
+				bool shared_pte)
 {
 	p4d_t *p4d;
 	unsigned long next;
@@ -262,7 +281,8 @@ static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
 		next = p4d_addr_end(addr, end);
 		if (p4d_none_or_clear_bad(p4d))
 			continue;
-		free_pud_range(tlb, p4d, addr, next, floor, ceiling);
+		free_pud_range(tlb, p4d, addr, next, floor, ceiling,
+				shared_pte);
 	} while (p4d++, addr = next, addr != end);
 
 	start &= PGDIR_MASK;
@@ -284,9 +304,10 @@ static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
 /*
  * This function frees user-level page tables of a process.
  */
-void free_pgd_range(struct mmu_gather *tlb,
+static void _free_pgd_range(struct mmu_gather *tlb,
 			unsigned long addr, unsigned long end,
-			unsigned long floor, unsigned long ceiling)
+			unsigned long floor, unsigned long ceiling,
+			bool shared_pte)
 {
 	pgd_t *pgd;
 	unsigned long next;
@@ -342,10 +363,18 @@ void free_pgd_range(struct mmu_gather *tlb,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
+		free_p4d_range(tlb, pgd, addr, next, floor, ceiling,
+				shared_pte);
 	} while (pgd++, addr = next, addr != end);
 }
 
+void free_pgd_range(struct mmu_gather *tlb,
+			unsigned long addr, unsigned long end,
+			unsigned long floor, unsigned long ceiling)
+{
+	_free_pgd_range(tlb, addr, end, floor, ceiling, false);
+}
+
 void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
 		   struct vm_area_struct *vma, unsigned long floor,
 		   unsigned long ceiling)
@@ -375,16 +404,20 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
 		} else {
 			/*
 			 * Optimization: gather nearby vmas into one call down
+			 * but make sure vmas not sharing page tables do
+			 * not get combined with vmas sharing page tables
 			 */
 			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
-			       && !is_vm_hugetlb_page(next)) {
+			       && !is_vm_hugetlb_page(next)
+			       && (vma_is_shared(next) == vma_is_shared(vma))) {
 				vma = next;
 				next = mas_find(&mas, ceiling - 1);
 				unlink_anon_vmas(vma);
 				unlink_file_vma(vma);
 			}
-			free_pgd_range(tlb, addr, vma->vm_end,
-				floor, next ? next->vm_start : ceiling);
+			_free_pgd_range(tlb, addr, vma->vm_end,
+				floor, next ? next->vm_start : ceiling,
+				vma_is_shared(vma));
 		}
 		vma = next;
 	} while (vma);
@@ -5181,6 +5214,8 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 			   unsigned int flags, struct pt_regs *regs)
 {
 	vm_fault_t ret;
+	bool shared = false;
+	struct mm_struct *orig_mm;
 
 	__set_current_state(TASK_RUNNING);
 
@@ -5191,6 +5226,16 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 	if (ret)
 		return ret;
 
+	orig_mm = vma->vm_mm;
+	if (unlikely(vma_is_shared(vma))) {
+		ret = find_shared_vma(&vma, &address, flags);
+		if (ret)
+			return ret;
+		if (!vma)
+			return VM_FAULT_SIGSEGV;
+		shared = true;
+	}
+
 	if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
 					    flags & FAULT_FLAG_INSTRUCTION,
 					    flags & FAULT_FLAG_REMOTE))
@@ -5212,6 +5257,36 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 
 	lru_gen_exit_fault();
 
+	/*
+	 * Release the read lock on the shared VMA's host mm unless
+	 * __handle_mm_fault released the lock already.
+	 * __handle_mm_fault sets VM_FAULT_RETRY in its return value if
+	 * it released the mmap lock. If the lock was released, it would
+	 * have been released on the task's original mm if this were not
+	 * a shared PTE vma. To keep the lock state consistent, make
+	 * sure to release the lock on the task's original mm as well.
+	 */
+	if (shared) {
+		int release_mmlock = 1;
+
+		if (!(ret & VM_FAULT_RETRY)) {
+			mmap_read_unlock(vma->vm_mm);
+			release_mmlock = 0;
+		} else if ((flags & FAULT_FLAG_ALLOW_RETRY) &&
+			(flags & FAULT_FLAG_RETRY_NOWAIT)) {
+			mmap_read_unlock(vma->vm_mm);
+			release_mmlock = 0;
+		}
+
+		/*
+		 * Reset the guest vma's vm_mm pointer that was redirected
+		 * in find_shared_vma() to process this fault.
+		 */
+		vma->vm_mm = orig_mm;
+		if (release_mmlock)
+			mmap_read_unlock(orig_mm);
+	}
+
 	if (flags & FAULT_FLAG_USER) {
 		mem_cgroup_exit_user_fault();
 		/*
diff --git a/mm/ptshare.c b/mm/ptshare.c
index f6784268958c..e0991a877355 100644
--- a/mm/ptshare.c
+++ b/mm/ptshare.c
@@ -13,6 +13,136 @@
 #include <asm/pgalloc.h>
 #include "internal.h"
 
+/* Walk mm's page table down to the pmd level for addr, allocating
+ * intermediate levels as needed; returns NULL on failure. */
+static pmd_t
+*get_pmd(struct mm_struct *mm, unsigned long addr)
+{
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+
+	pgd = pgd_offset(mm, addr);
+	if (pgd_none(*pgd))
+		return NULL;
+
+	p4d = p4d_offset(pgd, addr);
+	if (p4d_none(*p4d)) {
+		p4d = p4d_alloc(mm, pgd, addr);
+		if (!p4d)
+			return NULL;
+	}
+
+	pud = pud_offset(p4d, addr);
+	if (pud_none(*pud)) {
+		pud = pud_alloc(mm, p4d, addr);
+		if (!pud)
+			return NULL;
+	}
+
+	pmd = pmd_offset(pud, addr);
+	if (pmd_none(*pmd)) {
+		pmd = pmd_alloc(mm, pud, addr);
+		if (!pmd)
+			return NULL;
+	}
+
+	return pmd;
+}
+
+/*
+ * Find the shared pmd entries in the host mm struct and install them
+ * into the guest page tables.
+ */
+static int
+ptshare_copy_pmd(struct mm_struct *host_mm, struct mm_struct *guest_mm,
+			struct vm_area_struct *vma, unsigned long addr)
+{
+	pgd_t *guest_pgd;
+	p4d_t *guest_p4d;
+	pud_t *guest_pud;
+	pmd_t *host_pmd;
+	spinlock_t *host_ptl, *guest_ptl;
+
+	guest_pgd = pgd_offset(guest_mm, addr);
+	guest_p4d = p4d_offset(guest_pgd, addr);
+	if (p4d_none(*guest_p4d)) {
+		guest_p4d = p4d_alloc(guest_mm, guest_pgd, addr);
+		if (!guest_p4d)
+			return 1;
+	}
+
+	guest_pud = pud_offset(guest_p4d, addr);
+	if (pud_none(*guest_pud)) {
+		host_pmd = get_pmd(host_mm, addr);
+		if (!host_pmd)
+			return 1;
+
+		get_page(virt_to_page(host_pmd));
+		host_ptl = pmd_lockptr(host_mm, host_pmd);
+		guest_ptl = pud_lockptr(guest_mm, guest_pud);
+		spin_lock(host_ptl);
+		spin_lock(guest_ptl);
+		pud_populate(guest_mm, guest_pud,
+			(pmd_t *)((unsigned long)host_pmd & PAGE_MASK));
+		put_page(virt_to_page(host_pmd));
+		spin_unlock(guest_ptl);
+		spin_unlock(host_ptl);
+	}
+
+	return 0;
+}
+
+/*
+ * Find the shared page tables in the host mm struct and install them in
+ * the guest mm struct.
+ */
+vm_fault_t
+find_shared_vma(struct vm_area_struct **vmap, unsigned long *addrp,
+			unsigned int flags)
+{
+	struct ptshare_data *info;
+	struct mm_struct *host_mm;
+	struct vm_area_struct *host_vma, *guest_vma = *vmap;
+	unsigned long host_addr;
+	pmd_t *guest_pmd, *host_pmd;
+
+	if ((!guest_vma->vm_file) || (!guest_vma->vm_file->f_mapping))
+		return 0;
+	info = guest_vma->vm_file->f_mapping->ptshare_data;
+	if (!info) {
+		pr_warn("VM_SHARED_PT vma with NULL ptshare_data\n");
+		dump_stack_print_info(KERN_WARNING);
+		return 0;
+	}
+	host_mm = info->mm;
+
+	mmap_read_lock(host_mm);
+	host_addr = *addrp - guest_vma->vm_start + host_mm->mmap_base;
+	host_pmd = get_pmd(host_mm, host_addr);
+	guest_pmd = get_pmd(guest_vma->vm_mm, *addrp);
+	if (!pmd_same(*guest_pmd, *host_pmd)) {
+		set_pmd(guest_pmd, *host_pmd);
+		mmap_read_unlock(host_mm);
+		return VM_FAULT_NOPAGE;
+	}
+
+	*addrp = host_addr;
+	host_vma = find_vma(host_mm, host_addr);
+	if (!host_vma)
+		return VM_FAULT_SIGSEGV;
+
+	/*
+	 * Point vm_mm for the faulting vma to the mm struct holding shared
+	 * page tables so the fault handling will happen in the right
+	 * shared context
+	 */
+	guest_vma->vm_mm = host_mm;
+
+	return 0;
+}
+
 /*
  * Create a new mm struct that will hold the shared PTEs. Pointer to
  * this new mm is stored in the data structure ptshare_data which also
@@ -38,6 +168,7 @@ ptshare_new_mm(struct file *file, struct vm_area_struct *vma)
 	new_mm->task_size = len;
 	if (!new_mm->task_size)
 		new_mm->task_size--;
+	new_mm->owner = NULL;
 
 	info = kzalloc(sizeof(*info), GFP_KERNEL);
 	if (!info) {
@@ -63,7 +194,7 @@ ptshare_new_mm(struct file *file, struct vm_area_struct *vma)
  * insert vma into mm holding shared page tables
  */
 int
-ptshare_insert_vma(struct mm_struct *mm, struct vm_area_struct *vma)
+ptshare_insert_vma(struct mm_struct *host_mm, struct vm_area_struct *vma)
 {
 	struct vm_area_struct *new_vma;
 	int err = 0;
@@ -80,12 +211,18 @@ ptshare_insert_vma(struct mm_struct *mm, struct vm_area_struct *vma)
 	 */
 	vm_flags_clear(new_vma, (VM_SHARED | VM_SHARED_PT));
 	vm_flags_set(new_vma, VM_NOHUGEPAGE);
-	new_vma->vm_mm = mm;
+	new_vma->vm_mm = host_mm;
 
-	err = insert_vm_struct(mm, new_vma);
+	err = insert_vm_struct(host_mm, new_vma);
 	if (err)
 		return -ENOMEM;
 
+	/*
+	 * Copy the PMD entries from host mm to guest so they use the
+	 * same PTEs
+	 */
+	err = ptshare_copy_pmd(host_mm, vma->vm_mm, vma, vma->vm_start);
+
 	return err;
 }
 
-- 
2.37.2



Thread overview: 26+ messages
2023-04-26 16:49 [PATCH RFC v2 0/4] Add support for sharing page tables across processes (Previously mshare) Khalid Aziz
2023-04-26 16:49 ` [PATCH RFC v2 1/4] mm/ptshare: Add vm flag for shared PTE Khalid Aziz
2023-04-26 16:49 ` [PATCH RFC v2 2/4] mm/ptshare: Add flag MAP_SHARED_PT to mmap() Khalid Aziz
2023-04-27 11:17   ` kernel test robot
2023-04-29  4:41   ` kernel test robot
2023-04-26 16:49 ` [PATCH RFC v2 3/4] mm/ptshare: Create new mm struct for page table sharing Khalid Aziz
2023-06-26  8:08   ` Karim Manaouil
2023-04-26 16:49 ` Khalid Aziz [this message]
2023-04-27  0:24   ` [PATCH RFC v2 4/4] mm/ptshare: Add page fault handling for page table shared regions kernel test robot
2023-04-29 14:07   ` kernel test robot
2023-04-26 21:27 ` [PATCH RFC v2 0/4] Add support for sharing page tables across processes (Previously mshare) Mike Kravetz
2023-04-27 16:40   ` Khalid Aziz
2023-06-12 16:25 ` Peter Xu
2023-06-30 11:29 ` Rongwei Wang
2023-07-31  4:35 ` Rongwei Wang
2023-07-31 12:25   ` Matthew Wilcox
2023-07-31 12:50     ` David Hildenbrand
2023-07-31 16:19       ` Rongwei Wang
2023-07-31 16:30         ` David Hildenbrand
2023-07-31 16:38           ` Matthew Wilcox
2023-07-31 16:48             ` David Hildenbrand
2023-07-31 16:54               ` Matthew Wilcox
2023-07-31 17:06                 ` David Hildenbrand
2023-08-01  6:53             ` Rongwei Wang
2023-08-01 19:28               ` Matthew Wilcox
2023-05-07 22:55 [PATCH RFC v2 4/4] mm/ptshare: Add page fault handling for page table shared regions kernel test robot
