From: "Adalbert Lazăr" <alazar@bitdefender.com>
To: linux-mm@kvack.org
Cc: linux-api@vger.kernel.org,
	"Andrew Morton" <akpm@linux-foundation.org>,
	"Alexander Graf" <graf@amazon.com>,
	"Stefan Hajnoczi" <stefanha@redhat.com>,
	"Jerome Glisse" <jglisse@redhat.com>,
	"Paolo Bonzini" <pbonzini@redhat.com>,
	"Mihai Donțu" <mdontu@bitdefender.com>,
	"Mircea Cirjaliu" <mcirjaliu@bitdefender.com>,
	"Andy Lutomirski" <luto@kernel.org>,
	"Arnd Bergmann" <arnd@arndb.de>,
	"Sargun Dhillon" <sargun@sargun.me>,
	"Aleksa Sarai" <cyphar@cyphar.com>,
	"Oleg Nesterov" <oleg@redhat.com>, "Jann Horn" <jannh@google.com>,
	"Kees Cook" <keescook@chromium.org>,
	"Matthew Wilcox" <willy@infradead.org>,
	"Christian Brauner" <christian.brauner@ubuntu.com>,
	"Adalbert Lazăr" <alazar@bitdefender.com>
Subject: [RESEND RFC PATCH 2/5] mm: let the VMA decide how zap_pte_range() acts on mapped pages
Date: Fri,  4 Sep 2020 14:31:13 +0300
Message-ID: <20200904113116.20648-3-alazar@bitdefender.com>
In-Reply-To: <20200904113116.20648-1-alazar@bitdefender.com>

From: Mircea Cirjaliu <mcirjaliu@bitdefender.com>

Instead of having one big function handle every case of page unmapping,
allow multiple implementation-defined callbacks, one per VMA type. In the
future, exotic VMA implementations won't have to bloat the generic zapping
function with yet another special case of mappings.
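
As a rough sketch of how a special VMA might use the new hook (the names
example_vma_zap_pte and example_vm_ops below are made up for illustration
and are not part of this patch), a driver could install a callback along
these lines:

	#include <linux/mm.h>
	#include <asm/tlb.h>

	static int example_vma_zap_pte(struct vm_area_struct *vma,
				       unsigned long addr, pte_t *pte,
				       int rss[], struct mmu_gather *tlb,
				       struct zap_details *details)
	{
		pte_t ptent = *pte;

		/* Nothing mapped here; let zap_pte_range() move on. */
		if (pte_none(ptent))
			return ZAP_PTE_CONTINUE;

		/* Clear the entry and queue a TLB flush for this address. */
		ptep_get_and_clear_full(tlb->mm, addr, pte, tlb->fullmm);
		tlb_remove_tlb_entry(tlb, pte, addr);

		/* Driver-specific accounting (e.g. rss[] updates) goes here. */

		return ZAP_PTE_FLUSH;
	}

	static const struct vm_operations_struct example_vm_ops = {
		.zap_pte	= example_vma_zap_pte,
	};

zap_pte_range() ORs the returned flags into its own: ZAP_PTE_FLUSH requests
a TLB flush before the page-table lock is dropped, and ZAP_PTE_BREAK stops
the current PTE walk iteration.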

Signed-off-by: Mircea Cirjaliu <mcirjaliu@bitdefender.com>
Signed-off-by: Adalbert Lazăr <alazar@bitdefender.com>
---
 include/linux/mm.h |  16 ++++
 mm/memory.c        | 182 +++++++++++++++++++++++++--------------------
 2 files changed, 116 insertions(+), 82 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1be4482a7b81..39e55467aa49 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -36,6 +36,7 @@ struct file_ra_state;
 struct user_struct;
 struct writeback_control;
 struct bdi_writeback;
+struct zap_details;
 
 void init_mm_internals(void);
 
@@ -601,6 +602,14 @@ struct vm_operations_struct {
 	 */
 	struct page *(*find_special_page)(struct vm_area_struct *vma,
 					  unsigned long addr);
+
+	/*
+	 * Called by zap_pte_range() for use by special VMAs that implement
+	 * custom zapping behavior.
+	 */
+	int (*zap_pte)(struct vm_area_struct *vma, unsigned long addr,
+		       pte_t *pte, int rss[], struct mmu_gather *tlb,
+		       struct zap_details *details);
 };
 
 static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
@@ -1594,6 +1603,13 @@ static inline bool can_do_mlock(void) { return false; }
 extern int user_shm_lock(size_t, struct user_struct *);
 extern void user_shm_unlock(size_t, struct user_struct *);
 
+/*
+ * Flags returned by zap_pte implementations
+ */
+#define ZAP_PTE_CONTINUE	0
+#define ZAP_PTE_FLUSH		(1 << 0)	/* Ask for TLB flush. */
+#define ZAP_PTE_BREAK		(1 << 1)	/* Break PTE iteration. */
+
 /*
  * Parameter block passed down to zap_pte_range in exceptional cases.
  */
diff --git a/mm/memory.c b/mm/memory.c
index 8e78fb151f8f..a225bfd01417 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1031,18 +1031,109 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	return ret;
 }
 
+static int zap_pte_common(struct vm_area_struct *vma, unsigned long addr,
+			  pte_t *pte, int rss[], struct mmu_gather *tlb,
+			  struct zap_details *details)
+{
+	struct mm_struct *mm = tlb->mm;
+	pte_t ptent = *pte;
+	swp_entry_t entry;
+	int flags = 0;
+
+	if (pte_present(ptent)) {
+		struct page *page;
+
+		page = vm_normal_page(vma, addr, ptent);
+		if (unlikely(details) && page) {
+			/*
+			 * unmap_shared_mapping_pages() wants to
+			 * invalidate cache without truncating:
+			 * unmap shared but keep private pages.
+			 */
+			if (details->check_mapping &&
+			    details->check_mapping != page_rmapping(page))
+				return 0;
+		}
+		ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
+		tlb_remove_tlb_entry(tlb, pte, addr);
+		if (unlikely(!page))
+			return 0;
+
+		if (!PageAnon(page)) {
+			if (pte_dirty(ptent)) {
+				flags |= ZAP_PTE_FLUSH;
+				set_page_dirty(page);
+			}
+			if (pte_young(ptent) &&
+			    likely(!(vma->vm_flags & VM_SEQ_READ)))
+				mark_page_accessed(page);
+		}
+		rss[mm_counter(page)]--;
+		page_remove_rmap(page, false);
+		if (unlikely(page_mapcount(page) < 0))
+			print_bad_pte(vma, addr, ptent, page);
+		if (unlikely(__tlb_remove_page(tlb, page)))
+			flags |= ZAP_PTE_FLUSH | ZAP_PTE_BREAK;
+		return flags;
+	}
+
+	entry = pte_to_swp_entry(ptent);
+	if (non_swap_entry(entry) && is_device_private_entry(entry)) {
+		struct page *page = device_private_entry_to_page(entry);
+
+		if (unlikely(details && details->check_mapping)) {
+			/*
+			 * unmap_shared_mapping_pages() wants to
+			 * invalidate cache without truncating:
+			 * unmap shared but keep private pages.
+			 */
+			if (details->check_mapping != page_rmapping(page))
+				return 0;
+		}
+
+		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+		rss[mm_counter(page)]--;
+		page_remove_rmap(page, false);
+		put_page(page);
+		return 0;
+	}
+
+	/* If details->check_mapping, we leave swap entries. */
+	if (unlikely(details))
+		return 0;
+
+	if (!non_swap_entry(entry))
+		rss[MM_SWAPENTS]--;
+	else if (is_migration_entry(entry)) {
+		struct page *page;
+
+		page = migration_entry_to_page(entry);
+		rss[mm_counter(page)]--;
+	}
+	if (unlikely(!free_swap_and_cache(entry)))
+		print_bad_pte(vma, addr, ptent, NULL);
+	pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+
+	return flags;
+}
+
 static unsigned long zap_pte_range(struct mmu_gather *tlb,
 				struct vm_area_struct *vma, pmd_t *pmd,
 				unsigned long addr, unsigned long end,
 				struct zap_details *details)
 {
 	struct mm_struct *mm = tlb->mm;
-	int force_flush = 0;
+	int flags = 0;
 	int rss[NR_MM_COUNTERS];
 	spinlock_t *ptl;
 	pte_t *start_pte;
 	pte_t *pte;
-	swp_entry_t entry;
+
+	int (*zap_pte)(struct vm_area_struct *vma, unsigned long addr,
+		       pte_t *pte, int rss[], struct mmu_gather *tlb,
+		       struct zap_details *details) = zap_pte_common;
+	if (vma->vm_ops && vma->vm_ops->zap_pte)
+		zap_pte = vma->vm_ops->zap_pte;
 
 	tlb_change_page_size(tlb, PAGE_SIZE);
 again:
@@ -1058,92 +1149,19 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 
 		if (!zap_is_atomic(details) && need_resched())
 			break;
-
-		if (pte_present(ptent)) {
-			struct page *page;
-
-			page = vm_normal_page(vma, addr, ptent);
-			if (unlikely(details) && page) {
-				/*
-				 * unmap_shared_mapping_pages() wants to
-				 * invalidate cache without truncating:
-				 * unmap shared but keep private pages.
-				 */
-				if (details->check_mapping &&
-				    details->check_mapping != page_rmapping(page))
-					continue;
-			}
-			ptent = ptep_get_and_clear_full(mm, addr, pte,
-							tlb->fullmm);
-			tlb_remove_tlb_entry(tlb, pte, addr);
-			if (unlikely(!page))
-				continue;
-
-			if (!PageAnon(page)) {
-				if (pte_dirty(ptent)) {
-					force_flush = 1;
-					set_page_dirty(page);
-				}
-				if (pte_young(ptent) &&
-				    likely(!(vma->vm_flags & VM_SEQ_READ)))
-					mark_page_accessed(page);
-			}
-			rss[mm_counter(page)]--;
-			page_remove_rmap(page, false);
-			if (unlikely(page_mapcount(page) < 0))
-				print_bad_pte(vma, addr, ptent, page);
-			if (unlikely(__tlb_remove_page(tlb, page))) {
-				force_flush = 1;
-				addr += PAGE_SIZE;
-				break;
-			}
-			continue;
-		}
-
-		entry = pte_to_swp_entry(ptent);
-		if (non_swap_entry(entry) && is_device_private_entry(entry)) {
-			struct page *page = device_private_entry_to_page(entry);
-
-			if (unlikely(details && details->check_mapping)) {
-				/*
-				 * unmap_shared_mapping_pages() wants to
-				 * invalidate cache without truncating:
-				 * unmap shared but keep private pages.
-				 */
-				if (details->check_mapping !=
-				    page_rmapping(page))
-					continue;
-			}
-
-			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
-			rss[mm_counter(page)]--;
-			page_remove_rmap(page, false);
-			put_page(page);
-			continue;
+		if (flags & ZAP_PTE_BREAK) {
+			flags &= ~ZAP_PTE_BREAK;
+			break;
 		}
 
-		/* If details->check_mapping, we leave swap entries. */
-		if (unlikely(details))
-			continue;
-
-		if (!non_swap_entry(entry))
-			rss[MM_SWAPENTS]--;
-		else if (is_migration_entry(entry)) {
-			struct page *page;
-
-			page = migration_entry_to_page(entry);
-			rss[mm_counter(page)]--;
-		}
-		if (unlikely(!free_swap_and_cache(entry)))
-			print_bad_pte(vma, addr, ptent, NULL);
-		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+		flags |= zap_pte(vma, addr, pte, rss, tlb, details);
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 
 	add_mm_rss_vec(mm, rss);
 	arch_leave_lazy_mmu_mode();
 
 	/* Do the actual TLB flush before dropping ptl */
-	if (force_flush)
+	if (flags & ZAP_PTE_FLUSH)
 		tlb_flush_mmu_tlbonly(tlb);
 	pte_unmap_unlock(start_pte, ptl);
 
@@ -1153,8 +1171,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 	 * entries before releasing the ptl), free the batched
 	 * memory too. Restart if we didn't do everything.
 	 */
-	if (force_flush) {
-		force_flush = 0;
+	if (flags & ZAP_PTE_FLUSH) {
+		flags &= ~ZAP_PTE_FLUSH;
 		tlb_flush_mmu(tlb);
 	}
 


Thread overview: 44+ messages
2020-09-04 11:31 [RESEND RFC PATCH 0/5] Remote mapping Adalbert Lazăr
2020-09-04 11:31 ` [RESEND RFC PATCH 1/5] mm: add atomic capability to zap_details Adalbert Lazăr
2020-09-04 11:31 ` Adalbert Lazăr [this message]
2020-09-04 11:31 ` [RESEND RFC PATCH 3/5] mm/mmu_notifier: remove lockdep map, allow mmu notifier to be used in nested scenarios Adalbert Lazăr
2020-09-04 12:03   ` Jason Gunthorpe
2020-09-04 11:31 ` [RESEND RFC PATCH 4/5] mm/remote_mapping: use a pidfd to access memory belonging to unrelated process Adalbert Lazăr
2020-09-04 17:55   ` Oleg Nesterov
2020-09-07 14:30   ` Oleg Nesterov
2020-09-07 15:16     ` Adalbert Lazăr
2020-09-09  8:32     ` Mircea CIRJALIU - MELIU
2020-09-10 16:43       ` Oleg Nesterov
2020-09-07 15:02   ` Christian Brauner
2020-09-07 16:04     ` Mircea CIRJALIU - MELIU
2020-09-04 11:31 ` [RESEND RFC PATCH 5/5] pidfd_mem: implemented remote memory mapping system call Adalbert Lazăr
2020-09-04 19:18   ` Florian Weimer
2020-09-07 14:55   ` Christian Brauner
2020-09-04 12:11 ` [RESEND RFC PATCH 0/5] Remote mapping Jason Gunthorpe
2020-09-04 13:24   ` Mircea CIRJALIU - MELIU
2020-09-04 13:39     ` Jason Gunthorpe
2020-09-04 14:18       ` Mircea CIRJALIU - MELIU
2020-09-04 14:39         ` Jason Gunthorpe
2020-09-04 15:40           ` Mircea CIRJALIU - MELIU
2020-09-04 16:11             ` Jason Gunthorpe
2020-09-04 19:41   ` Matthew Wilcox
2020-09-04 19:49     ` Jason Gunthorpe
2020-09-04 20:08     ` Paolo Bonzini
2020-12-01 18:01     ` Jason Gunthorpe
2020-09-04 19:19 ` Florian Weimer
2020-09-04 20:18   ` Paolo Bonzini
2020-09-07  8:33     ` Christian Brauner
2020-09-04 19:39 ` Andy Lutomirski
2020-09-04 20:09   ` Paolo Bonzini
2020-09-04 20:34     ` Andy Lutomirski
2020-09-04 21:58       ` Paolo Bonzini
2020-09-04 23:17         ` Andy Lutomirski
2020-09-05 18:27           ` Paolo Bonzini
2020-09-07  8:38             ` Christian Brauner
2020-09-07 12:41           ` Mircea CIRJALIU - MELIU
2020-09-07  7:05         ` Christoph Hellwig
2020-09-07  8:44           ` Paolo Bonzini
2020-09-07 10:25   ` Mircea CIRJALIU - MELIU
2020-09-07 15:05 ` Christian Brauner
2020-09-07 20:43   ` Andy Lutomirski
2020-09-09 11:38     ` Stefan Hajnoczi
