[PATCH] mmu_notifiers: Notify on pte permission upgrades

* [PATCH] mmu_notifiers: Notify on pte permission upgrades
@ 2023-05-22  6:37 Alistair Popple
  2023-05-22  7:15 ` Qi Zheng
                   ` (4 more replies)
  0 siblings, 5 replies; 15+ messages in thread
From: Alistair Popple @ 2023-05-22  6:37 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, linux-kernel, robin.murphy, will, nicolinc,
	linux-arm-kernel, kvm, kvmarm, jgg, John Hubbard,
	Alistair Popple

Some architectures, specifically ARM and perhaps Sparc and IA64,
require TLB invalidates when upgrading pte permission from read-only
to read-write.

The current mmu_notifier implementation assumes that upgrades do not
need notifications. Typically though mmu_notifiers are used to
implement TLB invalidations for secondary MMUs that comply with the
main CPU architecture.

Therefore if the main CPU architecture requires an invalidation for
permission upgrade the secondary MMU will as well and an mmu_notifier
should be sent for the upgrade.

Currently CPU invalidations for permission upgrade occur in
ptep_set_access_flags(). Unfortunately MMU notifiers cannot be called
directly from this architecture specific code as the notifier
callbacks can sleep, and ptep_set_access_flags() is usually called
whilst holding the PTL spinlock. Therefore add the notifier calls
after the PTL is dropped and only if the PTE actually changed. This
will allow secondary MMUs to obtain an updated PTE with appropriate
permissions.

This problem was discovered during testing of an ARM SMMU
implementation that does not support broadcast TLB maintenance
(BTM). In this case the SMMU driver uses notifiers to issue TLB
invalidates. For read-only to read-write pte upgrades the SMMU
continually returned a read-only PTE to the device, even though the
CPU had a read-write PTE installed.

Sending a mmu notifier event to the SMMU driver fixes the problem by
flushing secondary TLB entries. A new notifier event type is added so
drivers may filter out these invalidations if not required. Note a
driver should never upgrade or install a PTE in response to this mmu
notifier event as it is not synchronised against other PTE operations.

Signed-off-by: Alistair Popple <apopple@nvidia.com>
---
 include/linux/mmu_notifier.h |  6 +++++
 mm/hugetlb.c                 | 24 ++++++++++++++++++-
 mm/memory.c                  | 45 ++++++++++++++++++++++++++++++++++--
 3 files changed, 72 insertions(+), 3 deletions(-)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index d6c06e140277..f14d68f119d8 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -31,6 +31,11 @@ struct mmu_interval_notifier;
  * pages in the range so to mirror those changes the user must inspect the CPU
  * page table (from the end callback).
  *
+ * @MMU_NOTIFY_PROTECTION_UPGRAGE: update is due to a change from read-only to
+ * read-write for pages in the range. This must not be used to upgrade
+ * permissions on secondary PTEs, rather it should only be used to invalidate
+ * caches such as secondary TLBs that may cache old read-only entries.
+ *
  * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same
  * access flags). User should soft dirty the page in the end callback to make
  * sure that anyone relying on soft dirtiness catch pages that might be written
@@ -53,6 +58,7 @@ enum mmu_notifier_event {
 	MMU_NOTIFY_CLEAR,
 	MMU_NOTIFY_PROTECTION_VMA,
 	MMU_NOTIFY_PROTECTION_PAGE,
+	MMU_NOTIFY_PROTECTION_UPGRADE,
 	MMU_NOTIFY_SOFT_DIRTY,
 	MMU_NOTIFY_RELEASE,
 	MMU_NOTIFY_MIGRATE,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bdbfeb6fb393..e5d467c7bff7 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5987,6 +5987,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	vm_fault_t ret;
 	u32 hash;
 	pgoff_t idx;
+	bool changed = false;
 	struct page *page = NULL;
 	struct page *pagecache_page = NULL;
 	struct hstate *h = hstate_vma(vma);
@@ -6122,6 +6123,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		if (!huge_pte_write(entry)) {
 			ret = hugetlb_wp(mm, vma, address, ptep, flags,
 					 pagecache_page, ptl);
+			if (!ret)
+				changed = true;
+
 			goto out_put_page;
 		} else if (likely(flags & FAULT_FLAG_WRITE)) {
 			entry = huge_pte_mkdirty(entry);
@@ -6129,8 +6133,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 	entry = pte_mkyoung(entry);
 	if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
-						flags & FAULT_FLAG_WRITE))
+						flags & FAULT_FLAG_WRITE)) {
 		update_mmu_cache(vma, haddr, ptep);
+		changed = true;
+	}
+
 out_put_page:
 	if (page != pagecache_page)
 		unlock_page(page);
@@ -6138,6 +6145,21 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 out_ptl:
 	spin_unlock(ptl);
 
+	if (changed) {
+		struct mmu_notifier_range range;
+		unsigned long hpage_mask = huge_page_mask(h);
+		unsigned long hpage_size = huge_page_size(h);
+
+		update_mmu_cache(vma, haddr, ptep);
+
+		mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_UPGRADE,
+					0, vma, mm, haddr & hpage_mask,
+					(haddr & hpage_mask) + hpage_size);
+		mmu_notifier_invalidate_range_start(&range);
+		mmu_notifier_invalidate_range_end(&range);
+	}
+
+
 	if (pagecache_page) {
 		unlock_page(pagecache_page);
 		put_page(pagecache_page);
diff --git a/mm/memory.c b/mm/memory.c
index f526b9152bef..0ac78c6a232c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2098,6 +2098,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *pte, entry;
 	spinlock_t *ptl;
+	bool changed = false;
 
 	pte = get_locked_pte(mm, addr, &ptl);
 	if (!pte)
@@ -2120,8 +2121,10 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 			}
 			entry = pte_mkyoung(*pte);
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-			if (ptep_set_access_flags(vma, addr, pte, entry, 1))
+			if (ptep_set_access_flags(vma, addr, pte, entry, 1)) {
 				update_mmu_cache(vma, addr, pte);
+				changed = true;
+			}
 		}
 		goto out_unlock;
 	}
@@ -2142,6 +2145,17 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 
 out_unlock:
 	pte_unmap_unlock(pte, ptl);
+
+	if (changed) {
+		struct mmu_notifier_range range;
+
+		mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_UPGRADE,
+					0, vma, mm, addr & PAGE_MASK,
+					(addr & PAGE_MASK) + PAGE_SIZE);
+		mmu_notifier_invalidate_range_start(&range);
+		mmu_notifier_invalidate_range_end(&range);
+	}
+
 	return VM_FAULT_NOPAGE;
 }
 
@@ -2820,6 +2834,7 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src,
 	struct vm_area_struct *vma = vmf->vma;
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long addr = vmf->address;
+	bool changed = false;
 
 	if (likely(src)) {
 		if (copy_mc_user_highpage(dst, src, addr, vma)) {
@@ -2858,8 +2873,10 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src,
 		}
 
 		entry = pte_mkyoung(vmf->orig_pte);
-		if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
+		if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0)) {
 			update_mmu_cache(vma, addr, vmf->pte);
+			changed = true;
+		}
 	}
 
 	/*
@@ -2897,6 +2914,16 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src,
 		}
 	}
 
+	if (changed) {
+		struct mmu_notifier_range range;
+
+		mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_UPGRADE,
+					0, vma, vma->vm_mm, addr & PAGE_MASK,
+					(addr & PAGE_MASK) + PAGE_SIZE);
+		mmu_notifier_invalidate_range_start(&range);
+		mmu_notifier_invalidate_range_end(&range);
+	}
+
 	ret = 0;
 
 pte_unlock:
@@ -4877,6 +4904,7 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
 static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 {
 	pte_t entry;
+	bool changed = false;
 
 	if (unlikely(pmd_none(*vmf->pmd))) {
 		/*
@@ -4957,6 +4985,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 	if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
 				vmf->flags & FAULT_FLAG_WRITE)) {
 		update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
+		changed = true;
 	} else {
 		/* Skip spurious TLB flush for retried page fault */
 		if (vmf->flags & FAULT_FLAG_TRIED)
@@ -4972,6 +5001,18 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 	}
 unlock:
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
+
+	if (changed) {
+		struct mmu_notifier_range range;
+
+		mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_UPGRADE,
+					0, vmf->vma, vmf->vma->vm_mm,
+					vmf->address & PAGE_MASK,
+					(vmf->address & PAGE_MASK) + PAGE_SIZE);
+		mmu_notifier_invalidate_range_start(&range);
+		mmu_notifier_invalidate_range_end(&range);
+	}
+
 	return 0;
 }
 
-- 
2.39.2



^ permalink raw reply related	[flat|nested] 15+ messages in thread