* [PATCH 0/8] Some cleanups for memory-failure
@ 2024-02-29 21:20 Matthew Wilcox (Oracle)
  2024-02-29 21:20 ` [PATCH 1/8] mm/memory-failure: Remove fsdax_pgoff argument from __add_to_kill Matthew Wilcox (Oracle)
                   ` (8 more replies)
  0 siblings, 9 replies; 42+ messages in thread
From: Matthew Wilcox (Oracle) @ 2024-02-29 21:20 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Matthew Wilcox (Oracle), linux-mm, Miaohe Lin, Naoya Horiguchi

This isn't my complete collection of patches to memory-failure, but
they're the ones I have the most confidence in.  I haven't done any real
testing of this beyond compiling & running on a system without trying to
do any memory failure injections.  Is there a good test-suite I should
run before submitting these kinds of patches?

Matthew Wilcox (Oracle) (8):
  mm/memory-failure: Remove fsdax_pgoff argument from __add_to_kill
  mm/memory-failure: Pass addr to __add_to_kill()
  mm: Return the address from page_mapped_in_vma()
  mm/memory-failure: Convert shake_page() to shake_folio()
  mm: Convert hugetlb_page_mapping_lock_write to folio
  mm/memory-failure: Convert memory_failure() to use a folio
  mm/memory-failure: Convert hwpoison_user_mappings to take a folio
  mm/memory-failure: Add some folio conversions to unpoison_memory

 include/linux/hugetlb.h |   6 +-
 include/linux/mm.h      |   1 -
 include/linux/rmap.h    |   2 +-
 mm/hugetlb.c            |   6 +-
 mm/hwpoison-inject.c    |  11 +--
 mm/internal.h           |   1 +
 mm/memory-failure.c     | 144 +++++++++++++++++++++-------------------
 mm/migrate.c            |   2 +-
 mm/page_vma_mapped.c    |  14 ++--
 9 files changed, 97 insertions(+), 90 deletions(-)

-- 
2.43.0
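
For context on the "memory failure injections" mentioned above: the simplest
way to exercise memory_failure() from userspace is madvise(MADV_HWPOISON),
which is one of the injection paths hwpoison test-suites typically drive.
A minimal sketch of such an injection, assuming CAP_SYS_ADMIN, a kernel with
CONFIG_MEMORY_FAILURE, and a throwaway VM to run it in:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_HWPOISON
#define MADV_HWPOISON 100	/* asm-generic value; older libc headers may lack it */
#endif

int main(void)
{
	long pagesize = sysconf(_SC_PAGESIZE);
	char *buf;

	/* Back the mapping with a real, dirty page before poisoning it. */
	buf = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return EXIT_FAILURE;
	}
	memset(buf, 0xaa, pagesize);

	/* Hand the page to memory_failure() as if the hardware reported it bad. */
	if (madvise(buf, pagesize, MADV_HWPOISON)) {
		perror("madvise(MADV_HWPOISON)");
		return EXIT_FAILURE;
	}

	/*
	 * With the default late-kill policy the injecting process survives;
	 * touching buf again would now raise SIGBUS.
	 */
	printf("poisoned one page at %p\n", (void *)buf);
	return EXIT_SUCCESS;
}

MADV_SOFT_OFFLINE takes the gentler soft-offline path instead (the page is
migrated rather than unmapped and its users killed).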




* [PATCH 1/8] mm/memory-failure: Remove fsdax_pgoff argument from __add_to_kill
  2024-02-29 21:20 [PATCH 0/8] Some cleanups for memory-failure Matthew Wilcox (Oracle)
@ 2024-02-29 21:20 ` Matthew Wilcox (Oracle)
  2024-03-04 12:09   ` Miaohe Lin
                     ` (2 more replies)
  2024-02-29 21:20 ` [PATCH 2/8] mm/memory-failure: Pass addr to __add_to_kill() Matthew Wilcox (Oracle)
                   ` (7 subsequent siblings)
  8 siblings, 3 replies; 42+ messages in thread
From: Matthew Wilcox (Oracle) @ 2024-02-29 21:20 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Matthew Wilcox (Oracle),
	linux-mm, Miaohe Lin, Naoya Horiguchi, Dan Williams

Unify the KSM and DAX codepaths by calculating the addr in
add_to_kill_fsdax() instead of telling __add_to_kill() to calculate it.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Dan Williams <dan.j.williams@intel.com>
---
 mm/memory-failure.c | 27 +++++++++------------------
 1 file changed, 9 insertions(+), 18 deletions(-)

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 9349948f1abf..9356227a50bb 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -416,21 +416,13 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
  * not much we can do.	We just print a message and ignore otherwise.
  */
 
-#define FSDAX_INVALID_PGOFF ULONG_MAX
-
 /*
  * Schedule a process for later kill.
  * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
- *
- * Note: @fsdax_pgoff is used only when @p is a fsdax page and a
- * filesystem with a memory failure handler has claimed the
- * memory_failure event. In all other cases, page->index and
- * page->mapping are sufficient for mapping the page back to its
- * corresponding user virtual address.
  */
 static void __add_to_kill(struct task_struct *tsk, struct page *p,
 			  struct vm_area_struct *vma, struct list_head *to_kill,
-			  unsigned long ksm_addr, pgoff_t fsdax_pgoff)
+			  unsigned long addr)
 {
 	struct to_kill *tk;
 
@@ -440,12 +432,10 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p,
 		return;
 	}
 
-	tk->addr = ksm_addr ? ksm_addr : page_address_in_vma(p, vma);
-	if (is_zone_device_page(p)) {
-		if (fsdax_pgoff != FSDAX_INVALID_PGOFF)
-			tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
+	tk->addr = addr ? addr : page_address_in_vma(p, vma);
+	if (is_zone_device_page(p))
 		tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
-	} else
+	else
 		tk->size_shift = page_shift(compound_head(p));
 
 	/*
@@ -475,7 +465,7 @@ static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p,
 				  struct vm_area_struct *vma,
 				  struct list_head *to_kill)
 {
-	__add_to_kill(tsk, p, vma, to_kill, 0, FSDAX_INVALID_PGOFF);
+	__add_to_kill(tsk, p, vma, to_kill, 0);
 }
 
 #ifdef CONFIG_KSM
@@ -493,10 +483,10 @@ static bool task_in_to_kill_list(struct list_head *to_kill,
 }
 void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
 		     struct vm_area_struct *vma, struct list_head *to_kill,
-		     unsigned long ksm_addr)
+		     unsigned long addr)
 {
 	if (!task_in_to_kill_list(to_kill, tsk))
-		__add_to_kill(tsk, p, vma, to_kill, ksm_addr, FSDAX_INVALID_PGOFF);
+		__add_to_kill(tsk, p, vma, to_kill, addr);
 }
 #endif
 /*
@@ -670,7 +660,8 @@ static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p,
 			      struct vm_area_struct *vma,
 			      struct list_head *to_kill, pgoff_t pgoff)
 {
-	__add_to_kill(tsk, p, vma, to_kill, 0, pgoff);
+	unsigned long addr = vma_pgoff_address(pgoff, 1, vma);
+	__add_to_kill(tsk, p, vma, to_kill, addr);
 }
 
 /*
-- 
2.43.0




* [PATCH 2/8] mm/memory-failure: Pass addr to __add_to_kill()
  2024-02-29 21:20 [PATCH 0/8] Some cleanups for memory-failure Matthew Wilcox (Oracle)
  2024-02-29 21:20 ` [PATCH 1/8] mm/memory-failure: Remove fsdax_pgoff argument from __add_to_kill Matthew Wilcox (Oracle)
@ 2024-02-29 21:20 ` Matthew Wilcox (Oracle)
  2024-03-04 12:10   ` Miaohe Lin
  2024-02-29 21:20 ` [PATCH 3/8] mm: Return the address from page_mapped_in_vma() Matthew Wilcox (Oracle)
                   ` (6 subsequent siblings)
  8 siblings, 1 reply; 42+ messages in thread
From: Matthew Wilcox (Oracle) @ 2024-02-29 21:20 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Matthew Wilcox (Oracle),
	linux-mm, Miaohe Lin, Naoya Horiguchi, Longlong Xia

Handle anon/file folios the same way as KSM & DAX folios by passing in
the address.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Longlong Xia <xialonglong1@huawei.com>
---
 mm/memory-failure.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 9356227a50bb..7f8473c08ae3 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -432,7 +432,7 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p,
 		return;
 	}
 
-	tk->addr = addr ? addr : page_address_in_vma(p, vma);
+	tk->addr = addr;
 	if (is_zone_device_page(p))
 		tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
 	else
@@ -465,7 +465,8 @@ static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p,
 				  struct vm_area_struct *vma,
 				  struct list_head *to_kill)
 {
-	__add_to_kill(tsk, p, vma, to_kill, 0);
+	unsigned long addr = page_address_in_vma(p, vma);
+	__add_to_kill(tsk, p, vma, to_kill, addr);
 }
 
 #ifdef CONFIG_KSM
@@ -481,6 +482,7 @@ static bool task_in_to_kill_list(struct list_head *to_kill,
 
 	return false;
 }
+
 void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
 		     struct vm_area_struct *vma, struct list_head *to_kill,
 		     unsigned long addr)
-- 
2.43.0




* [PATCH 3/8] mm: Return the address from page_mapped_in_vma()
  2024-02-29 21:20 [PATCH 0/8] Some cleanups for memory-failure Matthew Wilcox (Oracle)
  2024-02-29 21:20 ` [PATCH 1/8] mm/memory-failure: Remove fsdax_pgoff argument from __add_to_kill Matthew Wilcox (Oracle)
  2024-02-29 21:20 ` [PATCH 2/8] mm/memory-failure: Pass addr to __add_to_kill() Matthew Wilcox (Oracle)
@ 2024-02-29 21:20 ` Matthew Wilcox (Oracle)
  2024-03-04 12:31   ` Miaohe Lin
  2024-03-06  8:17   ` Miaohe Lin
  2024-02-29 21:20 ` [PATCH 4/8] mm/memory-failure: Convert shake_page() to shake_folio() Matthew Wilcox (Oracle)
                   ` (5 subsequent siblings)
  8 siblings, 2 replies; 42+ messages in thread
From: Matthew Wilcox (Oracle) @ 2024-02-29 21:20 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Matthew Wilcox (Oracle), linux-mm, Miaohe Lin, Naoya Horiguchi

The only user of this function calls page_address_in_vma() immediately
after page_mapped_in_vma() calculates it and uses it to return true/false.
Return the address instead, allowing memory-failure to skip the call
to page_address_in_vma().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 include/linux/rmap.h |  2 +-
 mm/memory-failure.c  | 22 ++++++++++++++--------
 mm/page_vma_mapped.c | 14 +++++++-------
 3 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index b7944a833668..ba027a4d9abf 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -698,7 +698,7 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
 
 void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked);
 
-int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
+unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
 
 /*
  * rmap_walk_control: To control rmap traversing for specific needs
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 7f8473c08ae3..40a8964954e5 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -462,10 +462,11 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p,
 }
 
 static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p,
-				  struct vm_area_struct *vma,
-				  struct list_head *to_kill)
+		struct vm_area_struct *vma, struct list_head *to_kill,
+		unsigned long addr)
 {
-	unsigned long addr = page_address_in_vma(p, vma);
+	if (addr == -EFAULT)
+		return;
 	__add_to_kill(tsk, p, vma, to_kill, addr);
 }
 
@@ -609,12 +610,13 @@ static void collect_procs_anon(struct folio *folio, struct page *page,
 			continue;
 		anon_vma_interval_tree_foreach(vmac, &av->rb_root,
 					       pgoff, pgoff) {
+			unsigned long addr;
+
 			vma = vmac->vma;
 			if (vma->vm_mm != t->mm)
 				continue;
-			if (!page_mapped_in_vma(page, vma))
-				continue;
-			add_to_kill_anon_file(t, page, vma, to_kill);
+			addr = page_mapped_in_vma(page, vma);
+			add_to_kill_anon_file(t, page, vma, to_kill, addr);
 		}
 	}
 	rcu_read_unlock();
@@ -642,6 +644,8 @@ static void collect_procs_file(struct folio *folio, struct page *page,
 			continue;
 		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
 				      pgoff) {
+			unsigned long addr;
+
 			/*
 			 * Send early kill signal to tasks where a vma covers
 			 * the page but the corrupted page is not necessarily
@@ -649,8 +653,10 @@ static void collect_procs_file(struct folio *folio, struct page *page,
 			 * Assume applications who requested early kill want
 			 * to be informed of all such data corruptions.
 			 */
-			if (vma->vm_mm == t->mm)
-				add_to_kill_anon_file(t, page, vma, to_kill);
+			if (vma->vm_mm != t->mm)
+				continue;
+			addr = page_address_in_vma(page, vma);
+			add_to_kill_anon_file(t, page, vma, to_kill, addr);
 		}
 	}
 	rcu_read_unlock();
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 74d2de15fb5e..e9e208b4ac4b 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -319,11 +319,11 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
  * @page: the page to test
  * @vma: the VMA to test
  *
- * Returns 1 if the page is mapped into the page tables of the VMA, 0
- * if the page is not mapped into the page tables of this VMA.  Only
- * valid for normal file or anonymous VMAs.
+ * Return: If the page is mapped into the page tables of the VMA, the
+ * address that the page is mapped at.  -EFAULT if the page is not mapped.
+ * Only valid for normal file or anonymous VMAs.
  */
-int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
+unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
 {
 	struct page_vma_mapped_walk pvmw = {
 		.pfn = page_to_pfn(page),
@@ -334,9 +334,9 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
 
 	pvmw.address = vma_address(page, vma);
 	if (pvmw.address == -EFAULT)
-		return 0;
+		return -EFAULT;
 	if (!page_vma_mapped_walk(&pvmw))
-		return 0;
+		return -EFAULT;
 	page_vma_mapped_walk_done(&pvmw);
-	return 1;
+	return pvmw.address;
 }
-- 
2.43.0
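
A short aside on the return convention above: -EFAULT folded into an unsigned
long is 0xfffffffffffffff2 on 64-bit, right at the top of the address space,
so it can never collide with a real userspace mapping address, and the single
return value can carry either the address or the "not mapped" sentinel.  A
stand-alone toy of the same pattern, with made-up names:

#include <errno.h>
#include <stdio.h>

/*
 * Toy stand-in for the new page_mapped_in_vma() convention: return the
 * address the page is mapped at, or -EFAULT folded into the unsigned
 * return value when it is not mapped.  Names and values are made up.
 */
static unsigned long toy_mapped_address(int mapped)
{
	if (!mapped)
		return -EFAULT;
	return 0x70001000UL;	/* pretend userspace address */
}

int main(void)
{
	unsigned long addr = toy_mapped_address(0);

	if (addr == (unsigned long)-EFAULT)
		printf("not mapped, sentinel is %#lx\n", addr);

	addr = toy_mapped_address(1);
	if (addr != (unsigned long)-EFAULT)
		printf("mapped at %#lx\n", addr);
	return 0;
}

add_to_kill_anon_file() in the hunk above relies on exactly this, comparing
the unsigned long against -EFAULT directly.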




* [PATCH 4/8] mm/memory-failure: Convert shake_page() to shake_folio()
  2024-02-29 21:20 [PATCH 0/8] Some cleanups for memory-failure Matthew Wilcox (Oracle)
                   ` (2 preceding siblings ...)
  2024-02-29 21:20 ` [PATCH 3/8] mm: Return the address from page_mapped_in_vma() Matthew Wilcox (Oracle)
@ 2024-02-29 21:20 ` Matthew Wilcox (Oracle)
  2024-03-06  9:31   ` Miaohe Lin
  2024-02-29 21:20 ` [PATCH 5/8] mm: Convert hugetlb_page_mapping_lock_write to folio Matthew Wilcox (Oracle)
                   ` (4 subsequent siblings)
  8 siblings, 1 reply; 42+ messages in thread
From: Matthew Wilcox (Oracle) @ 2024-02-29 21:20 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Matthew Wilcox (Oracle), linux-mm, Miaohe Lin, Naoya Horiguchi

Removes two calls to compound_head().  Move the prototype to
internal.h; we definitely don't want code outside mm using it.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 include/linux/mm.h   |  1 -
 mm/hwpoison-inject.c | 11 ++++++-----
 mm/internal.h        |  1 +
 mm/memory-failure.c  | 15 ++++++++++-----
 4 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 03e4841673d7..184f57cdab60 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3962,7 +3962,6 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
 extern int memory_failure(unsigned long pfn, int flags);
 extern void memory_failure_queue_kick(int cpu);
 extern int unpoison_memory(unsigned long pfn);
-extern void shake_page(struct page *p);
 extern atomic_long_t num_poisoned_pages __read_mostly;
 extern int soft_offline_page(unsigned long pfn, int flags);
 #ifdef CONFIG_MEMORY_FAILURE
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index d0548e382b6b..c9d653f51e45 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -15,7 +15,7 @@ static int hwpoison_inject(void *data, u64 val)
 {
 	unsigned long pfn = val;
 	struct page *p;
-	struct page *hpage;
+	struct folio *folio;
 	int err;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -25,16 +25,17 @@ static int hwpoison_inject(void *data, u64 val)
 		return -ENXIO;
 
 	p = pfn_to_page(pfn);
-	hpage = compound_head(p);
+	folio = page_folio(p);
 
 	if (!hwpoison_filter_enable)
 		goto inject;
 
-	shake_page(hpage);
+	shake_folio(folio);
 	/*
 	 * This implies unable to support non-LRU pages except free page.
 	 */
-	if (!PageLRU(hpage) && !PageHuge(p) && !is_free_buddy_page(p))
+	if (!folio_test_lru(folio) && !folio_test_hugetlb(folio) &&
+	    !is_free_buddy_page(p))
 		return 0;
 
 	/*
@@ -42,7 +43,7 @@ static int hwpoison_inject(void *data, u64 val)
 	 * the targeted owner (or on a free page).
 	 * memory_failure() will redo the check reliably inside page lock.
 	 */
-	err = hwpoison_filter(hpage);
+	err = hwpoison_filter(&folio->page);
 	if (err)
 		return 0;
 
diff --git a/mm/internal.h b/mm/internal.h
index 13f4d6ccccdf..b900730f43c0 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -855,6 +855,7 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask)
 /*
  * mm/memory-failure.c
  */
+void shake_folio(struct folio *folio);
 extern int hwpoison_filter(struct page *p);
 
 extern u32 hwpoison_filter_dev_major;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 40a8964954e5..27dc21063552 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -358,20 +358,25 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
  * Unknown page type encountered. Try to check whether it can turn PageLRU by
  * lru_add_drain_all.
  */
-void shake_page(struct page *p)
+void shake_folio(struct folio *folio)
 {
-	if (PageHuge(p))
+	if (folio_test_hugetlb(folio))
 		return;
 	/*
 	 * TODO: Could shrink slab caches here if a lightweight range-based
 	 * shrinker will be available.
 	 */
-	if (PageSlab(p))
+	if (folio_test_slab(folio))
 		return;
 
 	lru_add_drain_all();
 }
-EXPORT_SYMBOL_GPL(shake_page);
+EXPORT_SYMBOL_GPL(shake_folio);
+
+static void shake_page(struct page *page)
+{
+	shake_folio(page_folio(page));
+}
 
 static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
 		unsigned long address)
@@ -1639,7 +1644,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	 * shake_page() again to ensure that it's flushed.
 	 */
 	if (mlocked)
-		shake_page(hpage);
+		shake_folio(folio);
 
 	/*
 	 * Now that the dirty bit has been propagated to the
-- 
2.43.0
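
A side note for anyone poking the injector this patch converts: with
CONFIG_HWPOISON_INJECT, mm/hwpoison-inject.c exposes
/sys/kernel/debug/hwpoison/corrupt-pfn, which takes a PFN and feeds it to
memory_failure().  A rough sketch of injecting into one of your own pages,
assuming root and a mounted debugfs; the PFN is read from /proc/self/pagemap:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long pagesize = sysconf(_SC_PAGESIZE);
	uint64_t entry, pfn;
	char line[32];
	char *buf;
	int fd;

	buf = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(buf, 0xaa, pagesize);	/* fault the page in */

	/* pagemap: one 64-bit entry per page; bit 63 = present, bits 0-54 = PFN. */
	fd = open("/proc/self/pagemap", O_RDONLY);
	if (fd < 0 || pread(fd, &entry, sizeof(entry),
			    (uintptr_t)buf / pagesize * sizeof(entry)) != sizeof(entry)) {
		perror("pagemap");
		return 1;
	}
	close(fd);
	pfn = entry & ((1ULL << 55) - 1);
	if (!(entry & (1ULL << 63)) || !pfn) {
		fprintf(stderr, "page not present, or PFN hidden (need CAP_SYS_ADMIN)\n");
		return 1;
	}

	/* Feed the PFN to the hwpoison injector. */
	fd = open("/sys/kernel/debug/hwpoison/corrupt-pfn", O_WRONLY);
	if (fd < 0) {
		perror("corrupt-pfn");
		return 1;
	}
	snprintf(line, sizeof(line), "%llu\n", (unsigned long long)pfn);
	if (write(fd, line, strlen(line)) < 0)
		perror("inject");
	else
		printf("injected hwpoison at pfn %#llx\n", (unsigned long long)pfn);
	close(fd);
	return 0;
}

Once the PFN is known, echoing it into corrupt-pfn from a shell does the same
thing.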




* [PATCH 5/8] mm: Convert hugetlb_page_mapping_lock_write to folio
  2024-02-29 21:20 [PATCH 0/8] Some cleanups for memory-failure Matthew Wilcox (Oracle)
                   ` (3 preceding siblings ...)
  2024-02-29 21:20 ` [PATCH 4/8] mm/memory-failure: Convert shake_page() to shake_folio() Matthew Wilcox (Oracle)
@ 2024-02-29 21:20 ` Matthew Wilcox (Oracle)
  2024-03-08  8:33   ` Miaohe Lin
  2024-02-29 21:20 ` [PATCH 6/8] mm/memory-failure: Convert memory_failure() to use a folio Matthew Wilcox (Oracle)
                   ` (3 subsequent siblings)
  8 siblings, 1 reply; 42+ messages in thread
From: Matthew Wilcox (Oracle) @ 2024-02-29 21:20 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Matthew Wilcox (Oracle), linux-mm, Miaohe Lin, Naoya Horiguchi

The page is only used to get the mapping, so the folio will do just
as well.  Both callers already have a folio available, so this saves
a call to compound_head().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 include/linux/hugetlb.h | 6 +++---
 mm/hugetlb.c            | 6 +++---
 mm/memory-failure.c     | 2 +-
 mm/migrate.c            | 2 +-
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 77b30a8c6076..acb1096ecdaa 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -175,7 +175,7 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx);
 pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 		      unsigned long addr, pud_t *pud);
 
-struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage);
+struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio);
 
 extern int sysctl_hugetlb_shm_group;
 extern struct list_head huge_boot_pages[MAX_NUMNODES];
@@ -298,8 +298,8 @@ static inline unsigned long hugetlb_total_pages(void)
 	return 0;
 }
 
-static inline struct address_space *hugetlb_page_mapping_lock_write(
-							struct page *hpage)
+static inline struct address_space *hugetlb_folio_mapping_lock_write(
+							struct folio *folio)
 {
 	return NULL;
 }
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bb17e5c22759..0e464a8f1aa9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2178,13 +2178,13 @@ EXPORT_SYMBOL_GPL(PageHuge);
 /*
  * Find and lock address space (mapping) in write mode.
  *
- * Upon entry, the page is locked which means that page_mapping() is
+ * Upon entry, the folio is locked which means that folio_mapping() is
  * stable.  Due to locking order, we can only trylock_write.  If we can
  * not get the lock, simply return NULL to caller.
  */
-struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
+struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio)
 {
-	struct address_space *mapping = page_mapping(hpage);
+	struct address_space *mapping = folio_mapping(folio);
 
 	if (!mapping)
 		return mapping;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 27dc21063552..fe4959e994d0 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1624,7 +1624,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
 		 * TTU_RMAP_LOCKED to indicate we have taken the lock
 		 * at this higher level.
 		 */
-		mapping = hugetlb_page_mapping_lock_write(hpage);
+		mapping = hugetlb_folio_mapping_lock_write(folio);
 		if (mapping) {
 			try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
 			i_mmap_unlock_write(mapping);
diff --git a/mm/migrate.c b/mm/migrate.c
index 73a052a382f1..0aef867d600b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1425,7 +1425,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio,
 			 * semaphore in write mode here and set TTU_RMAP_LOCKED
 			 * to let lower levels know we have taken the lock.
 			 */
-			mapping = hugetlb_page_mapping_lock_write(&src->page);
+			mapping = hugetlb_folio_mapping_lock_write(src);
 			if (unlikely(!mapping))
 				goto unlock_put_anon;
 
-- 
2.43.0




* [PATCH 6/8] mm/memory-failure: Convert memory_failure() to use a folio
  2024-02-29 21:20 [PATCH 0/8] Some cleanups for memory-failure Matthew Wilcox (Oracle)
                   ` (4 preceding siblings ...)
  2024-02-29 21:20 ` [PATCH 5/8] mm: Convert hugetlb_page_mapping_lock_write to folio Matthew Wilcox (Oracle)
@ 2024-02-29 21:20 ` Matthew Wilcox (Oracle)
  2024-03-08  8:48   ` Miaohe Lin
  2024-02-29 21:20 ` [PATCH 7/8] mm/memory-failure: Convert hwpoison_user_mappings to take " Matthew Wilcox (Oracle)
                   ` (2 subsequent siblings)
  8 siblings, 1 reply; 42+ messages in thread
From: Matthew Wilcox (Oracle) @ 2024-02-29 21:20 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Matthew Wilcox (Oracle), linux-mm, Miaohe Lin, Naoya Horiguchi

Saves dozens of calls to compound_head().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 mm/memory-failure.c | 40 +++++++++++++++++++++-------------------
 1 file changed, 21 insertions(+), 19 deletions(-)

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index fe4959e994d0..74e87a0a792c 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -2189,7 +2189,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
 int memory_failure(unsigned long pfn, int flags)
 {
 	struct page *p;
-	struct page *hpage;
+	struct folio *folio;
 	struct dev_pagemap *pgmap;
 	int res = 0;
 	unsigned long page_flags;
@@ -2277,8 +2277,8 @@ int memory_failure(unsigned long pfn, int flags)
 		}
 	}
 
-	hpage = compound_head(p);
-	if (PageTransHuge(hpage)) {
+	folio = page_folio(p);
+	if (folio_test_large(folio)) {
 		/*
 		 * The flag must be set after the refcount is bumped
 		 * otherwise it may race with THP split.
@@ -2292,12 +2292,13 @@ int memory_failure(unsigned long pfn, int flags)
 		 * or unhandlable page.  The refcount is bumped iff the
 		 * page is a valid handlable page.
 		 */
-		SetPageHasHWPoisoned(hpage);
+		folio_set_has_hwpoisoned(folio);
 		if (try_to_split_thp_page(p) < 0) {
 			res = action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
 			goto unlock_mutex;
 		}
 		VM_BUG_ON_PAGE(!page_count(p), p);
+		folio = page_folio(p);
 	}
 
 	/*
@@ -2308,9 +2309,9 @@ int memory_failure(unsigned long pfn, int flags)
 	 * The check (unnecessarily) ignores LRU pages being isolated and
 	 * walked by the page reclaim code, however that's not a big loss.
 	 */
-	shake_page(p);
+	shake_folio(folio);
 
-	lock_page(p);
+	folio_lock(folio);
 
 	/*
 	 * We're only intended to deal with the non-Compound page here.
@@ -2318,11 +2319,11 @@ int memory_failure(unsigned long pfn, int flags)
 	 * race window. If this happens, we could try again to hopefully
 	 * handle the page next round.
 	 */
-	if (PageCompound(p)) {
+	if (folio_test_large(folio)) {
 		if (retry) {
 			ClearPageHWPoison(p);
-			unlock_page(p);
-			put_page(p);
+			folio_unlock(folio);
+			folio_put(folio);
 			flags &= ~MF_COUNT_INCREASED;
 			retry = false;
 			goto try_again;
@@ -2338,29 +2339,29 @@ int memory_failure(unsigned long pfn, int flags)
 	 * folio_remove_rmap_*() in try_to_unmap_one(). So to determine page
 	 * status correctly, we save a copy of the page flags at this time.
 	 */
-	page_flags = p->flags;
+	page_flags = folio->flags;
 
 	if (hwpoison_filter(p)) {
 		ClearPageHWPoison(p);
-		unlock_page(p);
-		put_page(p);
+		folio_unlock(folio);
+		folio_put(folio);
 		res = -EOPNOTSUPP;
 		goto unlock_mutex;
 	}
 
 	/*
-	 * __munlock_folio() may clear a writeback page's LRU flag without
-	 * page_lock. We need wait writeback completion for this page or it
-	 * may trigger vfs BUG while evict inode.
+	 * __munlock_folio() may clear a writeback folio's LRU flag without
+	 * the folio lock. We need to wait for writeback completion for this
+	 * folio or it may trigger a vfs BUG while evicting inode.
 	 */
-	if (!PageLRU(p) && !PageWriteback(p))
+	if (!folio_test_lru(folio) && !folio_test_writeback(folio))
 		goto identify_page_state;
 
 	/*
 	 * It's very difficult to mess with pages currently under IO
 	 * and in many cases impossible, so we just avoid it here.
 	 */
-	wait_on_page_writeback(p);
+	folio_wait_writeback(folio);
 
 	/*
 	 * Now take care of user space mappings.
@@ -2374,7 +2375,8 @@ int memory_failure(unsigned long pfn, int flags)
 	/*
 	 * Torn down by someone else?
 	 */
-	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
+	if (folio_test_lru(folio) && !folio_test_swapcache(folio) &&
+	    folio->mapping == NULL) {
 		res = action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
 		goto unlock_page;
 	}
@@ -2384,7 +2386,7 @@ int memory_failure(unsigned long pfn, int flags)
 	mutex_unlock(&mf_mutex);
 	return res;
 unlock_page:
-	unlock_page(p);
+	folio_unlock(folio);
 unlock_mutex:
 	mutex_unlock(&mf_mutex);
 	return res;
-- 
2.43.0




* [PATCH 7/8] mm/memory-failure: Convert hwpoison_user_mappings to take a folio
  2024-02-29 21:20 [PATCH 0/8] Some cleanups for memory-failure Matthew Wilcox (Oracle)
                   ` (5 preceding siblings ...)
  2024-02-29 21:20 ` [PATCH 6/8] mm/memory-failure: Convert memory_failure() to use a folio Matthew Wilcox (Oracle)
@ 2024-02-29 21:20 ` Matthew Wilcox (Oracle)
  2024-03-11 11:44   ` Miaohe Lin
  2024-02-29 21:20 ` [PATCH 8/8] mm/memory-failure: Add some folio conversions to unpoison_memory Matthew Wilcox (Oracle)
  2024-03-01  6:28 ` [PATCH 0/8] Some cleanups for memory-failure Miaohe Lin
  8 siblings, 1 reply; 42+ messages in thread
From: Matthew Wilcox (Oracle) @ 2024-02-29 21:20 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Matthew Wilcox (Oracle), linux-mm, Miaohe Lin, Naoya Horiguchi

Pass the folio from the callers, and use it throughout instead of hpage.
Saves dozens of calls to compound_head().
---
 mm/memory-failure.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 74e87a0a792c..56bc83372e30 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1559,24 +1559,24 @@ static int get_hwpoison_page(struct page *p, unsigned long flags)
  * Do all that is necessary to remove user space mappings. Unmap
  * the pages and send SIGBUS to the processes if the data was dirty.
  */
-static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
-				  int flags, struct page *hpage)
+static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
+		unsigned long pfn, int flags)
 {
-	struct folio *folio = page_folio(hpage);
 	enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
 	struct address_space *mapping;
 	LIST_HEAD(tokill);
 	bool unmap_success;
 	int forcekill;
-	bool mlocked = PageMlocked(hpage);
+	bool mlocked = folio_test_mlocked(folio);
 
 	/*
 	 * Here we are interested only in user-mapped pages, so skip any
 	 * other types of pages.
 	 */
-	if (PageReserved(p) || PageSlab(p) || PageTable(p) || PageOffline(p))
+	if (folio_test_reserved(folio) || folio_test_slab(folio) ||
+	    folio_test_pgtable(folio) || folio_test_offline(folio))
 		return true;
-	if (!(PageLRU(hpage) || PageHuge(p)))
+	if (!(folio_test_lru(folio) || folio_test_hugetlb(folio)))
 		return true;
 
 	/*
@@ -1586,7 +1586,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	if (!page_mapped(p))
 		return true;
 
-	if (PageSwapCache(p)) {
+	if (folio_test_swapcache(folio)) {
 		pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
 		ttu &= ~TTU_HWPOISON;
 	}
@@ -1597,11 +1597,11 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	 * XXX: the dirty test could be racy: set_page_dirty() may not always
 	 * be called inside page lock (it's recommended but not enforced).
 	 */
-	mapping = page_mapping(hpage);
-	if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
+	mapping = folio_mapping(folio);
+	if (!(flags & MF_MUST_KILL) && !folio_test_dirty(folio) && mapping &&
 	    mapping_can_writeback(mapping)) {
-		if (page_mkclean(hpage)) {
-			SetPageDirty(hpage);
+		if (folio_mkclean(folio)) {
+			folio_set_dirty(folio);
 		} else {
 			ttu &= ~TTU_HWPOISON;
 			pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
@@ -1616,7 +1616,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	 */
 	collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED);
 
-	if (PageHuge(hpage) && !PageAnon(hpage)) {
+	if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
 		/*
 		 * For hugetlb pages in shared mappings, try_to_unmap
 		 * could potentially call huge_pmd_unshare.  Because of
@@ -1656,7 +1656,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	 * use a more force-full uncatchable kill to prevent
 	 * any accesses to the poisoned memory.
 	 */
-	forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL) ||
+	forcekill = folio_test_dirty(folio) || (flags & MF_MUST_KILL) ||
 		    !unmap_success;
 	kill_procs(&tokill, forcekill, !unmap_success, pfn, flags);
 
@@ -2100,7 +2100,7 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
 
 	page_flags = folio->flags;
 
-	if (!hwpoison_user_mappings(p, pfn, flags, &folio->page)) {
+	if (!hwpoison_user_mappings(folio, p, pfn, flags)) {
 		folio_unlock(folio);
 		return action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
 	}
@@ -2367,7 +2367,7 @@ int memory_failure(unsigned long pfn, int flags)
 	 * Now take care of user space mappings.
 	 * Abort on fail: __filemap_remove_folio() assumes unmapped page.
 	 */
-	if (!hwpoison_user_mappings(p, pfn, flags, p)) {
+	if (!hwpoison_user_mappings(folio, p, pfn, flags)) {
 		res = action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
 		goto unlock_page;
 	}
-- 
2.43.0




* [PATCH 8/8] mm/memory-failure: Add some folio conversions to unpoison_memory
  2024-02-29 21:20 [PATCH 0/8] Some cleanups for memory-failure Matthew Wilcox (Oracle)
                   ` (6 preceding siblings ...)
  2024-02-29 21:20 ` [PATCH 7/8] mm/memory-failure: Convert hwpoison_user_mappings to take " Matthew Wilcox (Oracle)
@ 2024-02-29 21:20 ` Matthew Wilcox (Oracle)
  2024-03-11 11:29   ` Miaohe Lin
  2024-03-01  6:28 ` [PATCH 0/8] Some cleanups for memory-failure Miaohe Lin
  8 siblings, 1 reply; 42+ messages in thread
From: Matthew Wilcox (Oracle) @ 2024-02-29 21:20 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Matthew Wilcox (Oracle), linux-mm, Miaohe Lin, Naoya Horiguchi

Some of these folio APIs didn't exist when the unpoison_memory()
conversion was done originally.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 mm/memory-failure.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 56bc83372e30..0806edb48b61 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -2556,8 +2556,8 @@ int unpoison_memory(unsigned long pfn)
 		goto unlock_mutex;
 	}
 
-	if (folio_test_slab(folio) || PageTable(&folio->page) ||
-	    folio_test_reserved(folio) || PageOffline(&folio->page))
+	if (folio_test_slab(folio) || folio_test_pgtable(folio) ||
+	    folio_test_reserved(folio) || folio_test_offline(folio))
 		goto unlock_mutex;
 
 	/*
@@ -2578,7 +2578,7 @@ int unpoison_memory(unsigned long pfn)
 
 	ghp = get_hwpoison_page(p, MF_UNPOISON);
 	if (!ghp) {
-		if (PageHuge(p)) {
+		if (folio_test_hugetlb(folio)) {
 			huge = true;
 			count = folio_free_raw_hwp(folio, false);
 			if (count == 0)
@@ -2594,7 +2594,7 @@ int unpoison_memory(unsigned long pfn)
 					 pfn, &unpoison_rs);
 		}
 	} else {
-		if (PageHuge(p)) {
+		if (folio_test_hugetlb(folio)) {
 			huge = true;
 			count = folio_free_raw_hwp(folio, false);
 			if (count == 0) {
-- 
2.43.0




* Re: [PATCH 0/8] Some cleanups for memory-failure
  2024-02-29 21:20 [PATCH 0/8] Some cleanups for memory-failure Matthew Wilcox (Oracle)
                   ` (7 preceding siblings ...)
  2024-02-29 21:20 ` [PATCH 8/8] mm/memory-failure: Add some folio conversions to unpoison_memory Matthew Wilcox (Oracle)
@ 2024-03-01  6:28 ` Miaohe Lin
  2024-03-01 12:40   ` Muhammad Usama Anjum
  8 siblings, 1 reply; 42+ messages in thread
From: Miaohe Lin @ 2024-03-01  6:28 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle); +Cc: linux-mm, Naoya Horiguchi, Andrew Morton

On 2024/3/1 5:20, Matthew Wilcox (Oracle) wrote:
> This isn't my complete collection of patches to memory-failure, but
> they're the ones I have the most confidence in.  I haven't done any real
> testing of this beyond compiling & running on a system without trying to
> do any memory failure injections.  Is there a good test-suite I should
> run before submitting these kinds of patches?

mm_regression[1] should be a good alternative test-suite to run. It's written by
Naoya Horiguchi and available at https://github.com/nhoriguchi/mm_regression.

> 
> Matthew Wilcox (Oracle) (8):
>   mm/memory-failure: Remove fsdax_pgoff argument from __add_to_kill
>   mm/memory-failure: Pass addr to __add_to_kill()
>   mm: Return the address from page_mapped_in_vma()
>   mm/memory-failure: Convert shake_page() to shake_folio()
>   mm: Convert hugetlb_page_mapping_lock_write to folio
>   mm/memory-failure: Convert memory_failure() to use a folio
>   mm/memory-failure: Convert hwpoison_user_mappings to take a folio
>   mm/memory-failure: Add some folio conversions to unpoison_memory

Thanks for your patches. I will review them as soon as possible.

> 
>  include/linux/hugetlb.h |   6 +-
>  include/linux/mm.h      |   1 -
>  include/linux/rmap.h    |   2 +-
>  mm/hugetlb.c            |   6 +-
>  mm/hwpoison-inject.c    |  11 +--
>  mm/internal.h           |   1 +
>  mm/memory-failure.c     | 144 +++++++++++++++++++++-------------------
>  mm/migrate.c            |   2 +-
>  mm/page_vma_mapped.c    |  14 ++--
>  9 files changed, 97 insertions(+), 90 deletions(-)
> 




* Re: [PATCH 0/8] Some cleanups for memory-failure
  2024-03-01  6:28 ` [PATCH 0/8] Some cleanups for memory-failure Miaohe Lin
@ 2024-03-01 12:40   ` Muhammad Usama Anjum
  2024-03-04  1:55     ` Miaohe Lin
  0 siblings, 1 reply; 42+ messages in thread
From: Muhammad Usama Anjum @ 2024-03-01 12:40 UTC (permalink / raw)
  To: Miaohe Lin, Matthew Wilcox (Oracle)
  Cc: Muhammad Usama Anjum, linux-mm, Naoya Horiguchi, Andrew Morton

On 3/1/24 11:28 AM, Miaohe Lin wrote:
> On 2024/3/1 5:20, Matthew Wilcox (Oracle) wrote:
>> This isn't my complete collection of patches to memory-failure, but
>> they're the ones I have the most confidence in.  I haven't done any real
>> testing of this beyond compiling & running on a system without trying to
>> do any memory failure injections.  Is there a good test-suite I should
>> run before submitting these kinds of patches?
> 
> mm_regression[1] should be a good alternative test-suite to run. It's written by
> Naoya Horiguchi and available at https://github.com/nhoriguchi/mm_regression.
Any idea why these tests haven't been added to selftests or ltp?

> 
>>
>> Matthew Wilcox (Oracle) (8):
>>   mm/memory-failure: Remove fsdax_pgoff argument from __add_to_kill
>>   mm/memory-failure: Pass addr to __add_to_kill()
>>   mm: Return the address from page_mapped_in_vma()
>>   mm/memory-failure: Convert shake_page() to shake_folio()
>>   mm: Convert hugetlb_page_mapping_lock_write to folio
>>   mm/memory-failure: Convert memory_failure() to use a folio
>>   mm/memory-failure: Convert hwpoison_user_mappings to take a folio
>>   mm/memory-failure: Add some folio conversions to unpoison_memory
> 
> Thanks for your patches. I will review them as soon as possible.
> 
>>
>>  include/linux/hugetlb.h |   6 +-
>>  include/linux/mm.h      |   1 -
>>  include/linux/rmap.h    |   2 +-
>>  mm/hugetlb.c            |   6 +-
>>  mm/hwpoison-inject.c    |  11 +--
>>  mm/internal.h           |   1 +
>>  mm/memory-failure.c     | 144 +++++++++++++++++++++-------------------
>>  mm/migrate.c            |   2 +-
>>  mm/page_vma_mapped.c    |  14 ++--
>>  9 files changed, 97 insertions(+), 90 deletions(-)
>>
> 
> 

-- 
BR,
Muhammad Usama Anjum



* Re: [PATCH 0/8] Some cleanups for memory-failure
  2024-03-01 12:40   ` Muhammad Usama Anjum
@ 2024-03-04  1:55     ` Miaohe Lin
  0 siblings, 0 replies; 42+ messages in thread
From: Miaohe Lin @ 2024-03-04  1:55 UTC (permalink / raw)
  To: Muhammad Usama Anjum
  Cc: linux-mm, Naoya Horiguchi, Andrew Morton, Matthew Wilcox (Oracle)

On 2024/3/1 20:40, Muhammad Usama Anjum wrote:
> On 3/1/24 11:28 AM, Miaohe Lin wrote:
>> On 2024/3/1 5:20, Matthew Wilcox (Oracle) wrote:
>>> This isn't my complete collection of patches to memory-failure, but
>>> they're the ones I have the most confidence in.  I haven't done any real
>>> testing of this beyond compiling & running on a system without trying to
>>> do any memory failure injections.  Is there a good test-suite I should
>>> run before submitting these kinds of patches?
>>
>> mm_regression[1] should be a good alternative test-suite to run. It's written by
>> Naoya Horiguchi and available at https://github.com/nhoriguchi/mm_regression.
> Any idea why these tests haven't been added to selftests or ltp?

I will try to add some of them to selftests once I have gone through these testcases.

Thanks.

> 
>>
>>>
>>> Matthew Wilcox (Oracle) (8):
>>>   mm/memory-failure: Remove fsdax_pgoff argument from __add_to_kill
>>>   mm/memory-failure: Pass addr to __add_to_kill()
>>>   mm: Return the address from page_mapped_in_vma()
>>>   mm/memory-failure: Convert shake_page() to shake_folio()
>>>   mm: Convert hugetlb_page_mapping_lock_write to folio
>>>   mm/memory-failure: Convert memory_failure() to use a folio
>>>   mm/memory-failure: Convert hwpoison_user_mappings to take a folio
>>>   mm/memory-failure: Add some folio conversions to unpoison_memory
>>
>> Thanks for your patches. I will review them as soon as possible.
>>
>>>
>>>  include/linux/hugetlb.h |   6 +-
>>>  include/linux/mm.h      |   1 -
>>>  include/linux/rmap.h    |   2 +-
>>>  mm/hugetlb.c            |   6 +-
>>>  mm/hwpoison-inject.c    |  11 +--
>>>  mm/internal.h           |   1 +
>>>  mm/memory-failure.c     | 144 +++++++++++++++++++++-------------------
>>>  mm/migrate.c            |   2 +-
>>>  mm/page_vma_mapped.c    |  14 ++--
>>>  9 files changed, 97 insertions(+), 90 deletions(-)
>>>
>>
>>
> 




* Re: [PATCH 1/8] mm/memory-failure: Remove fsdax_pgoff argument from __add_to_kill
  2024-02-29 21:20 ` [PATCH 1/8] mm/memory-failure: Remove fsdax_pgoff argument from __add_to_kill Matthew Wilcox (Oracle)
@ 2024-03-04 12:09   ` Miaohe Lin
  2024-03-13  2:07   ` Jane Chu
  2024-03-19  0:36   ` Dan Williams
  2 siblings, 0 replies; 42+ messages in thread
From: Miaohe Lin @ 2024-03-04 12:09 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle)
  Cc: linux-mm, Naoya Horiguchi, Dan Williams, Andrew Morton

On 2024/3/1 5:20, Matthew Wilcox (Oracle) wrote:
> Unify the KSM and DAX codepaths by calculating the addr in
> add_to_kill_fsdax() instead of telling __add_to_kill() to calculate it.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> Cc: Dan Williams <dan.j.williams@intel.com>

This patch looks good to me. Thanks.

Acked-by: Miaohe Lin <linmiaohe@huawei.com>

> ---
>  mm/memory-failure.c | 27 +++++++++------------------
>  1 file changed, 9 insertions(+), 18 deletions(-)
> 
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 9349948f1abf..9356227a50bb 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -416,21 +416,13 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
>   * not much we can do.	We just print a message and ignore otherwise.
>   */
>  
> -#define FSDAX_INVALID_PGOFF ULONG_MAX
> -
>  /*
>   * Schedule a process for later kill.
>   * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
> - *
> - * Note: @fsdax_pgoff is used only when @p is a fsdax page and a
> - * filesystem with a memory failure handler has claimed the
> - * memory_failure event. In all other cases, page->index and
> - * page->mapping are sufficient for mapping the page back to its
> - * corresponding user virtual address.
>   */
>  static void __add_to_kill(struct task_struct *tsk, struct page *p,
>  			  struct vm_area_struct *vma, struct list_head *to_kill,
> -			  unsigned long ksm_addr, pgoff_t fsdax_pgoff)
> +			  unsigned long addr)
>  {
>  	struct to_kill *tk;
>  
> @@ -440,12 +432,10 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p,
>  		return;
>  	}
>  
> -	tk->addr = ksm_addr ? ksm_addr : page_address_in_vma(p, vma);
> -	if (is_zone_device_page(p)) {
> -		if (fsdax_pgoff != FSDAX_INVALID_PGOFF)
> -			tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
> +	tk->addr = addr ? addr : page_address_in_vma(p, vma);
> +	if (is_zone_device_page(p))
>  		tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
> -	} else
> +	else
>  		tk->size_shift = page_shift(compound_head(p));
>  
>  	/*
> @@ -475,7 +465,7 @@ static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p,
>  				  struct vm_area_struct *vma,
>  				  struct list_head *to_kill)
>  {
> -	__add_to_kill(tsk, p, vma, to_kill, 0, FSDAX_INVALID_PGOFF);
> +	__add_to_kill(tsk, p, vma, to_kill, 0);
>  }
>  
>  #ifdef CONFIG_KSM
> @@ -493,10 +483,10 @@ static bool task_in_to_kill_list(struct list_head *to_kill,
>  }
>  void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
>  		     struct vm_area_struct *vma, struct list_head *to_kill,
> -		     unsigned long ksm_addr)
> +		     unsigned long addr)
>  {
>  	if (!task_in_to_kill_list(to_kill, tsk))
> -		__add_to_kill(tsk, p, vma, to_kill, ksm_addr, FSDAX_INVALID_PGOFF);
> +		__add_to_kill(tsk, p, vma, to_kill, addr);
>  }
>  #endif
>  /*
> @@ -670,7 +660,8 @@ static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p,
>  			      struct vm_area_struct *vma,
>  			      struct list_head *to_kill, pgoff_t pgoff)
>  {
> -	__add_to_kill(tsk, p, vma, to_kill, 0, pgoff);
> +	unsigned long addr = vma_pgoff_address(pgoff, 1, vma);
> +	__add_to_kill(tsk, p, vma, to_kill, addr);
>  }
>  
>  /*
> 




* Re: [PATCH 2/8] mm/memory-failure: Pass addr to __add_to_kill()
  2024-02-29 21:20 ` [PATCH 2/8] mm/memory-failure: Pass addr to __add_to_kill() Matthew Wilcox (Oracle)
@ 2024-03-04 12:10   ` Miaohe Lin
  0 siblings, 0 replies; 42+ messages in thread
From: Miaohe Lin @ 2024-03-04 12:10 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle)
  Cc: linux-mm, Naoya Horiguchi, Longlong Xia, Andrew Morton

On 2024/3/1 5:20, Matthew Wilcox (Oracle) wrote:
> Handle anon/file folios the same way as KSM & DAX folios by passing in
> the address.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> Cc: Longlong Xia <xialonglong1@huawei.com>

LGTM. Thanks.

Acked-by: Miaohe Lin <linmiaohe@huawei.com>

> ---
>  mm/memory-failure.c | 6 ++++--
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 9356227a50bb..7f8473c08ae3 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -432,7 +432,7 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p,
>  		return;
>  	}
>  
> -	tk->addr = addr ? addr : page_address_in_vma(p, vma);
> +	tk->addr = addr;
>  	if (is_zone_device_page(p))
>  		tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
>  	else
> @@ -465,7 +465,8 @@ static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p,
>  				  struct vm_area_struct *vma,
>  				  struct list_head *to_kill)
>  {
> -	__add_to_kill(tsk, p, vma, to_kill, 0);
> +	unsigned long addr = page_address_in_vma(p, vma);
> +	__add_to_kill(tsk, p, vma, to_kill, addr);
>  }
>  
>  #ifdef CONFIG_KSM
> @@ -481,6 +482,7 @@ static bool task_in_to_kill_list(struct list_head *to_kill,
>  
>  	return false;
>  }
> +
>  void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
>  		     struct vm_area_struct *vma, struct list_head *to_kill,
>  		     unsigned long addr)
> 




* Re: [PATCH 3/8] mm: Return the address from page_mapped_in_vma()
  2024-02-29 21:20 ` [PATCH 3/8] mm: Return the address from page_mapped_in_vma() Matthew Wilcox (Oracle)
@ 2024-03-04 12:31   ` Miaohe Lin
  2024-03-05 20:09     ` Matthew Wilcox
  2024-03-06  8:17   ` Miaohe Lin
  1 sibling, 1 reply; 42+ messages in thread
From: Miaohe Lin @ 2024-03-04 12:31 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle); +Cc: linux-mm, Naoya Horiguchi, Andrew Morton

On 2024/3/1 5:20, Matthew Wilcox (Oracle) wrote:
> The only user of this function calls page_address_in_vma() immediately
> after page_mapped_in_vma() calculates it and uses it to return true/false.
> Return the address instead, allowing memory-failure to skip the call
> to page_address_in_vma().
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>  include/linux/rmap.h |  2 +-
>  mm/memory-failure.c  | 22 ++++++++++++++--------
>  mm/page_vma_mapped.c | 14 +++++++-------
>  3 files changed, 22 insertions(+), 16 deletions(-)
> 
> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> index b7944a833668..ba027a4d9abf 100644
> --- a/include/linux/rmap.h
> +++ b/include/linux/rmap.h
> @@ -698,7 +698,7 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
>  
>  void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked);
>  
> -int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
> +unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
>  
>  /*
>   * rmap_walk_control: To control rmap traversing for specific needs
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 7f8473c08ae3..40a8964954e5 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -462,10 +462,11 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p,
>  }
>  
>  static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p,
> -				  struct vm_area_struct *vma,
> -				  struct list_head *to_kill)
> +		struct vm_area_struct *vma, struct list_head *to_kill,
> +		unsigned long addr)
>  {
> -	unsigned long addr = page_address_in_vma(p, vma);
> +	if (addr == -EFAULT)
> +		return;

IIUC, this patch will change the semantics slightly. There is one corner case where page_mapped_in_vma()
returns true but page_address_in_vma() returns -EFAULT if an mremap is done between the two calls. In that case,
SIGKILL will be sent to the user. But with this patch applied, SIGBUS will be sent to the user with the address
from before the mremap. Or am I missing something?

Thanks.

>  	__add_to_kill(tsk, p, vma, to_kill, addr);
>  }
>  
> @@ -609,12 +610,13 @@ static void collect_procs_anon(struct folio *folio, struct page *page,
>  			continue;
>  		anon_vma_interval_tree_foreach(vmac, &av->rb_root,
>  					       pgoff, pgoff) {
> +			unsigned long addr;
> +
>  			vma = vmac->vma;
>  			if (vma->vm_mm != t->mm)
>  				continue;
> -			if (!page_mapped_in_vma(page, vma))
> -				continue;
> -			add_to_kill_anon_file(t, page, vma, to_kill);
> +			addr = page_mapped_in_vma(page, vma);
> +			add_to_kill_anon_file(t, page, vma, to_kill, addr);
>  		}
>  	}
>  	rcu_read_unlock();
> @@ -642,6 +644,8 @@ static void collect_procs_file(struct folio *folio, struct page *page,
>  			continue;
>  		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
>  				      pgoff) {
> +			unsigned long addr;
> +
>  			/*
>  			 * Send early kill signal to tasks where a vma covers
>  			 * the page but the corrupted page is not necessarily
> @@ -649,8 +653,10 @@ static void collect_procs_file(struct folio *folio, struct page *page,
>  			 * Assume applications who requested early kill want
>  			 * to be informed of all such data corruptions.
>  			 */
> -			if (vma->vm_mm == t->mm)
> -				add_to_kill_anon_file(t, page, vma, to_kill);
> +			if (vma->vm_mm != t->mm)
> +				continue;
> +			addr = page_address_in_vma(page, vma);
> +			add_to_kill_anon_file(t, page, vma, to_kill, addr);
>  		}
>  	}
>  	rcu_read_unlock();
> diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
> index 74d2de15fb5e..e9e208b4ac4b 100644
> --- a/mm/page_vma_mapped.c
> +++ b/mm/page_vma_mapped.c
> @@ -319,11 +319,11 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>   * @page: the page to test
>   * @vma: the VMA to test
>   *
> - * Returns 1 if the page is mapped into the page tables of the VMA, 0
> - * if the page is not mapped into the page tables of this VMA.  Only
> - * valid for normal file or anonymous VMAs.
> + * Return: If the page is mapped into the page tables of the VMA, the
> + * address that the page is mapped at.  -EFAULT if the page is not mapped.
> + * Only valid for normal file or anonymous VMAs.
>   */
> -int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
> +unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
>  {
>  	struct page_vma_mapped_walk pvmw = {
>  		.pfn = page_to_pfn(page),
> @@ -334,9 +334,9 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
>  
>  	pvmw.address = vma_address(page, vma);
>  	if (pvmw.address == -EFAULT)
> -		return 0;
> +		return -EFAULT;
>  	if (!page_vma_mapped_walk(&pvmw))
> -		return 0;
> +		return -EFAULT;
>  	page_vma_mapped_walk_done(&pvmw);
> -	return 1;
> +	return pvmw.address;
>  }
> 




* Re: [PATCH 3/8] mm: Return the address from page_mapped_in_vma()
  2024-03-04 12:31   ` Miaohe Lin
@ 2024-03-05 20:09     ` Matthew Wilcox
  2024-03-06  8:10       ` Miaohe Lin
  0 siblings, 1 reply; 42+ messages in thread
From: Matthew Wilcox @ 2024-03-05 20:09 UTC (permalink / raw)
  To: Miaohe Lin; +Cc: linux-mm, Naoya Horiguchi, Andrew Morton

On Mon, Mar 04, 2024 at 08:31:56PM +0800, Miaohe Lin wrote:
> On 2024/3/1 5:20, Matthew Wilcox (Oracle) wrote:
> > The only user of this function calls page_address_in_vma() immediately
> > after page_mapped_in_vma() calculates it and uses it to return true/false.
> > Return the address instead, allowing memory-failure to skip the call
> > to page_address_in_vma().
> 
> IIUC, this patch will change the semantics slightly. There is one corner
> case where page_mapped_in_vma() returns true but page_address_in_vma()
> returns -EFAULT if an mremap is done between the two calls. In that case,
> SIGKILL will be sent to the user. But with this patch applied, SIGBUS
> will be sent to the user with the address from before the mremap. Or am I
> missing something?

Isn't that an example of a race that userspace can't possibly rely on?
It can't observe where the kernel has got to in its processing of the
fault, so it's OK if we behave as if the mremap() has happened before,
during or after the two calls.



* Re: [PATCH 3/8] mm: Return the address from page_mapped_in_vma()
  2024-03-05 20:09     ` Matthew Wilcox
@ 2024-03-06  8:10       ` Miaohe Lin
  0 siblings, 0 replies; 42+ messages in thread
From: Miaohe Lin @ 2024-03-06  8:10 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: linux-mm, Naoya Horiguchi, Andrew Morton

On 2024/3/6 4:09, Matthew Wilcox wrote:
> On Mon, Mar 04, 2024 at 08:31:56PM +0800, Miaohe Lin wrote:
>> On 2024/3/1 5:20, Matthew Wilcox (Oracle) wrote:
>>> The only user of this function calls page_address_in_vma() immediately
>>> after page_mapped_in_vma() calculates it and uses it to return true/false.
>>> Return the address instead, allowing memory-failure to skip the call
>>> to page_address_in_vma().
>>
>> IIUC, this patch will change the semantics slightly. There is one corner
>> case where page_mapped_in_vma() returns true but page_address_in_vma()
>> returns -EFAULT if an mremap is done between the two calls. In that case,
>> SIGKILL will be sent to the user. But with this patch applied, SIGBUS
>> will be sent to the user with the address from before the mremap. Or am I
>> missing something?
> 
> Isn't that an example of a race that userspace can't possibly rely on?

You're right. Userspace shouldn't rely on it.
Thanks.

> It can't observe where the kernel has got to in its processing of the
> fault, so it's OK if we behave as if the mremap() has happened before,
> during or after the two calls.
> .
> 
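
Related background on when these signals get delivered at all: by default a
process is only signalled on its next access to the poisoned page ("late
kill"); roughly speaking, the collect_procs_*() paths only queue a task at
memory_failure() time if it has opted into early kill via prctl(PR_MCE_KILL)
or the vm.memory_failure_early_kill sysctl, or if the kill is
action-required.  A small sketch of opting a process into early kill:

#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
	/*
	 * Ask for "early kill": deliver an async SIGBUS (BUS_MCEERR_AO) as
	 * soon as memory_failure() poisons a page this process has mapped,
	 * instead of waiting for the next access.
	 */
	if (prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0)) {
		perror("prctl(PR_MCE_KILL)");
		return 1;
	}

	/* Read the policy back: PR_MCE_KILL_EARLY / _LATE / _DEFAULT. */
	printf("early-kill policy is now %d\n",
	       prctl(PR_MCE_KILL_GET, 0, 0, 0, 0));
	return 0;
}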




* Re: [PATCH 3/8] mm: Return the address from page_mapped_in_vma()
  2024-02-29 21:20 ` [PATCH 3/8] mm: Return the address from page_mapped_in_vma() Matthew Wilcox (Oracle)
  2024-03-04 12:31   ` Miaohe Lin
@ 2024-03-06  8:17   ` Miaohe Lin
  1 sibling, 0 replies; 42+ messages in thread
From: Miaohe Lin @ 2024-03-06  8:17 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle); +Cc: linux-mm, Naoya Horiguchi, Andrew Morton

On 2024/3/1 5:20, Matthew Wilcox (Oracle) wrote:
> The only user of this function calls page_address_in_vma() immediately
> after page_mapped_in_vma() calculates it and uses it to return true/false.
> Return the address instead, allowing memory-failure to skip the call
> to page_address_in_vma().
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>  include/linux/rmap.h |  2 +-
>  mm/memory-failure.c  | 22 ++++++++++++++--------
>  mm/page_vma_mapped.c | 14 +++++++-------
>  3 files changed, 22 insertions(+), 16 deletions(-)
> 
> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> index b7944a833668..ba027a4d9abf 100644
> --- a/include/linux/rmap.h
> +++ b/include/linux/rmap.h
> @@ -698,7 +698,7 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
>  
>  void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked);
>  
> -int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
> +unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
>  
>  /*
>   * rmap_walk_control: To control rmap traversing for specific needs
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 7f8473c08ae3..40a8964954e5 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -462,10 +462,11 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p,
>  }
>  
>  static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p,
> -				  struct vm_area_struct *vma,
> -				  struct list_head *to_kill)
> +		struct vm_area_struct *vma, struct list_head *to_kill,
> +		unsigned long addr)
>  {
> -	unsigned long addr = page_address_in_vma(p, vma);
> +	if (addr == -EFAULT)
> +		return;
>  	__add_to_kill(tsk, p, vma, to_kill, addr);
>  }
>  
> @@ -609,12 +610,13 @@ static void collect_procs_anon(struct folio *folio, struct page *page,
>  			continue;
>  		anon_vma_interval_tree_foreach(vmac, &av->rb_root,
>  					       pgoff, pgoff) {
> +			unsigned long addr;
> +
>  			vma = vmac->vma;
>  			if (vma->vm_mm != t->mm)
>  				continue;
> -			if (!page_mapped_in_vma(page, vma))
> -				continue;
> -			add_to_kill_anon_file(t, page, vma, to_kill);
> +			addr = page_mapped_in_vma(page, vma);
> +			add_to_kill_anon_file(t, page, vma, to_kill, addr);
>  		}
>  	}
>  	rcu_read_unlock();
> @@ -642,6 +644,8 @@ static void collect_procs_file(struct folio *folio, struct page *page,
>  			continue;
>  		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
>  				      pgoff) {
> +			unsigned long addr;
> +
>  			/*
>  			 * Send early kill signal to tasks where a vma covers
>  			 * the page but the corrupted page is not necessarily
> @@ -649,8 +653,10 @@ static void collect_procs_file(struct folio *folio, struct page *page,
>  			 * Assume applications who requested early kill want
>  			 * to be informed of all such data corruptions.
>  			 */
> -			if (vma->vm_mm == t->mm)
> -				add_to_kill_anon_file(t, page, vma, to_kill);
> +			if (vma->vm_mm != t->mm)
> +				continue;
> +			addr = page_address_in_vma(page, vma);
> +			add_to_kill_anon_file(t, page, vma, to_kill, addr);
>  		}
>  	}
>  	rcu_read_unlock();
> diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
> index 74d2de15fb5e..e9e208b4ac4b 100644
> --- a/mm/page_vma_mapped.c
> +++ b/mm/page_vma_mapped.c
> @@ -319,11 +319,11 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>   * @page: the page to test
>   * @vma: the VMA to test
>   *
> - * Returns 1 if the page is mapped into the page tables of the VMA, 0
> - * if the page is not mapped into the page tables of this VMA.  Only
> - * valid for normal file or anonymous VMAs.
> + * Return: If the page is mapped into the page tables of the VMA, the
> + * address that the page is mapped at.  -EFAULT if the page is not mapped.
> + * Only valid for normal file or anonymous VMAs.
>   */
> -int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
> +unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
>  {
>  	struct page_vma_mapped_walk pvmw = {
>  		.pfn = page_to_pfn(page),
> @@ -334,9 +334,9 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
>  
>  	pvmw.address = vma_address(page, vma);
>  	if (pvmw.address == -EFAULT)
> -		return 0;
> +		return -EFAULT;
>  	if (!page_vma_mapped_walk(&pvmw))
> -		return 0;
> +		return -EFAULT;
>  	page_vma_mapped_walk_done(&pvmw);
> -	return 1;
> +	return pvmw.address;

page_mapped_in_vma() is only called by collect_procs_anon(). Would it be better to put it
under CONFIG_MEMORY_FAILURE?
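
Just to illustrate what I mean (a sketch only, not something this series
has to do), the declaration could be guarded like:

	#ifdef CONFIG_MEMORY_FAILURE
	unsigned long page_mapped_in_vma(struct page *page,
					 struct vm_area_struct *vma);
	#endif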

Anyway, this patch looks good to me. Thanks.

Acked-by: Miaohe Lin <linmiaohe@huawei.com>




^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 4/8] mm/memory-failure: Convert shake_page() to shake_folio()
  2024-02-29 21:20 ` [PATCH 4/8] mm/memory-failure: Convert shake_page() to shake_folio() Matthew Wilcox (Oracle)
@ 2024-03-06  9:31   ` Miaohe Lin
  2024-04-08 15:36     ` Matthew Wilcox
  0 siblings, 1 reply; 42+ messages in thread
From: Miaohe Lin @ 2024-03-06  9:31 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle); +Cc: linux-mm, Naoya Horiguchi, Andrew Morton

On 2024/3/1 5:20, Matthew Wilcox (Oracle) wrote:
> Removes two calls to compound_head().  Move the prototype to
> internal.h; we definitely don't want code outside mm using it.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>  include/linux/mm.h   |  1 -
>  mm/hwpoison-inject.c | 11 ++++++-----
>  mm/internal.h        |  1 +
>  mm/memory-failure.c  | 15 ++++++++++-----
>  4 files changed, 17 insertions(+), 11 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 03e4841673d7..184f57cdab60 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3962,7 +3962,6 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
>  extern int memory_failure(unsigned long pfn, int flags);
>  extern void memory_failure_queue_kick(int cpu);
>  extern int unpoison_memory(unsigned long pfn);
> -extern void shake_page(struct page *p);
>  extern atomic_long_t num_poisoned_pages __read_mostly;
>  extern int soft_offline_page(unsigned long pfn, int flags);
>  #ifdef CONFIG_MEMORY_FAILURE
> diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
> index d0548e382b6b..c9d653f51e45 100644
> --- a/mm/hwpoison-inject.c
> +++ b/mm/hwpoison-inject.c
> @@ -15,7 +15,7 @@ static int hwpoison_inject(void *data, u64 val)
>  {
>  	unsigned long pfn = val;
>  	struct page *p;
> -	struct page *hpage;
> +	struct folio *folio;
>  	int err;
>  
>  	if (!capable(CAP_SYS_ADMIN))
> @@ -25,16 +25,17 @@ static int hwpoison_inject(void *data, u64 val)
>  		return -ENXIO;
>  
>  	p = pfn_to_page(pfn);
> -	hpage = compound_head(p);
> +	folio = page_folio(p);
>  
>  	if (!hwpoison_filter_enable)
>  		goto inject;
>  
> -	shake_page(hpage);
> +	shake_folio(folio);
>  	/*
>  	 * This implies unable to support non-LRU pages except free page.
>  	 */
> -	if (!PageLRU(hpage) && !PageHuge(p) && !is_free_buddy_page(p))
> +	if (!folio_test_lru(folio) && !folio_test_hugetlb(folio) &&
> +	    !is_free_buddy_page(p))
>  		return 0;
>  
>  	/*
> @@ -42,7 +43,7 @@ static int hwpoison_inject(void *data, u64 val)
>  	 * the targeted owner (or on a free page).
>  	 * memory_failure() will redo the check reliably inside page lock.
>  	 */
> -	err = hwpoison_filter(hpage);
> +	err = hwpoison_filter(&folio->page);
>  	if (err)
>  		return 0;
>  
> diff --git a/mm/internal.h b/mm/internal.h
> index 13f4d6ccccdf..b900730f43c0 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -855,6 +855,7 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask)
>  /*
>   * mm/memory-failure.c
>   */
> +void shake_folio(struct folio *folio);
>  extern int hwpoison_filter(struct page *p);
>  
>  extern u32 hwpoison_filter_dev_major;
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 40a8964954e5..27dc21063552 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -358,20 +358,25 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
>   * Unknown page type encountered. Try to check whether it can turn PageLRU by
>   * lru_add_drain_all.
>   */
> -void shake_page(struct page *p)
> +void shake_folio(struct folio *folio)

It might not be a good idea to convert shake_page() into shake_folio(). shake_page() can
be called without holding a page refcount, so the race below could happen:

 hwpoison_inject
  folio = page_folio(p); --> assume p is 4k page [1]
  shake_folio(folio)
   folio_test_slab(folio)
    test_bit(PG_slab, folio_flags(folio, FOLIO_PF_NO_TAIL) [2]
     VM_BUG_ON_PGFLAGS(PageTail(page), page) in folio_flags

Between [1] and [2], the page can become a tail page of a THP, so the VM_BUG_ON_PGFLAGS will trigger.
Or am I missing something?

Thanks.
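
FWIW, a minimal sketch of how an unreferenced page could be stabilised
before the folio flag tests (purely illustrative, not something this
series does):

	/* Illustrative only: pin the folio, then re-check that the page
	 * still belongs to it so tail-page flag tests stay valid. */
	static struct folio *stable_folio_get(struct page *p)
	{
		struct folio *folio = page_folio(p);

		if (!folio_try_get(folio))
			return NULL;		/* folio was being freed */
		if (folio != page_folio(p)) {	/* split or reused under us */
			folio_put(folio);
			return NULL;
		}
		return folio;			/* caller drops the reference */
	}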

>  {
> -	if (PageHuge(p))
> +	if (folio_test_hugetlb(folio))
>  		return;
>  	/*
>  	 * TODO: Could shrink slab caches here if a lightweight range-based
>  	 * shrinker will be available.
>  	 */
> -	if (PageSlab(p))
> +	if (folio_test_slab(folio))
>  		return;
>  
>  	lru_add_drain_all();
>  }
> -EXPORT_SYMBOL_GPL(shake_page);
> +EXPORT_SYMBOL_GPL(shake_folio);
> +
> +static void shake_page(struct page *page)
> +{
> +	shake_folio(page_folio(page));
> +}
>  
>  static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
>  		unsigned long address)
> @@ -1639,7 +1644,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
>  	 * shake_page() again to ensure that it's flushed.
>  	 */
>  	if (mlocked)
> -		shake_page(hpage);
> +		shake_folio(folio);
>  
>  	/*
>  	 * Now that the dirty bit has been propagated to the
> 



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 5/8] mm: Convert hugetlb_page_mapping_lock_write to folio
  2024-02-29 21:20 ` [PATCH 5/8] mm: Convert hugetlb_page_mapping_lock_write to folio Matthew Wilcox (Oracle)
@ 2024-03-08  8:33   ` Miaohe Lin
  0 siblings, 0 replies; 42+ messages in thread
From: Miaohe Lin @ 2024-03-08  8:33 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle); +Cc: linux-mm, Naoya Horiguchi, Andrew Morton

On 2024/3/1 5:20, Matthew Wilcox (Oracle) wrote:
> The page is only used to get the mapping, so the folio will do just
> as well.  Both callers already have a folio available, so this saves
> a call to compound_head().
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>

Looks good to me. Thanks.

Acked-by: Miaohe Lin <linmiaohe@huawei.com>

> ---
>  include/linux/hugetlb.h | 6 +++---
>  mm/hugetlb.c            | 6 +++---
>  mm/memory-failure.c     | 2 +-
>  mm/migrate.c            | 2 +-
>  4 files changed, 8 insertions(+), 8 deletions(-)
> 
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 77b30a8c6076..acb1096ecdaa 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -175,7 +175,7 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx);
>  pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
>  		      unsigned long addr, pud_t *pud);
>  
> -struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage);
> +struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio);
>  
>  extern int sysctl_hugetlb_shm_group;
>  extern struct list_head huge_boot_pages[MAX_NUMNODES];
> @@ -298,8 +298,8 @@ static inline unsigned long hugetlb_total_pages(void)
>  	return 0;
>  }
>  
> -static inline struct address_space *hugetlb_page_mapping_lock_write(
> -							struct page *hpage)
> +static inline struct address_space *hugetlb_folio_mapping_lock_write(
> +							struct folio *folio)
>  {
>  	return NULL;
>  }
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index bb17e5c22759..0e464a8f1aa9 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -2178,13 +2178,13 @@ EXPORT_SYMBOL_GPL(PageHuge);
>  /*
>   * Find and lock address space (mapping) in write mode.
>   *
> - * Upon entry, the page is locked which means that page_mapping() is
> + * Upon entry, the folio is locked which means that folio_mapping() is
>   * stable.  Due to locking order, we can only trylock_write.  If we can
>   * not get the lock, simply return NULL to caller.
>   */
> -struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
> +struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio)
>  {
> -	struct address_space *mapping = page_mapping(hpage);
> +	struct address_space *mapping = folio_mapping(folio);
>  
>  	if (!mapping)
>  		return mapping;
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 27dc21063552..fe4959e994d0 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -1624,7 +1624,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
>  		 * TTU_RMAP_LOCKED to indicate we have taken the lock
>  		 * at this higher level.
>  		 */
> -		mapping = hugetlb_page_mapping_lock_write(hpage);
> +		mapping = hugetlb_folio_mapping_lock_write(folio);
>  		if (mapping) {
>  			try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
>  			i_mmap_unlock_write(mapping);
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 73a052a382f1..0aef867d600b 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -1425,7 +1425,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio,
>  			 * semaphore in write mode here and set TTU_RMAP_LOCKED
>  			 * to let lower levels know we have taken the lock.
>  			 */
> -			mapping = hugetlb_page_mapping_lock_write(&src->page);
> +			mapping = hugetlb_folio_mapping_lock_write(src);
>  			if (unlikely(!mapping))
>  				goto unlock_put_anon;
>  
> 



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 6/8] mm/memory-failure: Convert memory_failure() to use a folio
  2024-02-29 21:20 ` [PATCH 6/8] mm/memory-failure: Convert memory_failure() to use a folio Matthew Wilcox (Oracle)
@ 2024-03-08  8:48   ` Miaohe Lin
  2024-03-11 12:31     ` Matthew Wilcox
  0 siblings, 1 reply; 42+ messages in thread
From: Miaohe Lin @ 2024-03-08  8:48 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle); +Cc: linux-mm, Naoya Horiguchi, Andrew Morton

On 2024/3/1 5:20, Matthew Wilcox (Oracle) wrote:
> Saves dozens of calls to compound_head().
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>  mm/memory-failure.c | 40 +++++++++++++++++++++-------------------
>  1 file changed, 21 insertions(+), 19 deletions(-)
> 
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index fe4959e994d0..74e87a0a792c 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -2189,7 +2189,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
>  int memory_failure(unsigned long pfn, int flags)
>  {
>  	struct page *p;
> -	struct page *hpage;
> +	struct folio *folio;
>  	struct dev_pagemap *pgmap;
>  	int res = 0;
>  	unsigned long page_flags;
> @@ -2277,8 +2277,8 @@ int memory_failure(unsigned long pfn, int flags)
>  		}
>  	}
>  
> -	hpage = compound_head(p);
> -	if (PageTransHuge(hpage)) {
> +	folio = page_folio(p);
> +	if (folio_test_large(folio)) {
>  		/*
>  		 * The flag must be set after the refcount is bumped
>  		 * otherwise it may race with THP split.
> @@ -2292,12 +2292,13 @@ int memory_failure(unsigned long pfn, int flags)
>  		 * or unhandlable page.  The refcount is bumped iff the
>  		 * page is a valid handlable page.
>  		 */
> -		SetPageHasHWPoisoned(hpage);
> +		folio_set_has_hwpoisoned(folio);
>  		if (try_to_split_thp_page(p) < 0) {
>  			res = action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
>  			goto unlock_mutex;
>  		}
>  		VM_BUG_ON_PAGE(!page_count(p), p);
> +		folio = page_folio(p);
>  	}
>  
>  	/*
> @@ -2308,9 +2309,9 @@ int memory_failure(unsigned long pfn, int flags)
>  	 * The check (unnecessarily) ignores LRU pages being isolated and
>  	 * walked by the page reclaim code, however that's not a big loss.
>  	 */
> -	shake_page(p);
> +	shake_folio(folio);
>  
> -	lock_page(p);
> +	folio_lock(folio);
>  
>  	/*
>  	 * We're only intended to deal with the non-Compound page here.
> @@ -2318,11 +2319,11 @@ int memory_failure(unsigned long pfn, int flags)
>  	 * race window. If this happens, we could try again to hopefully
>  	 * handle the page next round.
>  	 */
> -	if (PageCompound(p)) {
> +	if (folio_test_large(folio)) {

folio_test_large() only checks whether PG_head is set but PageCompound() also checks PageTail().
So folio_test_large() and PageCompound() are not equivalent?

Thanks.
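
For context, the two checks boil down to roughly the following
(paraphrased for illustration, not the literal page-flags.h definitions):

	/* Paraphrased sketch, not the kernel's actual code. */
	static inline bool my_page_compound(const struct page *page)
	{
		return PageHead(page) || PageTail(page);
	}

	static inline bool my_folio_test_large(const struct folio *folio)
	{
		/* a folio pointer never refers to a tail page, so testing
		 * PG_head alone is enough */
		return folio_test_head(folio);
	}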



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 8/8] mm/memory-failure: Add some folio conversions to unpoison_memory
  2024-02-29 21:20 ` [PATCH 8/8] mm/memory-failure: Add some folio conversions to unpoison_memory Matthew Wilcox (Oracle)
@ 2024-03-11 11:29   ` Miaohe Lin
  0 siblings, 0 replies; 42+ messages in thread
From: Miaohe Lin @ 2024-03-11 11:29 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle); +Cc: linux-mm, Naoya Horiguchi, Andrew Morton

On 2024/3/1 5:20, Matthew Wilcox (Oracle) wrote:
> Some of these folio APIs didn't exist when the unpoison_memory()
> conversion was done originally.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>

LGTM. Thanks.

Acked-by: Miaohe Lin <linmiaohe@huawei.com>

> ---
>  mm/memory-failure.c | 8 ++++----
>  1 file changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 56bc83372e30..0806edb48b61 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -2556,8 +2556,8 @@ int unpoison_memory(unsigned long pfn)
>  		goto unlock_mutex;
>  	}
>  
> -	if (folio_test_slab(folio) || PageTable(&folio->page) ||
> -	    folio_test_reserved(folio) || PageOffline(&folio->page))
> +	if (folio_test_slab(folio) || folio_test_pgtable(folio) ||
> +	    folio_test_reserved(folio) || folio_test_offline(folio))
>  		goto unlock_mutex;
>  
>  	/*
> @@ -2578,7 +2578,7 @@ int unpoison_memory(unsigned long pfn)
>  
>  	ghp = get_hwpoison_page(p, MF_UNPOISON);
>  	if (!ghp) {
> -		if (PageHuge(p)) {
> +		if (folio_test_hugetlb(folio)) {
>  			huge = true;
>  			count = folio_free_raw_hwp(folio, false);
>  			if (count == 0)
> @@ -2594,7 +2594,7 @@ int unpoison_memory(unsigned long pfn)
>  					 pfn, &unpoison_rs);
>  		}
>  	} else {
> -		if (PageHuge(p)) {
> +		if (folio_test_hugetlb(folio)) {
>  			huge = true;
>  			count = folio_free_raw_hwp(folio, false);
>  			if (count == 0) {
> 



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 7/8] mm/memory-failure: Convert hwpoison_user_mappings to take a folio
  2024-02-29 21:20 ` [PATCH 7/8] mm/memory-failure: Convert hwpoison_user_mappings to take " Matthew Wilcox (Oracle)
@ 2024-03-11 11:44   ` Miaohe Lin
  0 siblings, 0 replies; 42+ messages in thread
From: Miaohe Lin @ 2024-03-11 11:44 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle); +Cc: linux-mm, Naoya Horiguchi, Andrew Morton

On 2024/3/1 5:20, Matthew Wilcox (Oracle) wrote:
> Pass the folio from the callers, and use it throughout instead of hpage.
> Saves dozens of calls to compound_head().
> ---
>  mm/memory-failure.c | 30 +++++++++++++++---------------
>  1 file changed, 15 insertions(+), 15 deletions(-)
> 
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 74e87a0a792c..56bc83372e30 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -1559,24 +1559,24 @@ static int get_hwpoison_page(struct page *p, unsigned long flags)
>   * Do all that is necessary to remove user space mappings. Unmap
>   * the pages and send SIGBUS to the processes if the data was dirty.
>   */
> -static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
> -				  int flags, struct page *hpage)
> +static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
> +		unsigned long pfn, int flags)

hwpoison_user_mappings() is called with folio refcnt held, so I think it should be safe to use folio directly.

>  {
> -	struct folio *folio = page_folio(hpage);
>  	enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
>  	struct address_space *mapping;
>  	LIST_HEAD(tokill);
>  	bool unmap_success;
>  	int forcekill;
> -	bool mlocked = PageMlocked(hpage);
> +	bool mlocked = folio_test_mlocked(folio);
>  
>  	/*
>  	 * Here we are interested only in user-mapped pages, so skip any
>  	 * other types of pages.
>  	 */
> -	if (PageReserved(p) || PageSlab(p) || PageTable(p) || PageOffline(p))
> +	if (folio_test_reserved(folio) || folio_test_slab(folio) ||
> +	    folio_test_pgtable(folio) || folio_test_offline(folio))
>  		return true;
> -	if (!(PageLRU(hpage) || PageHuge(p)))
> +	if (!(folio_test_lru(folio) || folio_test_hugetlb(folio)))
>  		return true;
>  
>  	/*
> @@ -1586,7 +1586,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
>  	if (!page_mapped(p))
>  		return true;
>  
> -	if (PageSwapCache(p)) {
> +	if (folio_test_swapcache(folio)) {
>  		pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
>  		ttu &= ~TTU_HWPOISON;
>  	}
> @@ -1597,11 +1597,11 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
>  	 * XXX: the dirty test could be racy: set_page_dirty() may not always
>  	 * be called inside page lock (it's recommended but not enforced).
>  	 */
> -	mapping = page_mapping(hpage);
> -	if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
> +	mapping = folio_mapping(folio);
> +	if (!(flags & MF_MUST_KILL) && !folio_test_dirty(folio) && mapping &&
>  	    mapping_can_writeback(mapping)) {
> -		if (page_mkclean(hpage)) {
> -			SetPageDirty(hpage);
> +		if (folio_mkclean(folio)) {
> +			folio_set_dirty(folio);
>  		} else {
>  			ttu &= ~TTU_HWPOISON;
>  			pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
> @@ -1616,7 +1616,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
>  	 */
>  	collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED);
>  
> -	if (PageHuge(hpage) && !PageAnon(hpage)) {
> +	if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
>  		/*
>  		 * For hugetlb pages in shared mappings, try_to_unmap
>  		 * could potentially call huge_pmd_unshare.  Because of
> @@ -1656,7 +1656,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
>  	 * use a more force-full uncatchable kill to prevent
>  	 * any accesses to the poisoned memory.
>  	 */
> -	forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL) ||
> +	forcekill = folio_test_dirty(folio) || (flags & MF_MUST_KILL) ||
>  		    !unmap_success;
>  	kill_procs(&tokill, forcekill, !unmap_success, pfn, flags);
>  
> @@ -2100,7 +2100,7 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
>  
>  	page_flags = folio->flags;
>  
> -	if (!hwpoison_user_mappings(p, pfn, flags, &folio->page)) {
> +	if (!hwpoison_user_mappings(folio, p, pfn, flags)) {
>  		folio_unlock(folio);
>  		return action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
>  	}
> @@ -2367,7 +2367,7 @@ int memory_failure(unsigned long pfn, int flags)
>  	 * Now take care of user space mappings.
>  	 * Abort on fail: __filemap_remove_folio() assumes unmapped page.
>  	 */
> -	if (!hwpoison_user_mappings(p, pfn, flags, p)) {
> +	if (!hwpoison_user_mappings(folio, p, pfn, flags)) {

The folio should always be equivalent to p in the normal 4k page case.

>  		res = action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
>  		goto unlock_page;
>  	}
> 

This patch looks good to me. Thanks.

Acked-by: Miaohe Lin <linmiaohe@huawei.com>



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 6/8] mm/memory-failure: Convert memory_failure() to use a folio
  2024-03-08  8:48   ` Miaohe Lin
@ 2024-03-11 12:31     ` Matthew Wilcox
  2024-03-12  7:07       ` Miaohe Lin
  0 siblings, 1 reply; 42+ messages in thread
From: Matthew Wilcox @ 2024-03-11 12:31 UTC (permalink / raw)
  To: Miaohe Lin; +Cc: linux-mm, Naoya Horiguchi, Andrew Morton

On Fri, Mar 08, 2024 at 04:48:33PM +0800, Miaohe Lin wrote:
> On 2024/3/1 5:20, Matthew Wilcox (Oracle) wrote:
> > @@ -2277,8 +2277,8 @@ int memory_failure(unsigned long pfn, int flags)
> >  		}
> >  	}
> >  
> > -	hpage = compound_head(p);
> > -	if (PageTransHuge(hpage)) {
> > +	folio = page_folio(p);
> > +	if (folio_test_large(folio)) {
> >  		/*
> >  		 * The flag must be set after the refcount is bumped
> >  		 * otherwise it may race with THP split.
[...]
> > @@ -2318,11 +2319,11 @@ int memory_failure(unsigned long pfn, int flags)
> >  	 * race window. If this happens, we could try again to hopefully
> >  	 * handle the page next round.
> >  	 */
> > -	if (PageCompound(p)) {
> > +	if (folio_test_large(folio)) {
> 
> folio_test_large() only checks whether PG_head is set but PageCompound() also checks PageTail().
> So folio_test_large() and PageCompound() are not equivalent?

Assuming we have a refcount on this page so it can't be simultaneously
split/freed/whatever, these three sequences are equivalent:

1	if (PageCompound(p))

2	struct page *head = compound_head(p);
2	if (PageHead(head))

3	struct folio *folio = page_folio(p);
3	if (folio_test_large(folio))



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 6/8] mm/memory-failure: Convert memory_failure() to use a folio
  2024-03-11 12:31     ` Matthew Wilcox
@ 2024-03-12  7:07       ` Miaohe Lin
  2024-03-12 14:14         ` Matthew Wilcox
  0 siblings, 1 reply; 42+ messages in thread
From: Miaohe Lin @ 2024-03-12  7:07 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: linux-mm, Naoya Horiguchi, Andrew Morton, ak

On 2024/3/11 20:31, Matthew Wilcox wrote:
> On Fri, Mar 08, 2024 at 04:48:33PM +0800, Miaohe Lin wrote:
>> On 2024/3/1 5:20, Matthew Wilcox (Oracle) wrote:
>>> @@ -2277,8 +2277,8 @@ int memory_failure(unsigned long pfn, int flags)
>>>  		}
>>>  	}
>>>  
>>> -	hpage = compound_head(p);
>>> -	if (PageTransHuge(hpage)) {
>>> +	folio = page_folio(p);
>>> +	if (folio_test_large(folio)) {
>>>  		/*
>>>  		 * The flag must be set after the refcount is bumped
>>>  		 * otherwise it may race with THP split.
> [...]
>>> @@ -2318,11 +2319,11 @@ int memory_failure(unsigned long pfn, int flags)
>>>  	 * race window. If this happens, we could try again to hopefully
>>>  	 * handle the page next round.
>>>  	 */
>>> -	if (PageCompound(p)) {
>>> +	if (folio_test_large(folio)) {
>>
>> folio_test_large() only checks whether PG_head is set but PageCompound() also checks PageTail().
>> So folio_test_large() and PageCompound() are not equivalent?
> 
> Assuming we have a refcount on this page so it can't be simultaneously
> split/freed/whatever, these three sequences are equivalent:

If page is stable after page refcnt is held, I agree below three sequences are equivalent.

> 
> 1	if (PageCompound(p))
> 
> 2	struct page *head = compound_head(p);
> 2	if (PageHead(head))
> 
> 3	struct folio *folio = page_folio(p);
> 3	if (folio_test_large(folio))
> 
> .
> 

But please see below commit:

"""
commit f37d4298aa7f8b74395aa13c728677e2ed86fdaf
Author: Andi Kleen <ak@linux.intel.com>
Date:   Wed Aug 6 16:06:49 2014 -0700

    hwpoison: fix race with changing page during offlining

    When a hwpoison page is locked it could change state due to parallel
    modifications.  The original compound page can be torn down and then
    this 4k page becomes part of a differently-size compound page is is a
    standalone regular page.

    Check after the lock if the page is still the same compound page.

    We could go back, grab the new head page and try again but it should be
    quite rare, so I thought this was safest.  A retry loop would be more
    difficult to test and may have more side effects.

    The hwpoison code by design only tries to handle cases that are
    reasonably common in workloads, as visible in page-flags.

    I'm not really that concerned about handling this (likely rare case),
    just not crashing on it.

    Signed-off-by: Andi Kleen <ak@linux.intel.com>
    Acked-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
    Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
    Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index a013bc94ebbe..44c6bd201d3a 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1172,6 +1172,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags)

 	lock_page(hpage);

+	/*
+	 * The page could have changed compound pages during the locking.
+	 * If this happens just bail out.
+	 */
+	if (compound_head(p) != hpage) {
+		action_result(pfn, "different compound page after locking", IGNORED);
+		res = -EBUSY;
+		goto out;
+	}
+
 	/*
 	 * We use page flags to determine what action should be taken, but
 	 * the flags can be modified by the error containment action.  One

"""

It says a page could still change into a differently-sized compound page due to parallel
modifications even if an extra page refcount is held and the page is locked. But this commit
is old (ten years ago), so things might have changed. Any thoughts?

Thanks.



^ permalink raw reply related	[flat|nested] 42+ messages in thread

* Re: [PATCH 6/8] mm/memory-failure: Convert memory_failure() to use a folio
  2024-03-12  7:07       ` Miaohe Lin
@ 2024-03-12 14:14         ` Matthew Wilcox
  2024-03-13  1:23           ` Jane Chu
  0 siblings, 1 reply; 42+ messages in thread
From: Matthew Wilcox @ 2024-03-12 14:14 UTC (permalink / raw)
  To: Miaohe Lin; +Cc: linux-mm, Naoya Horiguchi, Andrew Morton, ak

On Tue, Mar 12, 2024 at 03:07:39PM +0800, Miaohe Lin wrote:
> On 2024/3/11 20:31, Matthew Wilcox wrote:
> > Assuming we have a refcount on this page so it can't be simultaneously
> > split/freed/whatever, these three sequences are equivalent:
> 
> If page is stable after page refcnt is held, I agree below three sequences are equivalent.
> 
> > 
> > 1	if (PageCompound(p))
> > 
> > 2	struct page *head = compound_head(p);
> > 2	if (PageHead(head))
> > 
> > 3	struct folio *folio = page_folio(p);
> > 3	if (folio_test_large(folio))
> > 
> > .
> > 
> 
> But please see below commit:
> 
> """
> commit f37d4298aa7f8b74395aa13c728677e2ed86fdaf
> Author: Andi Kleen <ak@linux.intel.com>
> Date:   Wed Aug 6 16:06:49 2014 -0700
> 
>     hwpoison: fix race with changing page during offlining
> 
>     When a hwpoison page is locked it could change state due to parallel
>     modifications.  The original compound page can be torn down and then
>     this 4k page becomes part of a differently-size compound page is is a
>     standalone regular page.
> 
>     Check after the lock if the page is still the same compound page.

I can't speak to what the rules were ten years ago, but this is not
true now.  Compound pages cannot be split if you hold a refcount.
Since we don't track a per-page refcount, we wouldn't know which of
the split pages to give the excess refcount to.
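
For reference, the accounting the split path does looks roughly like the
sketch below (paraphrased from can_split_folio(), so the details may not
match any particular kernel version exactly):

	/* Paraphrase, not the exact source. */
	static bool my_can_split(struct folio *folio)
	{
		int extra_pins;

		/* refs the split code can account for besides the mappings:
		 * swap cache entries for anon, page cache refs for file */
		if (folio_test_anon(folio))
			extra_pins = folio_test_swapcache(folio) ?
					folio_nr_pages(folio) : 0;
		else
			extra_pins = folio_nr_pages(folio);

		/* +1 for the caller's own reference; any unexplained extra
		 * reference (e.g. a GUP pin) should make the split bail out */
		return folio_mapcount(folio) ==
			folio_ref_count(folio) - extra_pins - 1;
	}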


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 6/8] mm/memory-failure: Convert memory_failure() to use a folio
  2024-03-12 14:14         ` Matthew Wilcox
@ 2024-03-13  1:23           ` Jane Chu
  2024-03-14  2:34             ` Miaohe Lin
  2024-03-15  8:32             ` Miaohe Lin
  0 siblings, 2 replies; 42+ messages in thread
From: Jane Chu @ 2024-03-13  1:23 UTC (permalink / raw)
  To: Matthew Wilcox, Miaohe Lin
  Cc: linux-mm, Naoya Horiguchi, Andrew Morton, ak, Jane Chu

On 3/12/2024 7:14 AM, Matthew Wilcox wrote:

> On Tue, Mar 12, 2024 at 03:07:39PM +0800, Miaohe Lin wrote:
>> On 2024/3/11 20:31, Matthew Wilcox wrote:
>>> Assuming we have a refcount on this page so it can't be simultaneously
>>> split/freed/whatever, these three sequences are equivalent:
>> If page is stable after page refcnt is held, I agree below three sequences are equivalent.
>>
>>> 1	if (PageCompound(p))
>>>
>>> 2	struct page *head = compound_head(p);
>>> 2	if (PageHead(head))
>>>
>>> 3	struct folio *folio = page_folio(p);
>>> 3	if (folio_test_large(folio))
>>>
>>> .
>>>
>> But please see below commit:
>>
>> """
>> commit f37d4298aa7f8b74395aa13c728677e2ed86fdaf
>> Author: Andi Kleen <ak@linux.intel.com>
>> Date:   Wed Aug 6 16:06:49 2014 -0700
>>
>>      hwpoison: fix race with changing page during offlining
>>
>>      When a hwpoison page is locked it could change state due to parallel
>>      modifications.  The original compound page can be torn down and then
>>      this 4k page becomes part of a differently-size compound page is is a
>>      standalone regular page.
>>
>>      Check after the lock if the page is still the same compound page.
> I can't speak to what the rules were ten years ago, but this is not
> true now.  Compound pages cannot be split if you hold a refcount.
> Since we don't track a per-page refcount, we wouldn't know which of
> the split pages to give the excess refcount to.

I noticed this recently:

 * GUP pin and PG_locked transferred to @page. Rest subpages can be freed if
 * they are not mapped.
 *
 * Returns 0 if the hugepage is split successfully.
 * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
 * us.
 */
int split_huge_page_to_list(struct page *page, struct list_head *list)
{

I have a test case with poisoned shmem THP page that was mlocked and
GUP pinned (FOLL_LONGTERM|FOLL_WRITE), but the split succeeded.

thanks,

-jane



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/8] mm/memory-failure: Remove fsdax_pgoff argument from __add_to_kill
  2024-02-29 21:20 ` [PATCH 1/8] mm/memory-failure: Remove fsdax_pgoff argument from __add_to_kill Matthew Wilcox (Oracle)
  2024-03-04 12:09   ` Miaohe Lin
@ 2024-03-13  2:07   ` Jane Chu
  2024-03-13  3:23     ` Matthew Wilcox
  2024-03-19  0:36   ` Dan Williams
  2 siblings, 1 reply; 42+ messages in thread
From: Jane Chu @ 2024-03-13  2:07 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle), Andrew Morton, ruansy.fnst
  Cc: linux-mm, Miaohe Lin, Naoya Horiguchi, Dan Williams, Jane Chu

On 2/29/2024 1:20 PM, Matthew Wilcox (Oracle) wrote:

> Unify the KSM and DAX codepaths by calculating the addr in
> add_to_kill_fsdax() instead of telling __add_to_kill() to calculate it.
>
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> Cc: Dan Williams <dan.j.williams@intel.com>
> ---
>   mm/memory-failure.c | 27 +++++++++------------------
>   1 file changed, 9 insertions(+), 18 deletions(-)
>
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 9349948f1abf..9356227a50bb 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -416,21 +416,13 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
>    * not much we can do.	We just print a message and ignore otherwise.
>    */
>   
> -#define FSDAX_INVALID_PGOFF ULONG_MAX
> -
>   /*
>    * Schedule a process for later kill.
>    * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
> - *
> - * Note: @fsdax_pgoff is used only when @p is a fsdax page and a
> - * filesystem with a memory failure handler has claimed the
> - * memory_failure event. In all other cases, page->index and
> - * page->mapping are sufficient for mapping the page back to its
> - * corresponding user virtual address.
>    */
>   static void __add_to_kill(struct task_struct *tsk, struct page *p,
>   			  struct vm_area_struct *vma, struct list_head *to_kill,
> -			  unsigned long ksm_addr, pgoff_t fsdax_pgoff)
> +			  unsigned long addr)
>   {
>   	struct to_kill *tk;
>   
> @@ -440,12 +432,10 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p,
>   		return;
>   	}
>   
> -	tk->addr = ksm_addr ? ksm_addr : page_address_in_vma(p, vma);
> -	if (is_zone_device_page(p)) {
> -		if (fsdax_pgoff != FSDAX_INVALID_PGOFF)
> -			tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
> +	tk->addr = addr ? addr : page_address_in_vma(p, vma);
> +	if (is_zone_device_page(p))
>   		tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);

Not sure about the simplification.  The fsdax special calculation was added by

commit c36e2024957120566efd99395b5c8cc95b5175c1
Author: Shiyang Ruan <ruansy.fnst@fujitsu.com>
Date:   Fri Jun 3 13:37:29 2022 +0800

     mm: introduce mf_dax_kill_procs() for fsdax case

     This new function is a variant of mf_generic_kill_procs that accepts a
     file, offset pair instead of a struct to support multiple files sharing a
     DAX mapping.  It is intended to be called by the file systems as part of
     the memory_failure handler after the file system performed a reverse
     mapping from the storage address to the file and file offset.

     Link: https://lkml.kernel.org/r/20220603053738.1218681-6-ruansy.fnst@fujitsu.com
[..]

@@ -354,9 +357,15 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
        }

        tk->addr = page_address_in_vma(p, vma);
-       if (is_zone_device_page(p))
-               tk->size_shift = dev_pagemap_mapping_shift(p, vma);
-       else
+       if (is_zone_device_page(p)) {
+               /*
+                * Since page->mapping is not used for fsdax, we need
+                * calculate the address based on the vma.
+                */
+               if (p->pgmap->type == MEMORY_DEVICE_FS_DAX)
+                       tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
+               tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
+       } else
                tk->size_shift = page_shift(compound_head(p));

Looks like it was to do away with this check in page_address_in_vma()
(https://elixir.bootlin.com/linux/v6.8/C/ident/page_address_in_vma):

unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
	[..]
	} else if (vma->vm_file->f_mapping != folio->mapping) {
		return -EFAULT;
	}

But, by providing nr=1 in vma_pgoff_address(fsdax_pgoff, 1, vma), it also avoided dealing
with wrap around, which perhaps doesn't matter? Perhaps no one runs fsdax on a 32-bit machine?

Add Shiyang Ruan for comment.

thanks,

-jane



> -	} else
> +	else
>   		tk->size_shift = page_shift(compound_head(p));
>   
>   	/*
> @@ -475,7 +465,7 @@ static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p,
>   				  struct vm_area_struct *vma,
>   				  struct list_head *to_kill)
>   {
> -	__add_to_kill(tsk, p, vma, to_kill, 0, FSDAX_INVALID_PGOFF);
> +	__add_to_kill(tsk, p, vma, to_kill, 0);
>   }
>   
>   #ifdef CONFIG_KSM
> @@ -493,10 +483,10 @@ static bool task_in_to_kill_list(struct list_head *to_kill,
>   }
>   void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
>   		     struct vm_area_struct *vma, struct list_head *to_kill,
> -		     unsigned long ksm_addr)
> +		     unsigned long addr)
>   {
>   	if (!task_in_to_kill_list(to_kill, tsk))
> -		__add_to_kill(tsk, p, vma, to_kill, ksm_addr, FSDAX_INVALID_PGOFF);
> +		__add_to_kill(tsk, p, vma, to_kill, addr);
>   }
>   #endif
>   /*
> @@ -670,7 +660,8 @@ static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p,
>   			      struct vm_area_struct *vma,
>   			      struct list_head *to_kill, pgoff_t pgoff)
>   {
> -	__add_to_kill(tsk, p, vma, to_kill, 0, pgoff);
> +	unsigned long addr = vma_pgoff_address(pgoff, 1, vma);
> +	__add_to_kill(tsk, p, vma, to_kill, addr);
>   }
>   
>   /*


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/8] mm/memory-failure: Remove fsdax_pgoff argument from __add_to_kill
  2024-03-13  2:07   ` Jane Chu
@ 2024-03-13  3:23     ` Matthew Wilcox
  2024-03-13 18:11       ` Jane Chu
  0 siblings, 1 reply; 42+ messages in thread
From: Matthew Wilcox @ 2024-03-13  3:23 UTC (permalink / raw)
  To: Jane Chu
  Cc: Andrew Morton, ruansy.fnst, linux-mm, Miaohe Lin,
	Naoya Horiguchi, Dan Williams

On Tue, Mar 12, 2024 at 07:07:10PM -0700, Jane Chu wrote:
> On 2/29/2024 1:20 PM, Matthew Wilcox (Oracle) wrote:
> 
> > Unify the KSM and DAX codepaths by calculating the addr in
> > add_to_kill_fsdax() instead of telling __add_to_kill() to calculate it.
> > 
> > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> > Cc: Dan Williams <dan.j.williams@intel.com>
> > ---
> >   mm/memory-failure.c | 27 +++++++++------------------
> >   1 file changed, 9 insertions(+), 18 deletions(-)
> > 
> > diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> > index 9349948f1abf..9356227a50bb 100644
> > --- a/mm/memory-failure.c
> > +++ b/mm/memory-failure.c
> > @@ -416,21 +416,13 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
> >    * not much we can do.	We just print a message and ignore otherwise.
> >    */
> > -#define FSDAX_INVALID_PGOFF ULONG_MAX
> > -
> >   /*
> >    * Schedule a process for later kill.
> >    * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
> > - *
> > - * Note: @fsdax_pgoff is used only when @p is a fsdax page and a
> > - * filesystem with a memory failure handler has claimed the
> > - * memory_failure event. In all other cases, page->index and
> > - * page->mapping are sufficient for mapping the page back to its
> > - * corresponding user virtual address.
> >    */
> >   static void __add_to_kill(struct task_struct *tsk, struct page *p,
> >   			  struct vm_area_struct *vma, struct list_head *to_kill,
> > -			  unsigned long ksm_addr, pgoff_t fsdax_pgoff)
> > +			  unsigned long addr)
> >   {
> >   	struct to_kill *tk;
> > @@ -440,12 +432,10 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p,
> >   		return;
> >   	}
> > -	tk->addr = ksm_addr ? ksm_addr : page_address_in_vma(p, vma);
> > -	if (is_zone_device_page(p)) {
> > -		if (fsdax_pgoff != FSDAX_INVALID_PGOFF)
> > -			tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
> > +	tk->addr = addr ? addr : page_address_in_vma(p, vma);
> > +	if (is_zone_device_page(p))
> >   		tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
> 
> Not sure about the simplification.  The fsdax special calculation was added by
> 
> commit c36e2024957120566efd99395b5c8cc95b5175c1
> Author: Shiyang Ruan <ruansy.fnst@fujitsu.com>
> Date:   Fri Jun 3 13:37:29 2022 +0800
> 
>     mm: introduce mf_dax_kill_procs() for fsdax case
> 
>     This new function is a variant of mf_generic_kill_procs that accepts a
>     file, offset pair instead of a struct to support multiple files sharing a
>     DAX mapping.  It is intended to be called by the file systems as part of
>     the memory_failure handler after the file system performed a reverse
>     mapping from the storage address to the file and file offset.
> 
>     Link: https://lkml.kernel.org/r/20220603053738.1218681-6-ruansy.fnst@fujitsu.com
> [..]
> 
> @@ -354,9 +357,15 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
>         }
> 
>         tk->addr = page_address_in_vma(p, vma);
> -       if (is_zone_device_page(p))
> -               tk->size_shift = dev_pagemap_mapping_shift(p, vma);
> -       else
> +       if (is_zone_device_page(p)) {
> +               /*
> +                * Since page->mapping is not used for fsdax, we need
> +                * calculate the address based on the vma.
> +                */
> +               if (p->pgmap->type == MEMORY_DEVICE_FS_DAX)
> +                       tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
> +               tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
> +       } else
>                 tk->size_shift = page_shift(compound_head(p));
> 
> Looks like it was to do away with this check in page_address_in_vma()
> (https://elixir.bootlin.com/linux/v6.8/C/ident/page_address_in_vma):
> 
> unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
> {
> 	[..]
> 	} else if (vma->vm_file->f_mapping != folio->mapping) {
> 		return -EFAULT;
> 	}
> 
> But, by providing nr=1 in vma_pgoff_address(fsdax_pgoff, 1, vma), it also avoided dealing
> with wrap around, which perhaps doesn't matter? Perhaps no one runs fsdax on a 32-bit machine?

I don't understand what you're saying is wrong with this patch.
All I'm doing (apart from the renaming of ksm_addr to addr) is moving
the vma_pgoff_address() call from __add_to_kill() to its caller.
Is there some reason this doesn't work?


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/8] mm/memory-failure: Remove fsdax_pgoff argument from __add_to_kill
  2024-03-13  3:23     ` Matthew Wilcox
@ 2024-03-13 18:11       ` Jane Chu
  2024-03-14  3:51         ` Matthew Wilcox
  0 siblings, 1 reply; 42+ messages in thread
From: Jane Chu @ 2024-03-13 18:11 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Andrew Morton, ruansy.fnst, linux-mm, Miaohe Lin,
	Naoya Horiguchi, Dan Williams, Jane Chu

On 3/12/2024 8:23 PM, Matthew Wilcox wrote:

> On Tue, Mar 12, 2024 at 07:07:10PM -0700, Jane Chu wrote:
>> On 2/29/2024 1:20 PM, Matthew Wilcox (Oracle) wrote:
>>
>>> Unify the KSM and DAX codepaths by calculating the addr in
>>> add_to_kill_fsdax() instead of telling __add_to_kill() to calculate it.
>>>
>>> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
>>> Cc: Dan Williams <dan.j.williams@intel.com>
>>> ---
>>>    mm/memory-failure.c | 27 +++++++++------------------
>>>    1 file changed, 9 insertions(+), 18 deletions(-)
>>>
>>> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
>>> index 9349948f1abf..9356227a50bb 100644
>>> --- a/mm/memory-failure.c
>>> +++ b/mm/memory-failure.c
>>> @@ -416,21 +416,13 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
>>>     * not much we can do.	We just print a message and ignore otherwise.
>>>     */
>>> -#define FSDAX_INVALID_PGOFF ULONG_MAX
>>> -
>>>    /*
>>>     * Schedule a process for later kill.
>>>     * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
>>> - *
>>> - * Note: @fsdax_pgoff is used only when @p is a fsdax page and a
>>> - * filesystem with a memory failure handler has claimed the
>>> - * memory_failure event. In all other cases, page->index and
>>> - * page->mapping are sufficient for mapping the page back to its
>>> - * corresponding user virtual address.
>>>     */
>>>    static void __add_to_kill(struct task_struct *tsk, struct page *p,
>>>    			  struct vm_area_struct *vma, struct list_head *to_kill,
>>> -			  unsigned long ksm_addr, pgoff_t fsdax_pgoff)
>>> +			  unsigned long addr)
>>>    {
>>>    	struct to_kill *tk;
>>> @@ -440,12 +432,10 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p,
>>>    		return;
>>>    	}
>>> -	tk->addr = ksm_addr ? ksm_addr : page_address_in_vma(p, vma);
>>> -	if (is_zone_device_page(p)) {
>>> -		if (fsdax_pgoff != FSDAX_INVALID_PGOFF)
>>> -			tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
>>> +	tk->addr = addr ? addr : page_address_in_vma(p, vma);
>>> +	if (is_zone_device_page(p))
>>>    		tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
>> Not sure about the simplification.  The fsdax special calculation was added by
>>
>> commit c36e2024957120566efd99395b5c8cc95b5175c1
>> Author: Shiyang Ruan <ruansy.fnst@fujitsu.com>
>> Date:   Fri Jun 3 13:37:29 2022 +0800
>>
>>      mm: introduce mf_dax_kill_procs() for fsdax case
>>
>>      This new function is a variant of mf_generic_kill_procs that accepts a
>>      file, offset pair instead of a struct to support multiple files sharing a
>>      DAX mapping.  It is intended to be called by the file systems as part of
>>      the memory_failure handler after the file system performed a reverse
>>      mapping from the storage address to the file and file offset.
>>
>>      Link: https://lkml.kernel.org/r/20220603053738.1218681-6-ruansy.fnst@fujitsu.com
>> [..]
>>
>> @@ -354,9 +357,15 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
>>          }
>>
>>          tk->addr = page_address_in_vma(p, vma);
>> -       if (is_zone_device_page(p))
>> -               tk->size_shift = dev_pagemap_mapping_shift(p, vma);
>> -       else
>> +       if (is_zone_device_page(p)) {
>> +               /*
>> +                * Since page->mapping is not used for fsdax, we need
>> +                * calculate the address based on the vma.
>> +                */
>> +               if (p->pgmap->type == MEMORY_DEVICE_FS_DAX)
>> +                       tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
>> +               tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
>> +       } else
>>                  tk->size_shift = page_shift(compound_head(p));
>>
>> Looks like it was to do away with this check in page_address_in_vma()
>> (https://elixir.bootlin.com/linux/v6.8/C/ident/page_address_in_vma):
>>
>> unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
>> {
>> 	[..]
>> 	} else if (vma->vm_file->f_mapping != folio->mapping) {
>> 		return -EFAULT;
>> 	}
>>
>> But, by providing nr=1 in vma_pgoff_address(fsdax_pgoff, 1, vma), it also avoided dealing
>> with wrap around, which perhaps doesn't matter? Perhaps no one runs fsdax on a 32-bit machine?
> I don't understand what you're saying is wrong with this patch.
> All I'm doing (apart from the renaming of ksm_addr to addr) is moving
> the vma_pgoff_address() call from __add_to_kill() to its caller.
> Is there some reason this doesn't work?

Sorry for not being clear.  What I meant to say is that, before the
patch, page_address_in_vma(p, vma) is the means of determining
tk->addr, except for fsdax when a filesystem has claimed the UE
handling; in that case, vma_pgoff_address(fsdax_pgoff, 1, vma) is used
instead.  The difference between the two is that, although the former
eventually calls the latter _if_ vma->vm_file->f_mapping exists and is
stable, what is unclear to me from Shiyang Ruan's earlier patch is that
he seems to be concerned about a case where f_mapping is not reliable,
hence his patch went straight to calling
vma_pgoff_address(fsdax_pgoff, 1, vma) and, on top of that, provided
nr=1 to ignore the address wrap-around case.

So I don't know whether removing the fsdax special case is okay.

thanks!

-jane




^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 6/8] mm/memory-failure: Convert memory_failure() to use a folio
  2024-03-13  1:23           ` Jane Chu
@ 2024-03-14  2:34             ` Miaohe Lin
  2024-03-14 18:15               ` Jane Chu
  2024-03-15  8:32             ` Miaohe Lin
  1 sibling, 1 reply; 42+ messages in thread
From: Miaohe Lin @ 2024-03-14  2:34 UTC (permalink / raw)
  To: Jane Chu, Matthew Wilcox; +Cc: linux-mm, Naoya Horiguchi, Andrew Morton, ak

On 2024/3/13 9:23, Jane Chu wrote:
> On 3/12/2024 7:14 AM, Matthew Wilcox wrote:
> 
>> On Tue, Mar 12, 2024 at 03:07:39PM +0800, Miaohe Lin wrote:
>>> On 2024/3/11 20:31, Matthew Wilcox wrote:
>>>> Assuming we have a refcount on this page so it can't be simultaneously
>>>> split/freed/whatever, these three sequences are equivalent:
>>> If page is stable after page refcnt is held, I agree below three sequences are equivalent.
>>>
>>>> 1    if (PageCompound(p))
>>>>
>>>> 2    struct page *head = compound_head(p);
>>>> 2    if (PageHead(head))
>>>>
>>>> 3    struct folio *folio = page_folio(p);
>>>> 3    if (folio_test_large(folio))
>>>>
>>>> .
>>>>
>>> But please see below commit:
>>>
>>> """
>>> commit f37d4298aa7f8b74395aa13c728677e2ed86fdaf
>>> Author: Andi Kleen <ak@linux.intel.com>
>>> Date:   Wed Aug 6 16:06:49 2014 -0700
>>>
>>>      hwpoison: fix race with changing page during offlining
>>>
>>>      When a hwpoison page is locked it could change state due to parallel
>>>      modifications.  The original compound page can be torn down and then
>>>      this 4k page becomes part of a differently-size compound page is is a
>>>      standalone regular page.
>>>
>>>      Check after the lock if the page is still the same compound page.
>> I can't speak to what the rules were ten years ago, but this is not
>> true now.  Compound pages cannot be split if you hold a refcount.
>> Since we don't track a per-page refcount, we wouldn't know which of
>> the split pages to give the excess refcount to.
> 
> I noticed this recently
> 
>  * GUP pin and PG_locked transferred to @page. Rest subpages can be freed if
>  * they are not mapped.
>  *
>  * Returns 0 if the hugepage is split successfully.
>  * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
>  * us.
>  */
> int split_huge_page_to_list(struct page *page, struct list_head *list)
> {
> 
> I have a test case with poisoned shmem THP page that was mlocked and
> GUP pinned (FOLL_LONGTERM|FOLL_WRITE), but the split succeeded.

Thanks for pointing this out. Compound pages can be split even if an extra refcount is held. So the
folio_test_large check is not stable even if we hold a refcount now? Will that introduce some obscure races?

Apart from that, I think a page cannot become a subpage of a THP while an extra refcount is held now, so the code below can be removed.
Any thoughts?

	/*
	 * We're only intended to deal with the non-Compound page here.
	 * However, the page could have changed compound pages due to
	 * race window. If this happens, we could try again to hopefully
	 * handle the page next round.
	 */
	if (PageCompound(p)) {
		if (retry) {
			ClearPageHWPoison(p);
			unlock_page(p);
			put_page(p);
			flags &= ~MF_COUNT_INCREASED;
			retry = false;
			goto try_again;
		}
		res = action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
		goto unlock_page;
	}

Thanks.

> 
> thanks,
> 
> -jane
> 
> .



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/8] mm/memory-failure: Remove fsdax_pgoff argument from __add_to_kill
  2024-03-13 18:11       ` Jane Chu
@ 2024-03-14  3:51         ` Matthew Wilcox
  2024-03-14 17:54           ` Jane Chu
  0 siblings, 1 reply; 42+ messages in thread
From: Matthew Wilcox @ 2024-03-14  3:51 UTC (permalink / raw)
  To: Jane Chu
  Cc: Andrew Morton, ruansy.fnst, linux-mm, Miaohe Lin,
	Naoya Horiguchi, Dan Williams

On Wed, Mar 13, 2024 at 11:11:04AM -0700, Jane Chu wrote:
> On 3/12/2024 8:23 PM, Matthew Wilcox wrote:
> > I don't understand what you're saying is wrong with this patch.
> > All I'm doing (apart from the renaming of ksm_addr to addr) is moving
> > the vma_pgoff_address() call from __add_to_kill() to its caller.
> > Is there some reason this doesn't work?
> 
> Sorry for not being clear.  What I meant to say is that, before the
> patch, page_address_in_vma(p, vma) is the means of determining
> tk->addr, except for fsdax when a filesystem has claimed the UE
> handling; in that case, vma_pgoff_address(fsdax_pgoff, 1, vma) is used
> instead.  The difference between the two is that, although the former
> eventually calls the latter _if_ vma->vm_file->f_mapping exists and is
> stable, what is unclear to me from Shiyang Ruan's earlier patch is that
> he seems to be concerned about a case where f_mapping is not reliable,
> hence his patch went straight to calling
> vma_pgoff_address(fsdax_pgoff, 1, vma) and, on top of that, provided
> nr=1 to ignore the address wrap-around case.
> 
> So I don't know whether removing the fsdax special case is okay.

I don't think I'm removing the fsdax special case, just moving it.

If I subtract out the renaming from this patch, it looks like:

 	tk->addr = addr ? addr : page_address_in_vma(p, vma);
-	if (is_zone_device_page(p)) {
-		if (fsdax_pgoff != FSDAX_INVALID_PGOFF)
-			tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
+	if (is_zone_device_page(p)) {
...
 {
-	__add_to_kill(tsk, p, vma, to_kill, 0, pgoff);
+	unsigned long addr = vma_pgoff_address(pgoff, 1, vma);
+	__add_to_kill(tsk, p, vma, to_kill, addr);
 }

So instead of passing in '0' as the addr from add_to_kill_fsdax(),
we do the address lookup in add_to_kill_fsdax() and pass it in.
That means we don't call page_address_in_vma() for DAX because
addr is not 0.


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/8] mm/memory-failure: Remove fsdax_pgoff argument from __add_to_kill
  2024-03-14  3:51         ` Matthew Wilcox
@ 2024-03-14 17:54           ` Jane Chu
  0 siblings, 0 replies; 42+ messages in thread
From: Jane Chu @ 2024-03-14 17:54 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Andrew Morton, ruansy.fnst, linux-mm, Miaohe Lin,
	Naoya Horiguchi, Dan Williams

On 3/13/2024 8:51 PM, Matthew Wilcox wrote:

> On Wed, Mar 13, 2024 at 11:11:04AM -0700, Jane Chu wrote:
>> On 3/12/2024 8:23 PM, Matthew Wilcox wrote:
>>> I don't understand what you're saying is wrong with this patch.
>>> All I'm doing (apart from the renaming of ksm_addr to addr) is moving
>>> the vma_pgoff_address() call from __add_to_kill() to its caller.
>>> Is there some reason this doesn't work?
>> Sorry for not being clear.  What I meant to say is that, before the
>> patch, page_address_in_vma(p, vma) is the means of determining
>> tk->addr, except for fsdax when a filesystem has claimed the UE
>> handling; in that case, vma_pgoff_address(fsdax_pgoff, 1, vma) is used
>> instead.  The difference between the two is that, although the former
>> eventually calls the latter _if_ vma->vm_file->f_mapping exists and is
>> stable, what is unclear to me from Shiyang Ruan's earlier patch is that
>> he seems to be concerned about a case where f_mapping is not reliable,
>> hence his patch went straight to calling
>> vma_pgoff_address(fsdax_pgoff, 1, vma) and, on top of that, provided
>> nr=1 to ignore the address wrap-around case.
>>
>> So I don't know whether removing the fsdax special case is okay.
> I don't think I'm removing the fsdax special case, just moving it.
>
> If I subtract out the renaming from this patch, it looks like:
>
>   	tk->addr = addr ? addr : page_address_in_vma(p, vma);
> -	if (is_zone_device_page(p)) {
> -		if (fsdax_pgoff != FSDAX_INVALID_PGOFF)
> -			tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
> +	if (is_zone_device_page(p)) {
> ...
>   {
> -	__add_to_kill(tsk, p, vma, to_kill, 0, pgoff);
> +	unsigned long addr = vma_pgoff_address(pgoff, 1, vma);
> +	__add_to_kill(tsk, p, vma, to_kill, addr);
>   }
>
> So instead of passing in '0' as the addr from add_to_kill_fsdax(),
> we do the address lookup in add_to_kill_fsdax() and pass it in.
> That means we don't call page_address_in_vma() for DAX because
> addr is not 0.

You're right!  I overlooked that the addr being passed in is different
in the fsdax case.

Thanks for taking the time to explain this to me.

Reviewed-by: Jane Chu <jane.chu@oracle.com>

-jane



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 6/8] mm/memory-failure: Convert memory_failure() to use a folio
  2024-03-14  2:34             ` Miaohe Lin
@ 2024-03-14 18:15               ` Jane Chu
  2024-03-15  6:25                 ` Miaohe Lin
  0 siblings, 1 reply; 42+ messages in thread
From: Jane Chu @ 2024-03-14 18:15 UTC (permalink / raw)
  To: Miaohe Lin, Matthew Wilcox; +Cc: linux-mm, Naoya Horiguchi, Andrew Morton, ak

On 3/13/2024 7:34 PM, Miaohe Lin wrote:

> On 2024/3/13 9:23, Jane Chu wrote:
>> On 3/12/2024 7:14 AM, Matthew Wilcox wrote:
>>
>>> On Tue, Mar 12, 2024 at 03:07:39PM +0800, Miaohe Lin wrote:
>>>> On 2024/3/11 20:31, Matthew Wilcox wrote:
>>>>> Assuming we have a refcount on this page so it can't be simultaneously
>>>>> split/freed/whatever, these three sequences are equivalent:
>>>> If page is stable after page refcnt is held, I agree below three sequences are equivalent.
>>>>
>>>>> 1    if (PageCompound(p))
>>>>>
>>>>> 2    struct page *head = compound_head(p);
>>>>> 2    if (PageHead(head))
>>>>>
>>>>> 3    struct folio *folio = page_folio(p);
>>>>> 3    if (folio_test_large(folio))
>>>>>
>>>>> .
>>>>>
>>>> But please see below commit:
>>>>
>>>> """
>>>> commit f37d4298aa7f8b74395aa13c728677e2ed86fdaf
>>>> Author: Andi Kleen <ak@linux.intel.com>
>>>> Date:   Wed Aug 6 16:06:49 2014 -0700
>>>>
>>>>       hwpoison: fix race with changing page during offlining
>>>>
>>>>       When a hwpoison page is locked it could change state due to parallel
>>>>       modifications.  The original compound page can be torn down and then
>>>>       this 4k page becomes part of a differently-size compound page is is a
>>>>       standalone regular page.
>>>>
>>>>       Check after the lock if the page is still the same compound page.
>>> I can't speak to what the rules were ten years ago, but this is not
>>> true now.  Compound pages cannot be split if you hold a refcount.
>>> Since we don't track a per-page refcount, we wouldn't know which of
>>> the split pages to give the excess refcount to.
>> I noticed this recently
>>
>>   * GUP pin and PG_locked transferred to @page. Rest subpages can be freed if
>>   * they are not mapped.
>>   *
>>   * Returns 0 if the hugepage is split successfully.
>>   * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
>>   * us.
>>   */
>> int split_huge_page_to_list(struct page *page, struct list_head *list)
>> {
>>
>> I have a test case with poisoned shmem THP page that was mlocked and
>>
>> GUP pinned (FOLL_LONGTERM|FOLL_WRITE), but the split succeeded.
> Thanks for points this out. Compound pages can be split even if extra refcnt is held. So folio_test_large
> check is not stable if we hold a refcnt now? Will it introduce some obscure races?
>
> Except from that, I think a page cannot become a subpage of a THP when extra refcnt is held now. So below code can be removed.
> Any thought?
>
> 	/*
> 	 * We're only intended to deal with the non-Compound page here.
> 	 * However, the page could have changed compound pages due to
> 	 * race window. If this happens, we could try again to hopefully
> 	 * handle the page next round.
> 	 */
> 	if (PageCompound(p)) {
> 		if (retry) {
> 			ClearPageHWPoison(p);
> 			unlock_page(p);
> 			put_page(p);
> 			flags &= ~MF_COUNT_INCREASED;
> 			retry = false;
> 			goto try_again;
> 		}
> 		res = action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
> 		goto unlock_page;
> 	}
I'm not sure what scenario it was meant to deal with.  How about adding a 
warning instead of removing the block? It would be interesting to see how the 
warning gets triggered. If nothing shows up after a while, then remove it.
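
Roughly something like this, keeping the existing retry logic (just a sketch
of the idea, untested):

	if (PageCompound(p)) {
		/* believed unreachable now; warn so we find out if it ever fires */
		WARN_ON_ONCE(1);
		if (retry) {
			ClearPageHWPoison(p);
			unlock_page(p);
			put_page(p);
			flags &= ~MF_COUNT_INCREASED;
			retry = false;
			goto try_again;
		}
		res = action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
		goto unlock_page;
	}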

thanks!

-jane

>
> Thanks.
>
>> thanks,
>>
>> -jane
>>
>> .


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 6/8] mm/memory-failure: Convert memory_failure() to use a folio
  2024-03-14 18:15               ` Jane Chu
@ 2024-03-15  6:25                 ` Miaohe Lin
  0 siblings, 0 replies; 42+ messages in thread
From: Miaohe Lin @ 2024-03-15  6:25 UTC (permalink / raw)
  To: Jane Chu, Matthew Wilcox; +Cc: linux-mm, Naoya Horiguchi, Andrew Morton, ak

On 2024/3/15 2:15, Jane Chu wrote:
> On 3/13/2024 7:34 PM, Miaohe Lin wrote:
> 
>> On 2024/3/13 9:23, Jane Chu wrote:
>>> On 3/12/2024 7:14 AM, Matthew Wilcox wrote:
>>>
>>>> On Tue, Mar 12, 2024 at 03:07:39PM +0800, Miaohe Lin wrote:
>>>>> On 2024/3/11 20:31, Matthew Wilcox wrote:
>>>>>> Assuming we have a refcount on this page so it can't be simultaneously
>>>>>> split/freed/whatever, these three sequences are equivalent:
>>>>> If page is stable after page refcnt is held, I agree below three sequences are equivalent.
>>>>>
>>>>>> 1    if (PageCompound(p))
>>>>>>
>>>>>> 2    struct page *head = compound_head(p);
>>>>>> 2    if (PageHead(head))
>>>>>>
>>>>>> 3    struct folio *folio = page_folio(p);
>>>>>> 3    if (folio_test_large(folio))
>>>>>>
>>>>>> .
>>>>>>
>>>>> But please see below commit:
>>>>>
>>>>> """
>>>>> commit f37d4298aa7f8b74395aa13c728677e2ed86fdaf
>>>>> Author: Andi Kleen <ak@linux.intel.com>
>>>>> Date:   Wed Aug 6 16:06:49 2014 -0700
>>>>>
>>>>>       hwpoison: fix race with changing page during offlining
>>>>>
>>>>>       When a hwpoison page is locked it could change state due to parallel
>>>>>       modifications.  The original compound page can be torn down and then
>>>>>       this 4k page becomes part of a differently-size compound page is is a
>>>>>       standalone regular page.
>>>>>
>>>>>       Check after the lock if the page is still the same compound page.
>>>> I can't speak to what the rules were ten years ago, but this is not
>>>> true now.  Compound pages cannot be split if you hold a refcount.
>>>> Since we don't track a per-page refcount, we wouldn't know which of
>>>> the split pages to give the excess refcount to.
>>> I noticed this recently
>>>
>>>   * GUP pin and PG_locked transferred to @page. Rest subpages can be freed if
>>>   * they are not mapped.
>>>   *
>>>   * Returns 0 if the hugepage is split successfully.
>>>   * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
>>>   * us.
>>>   */
>>> int split_huge_page_to_list(struct page *page, struct list_head *list)
>>> {
>>>
>>> I have a test case with poisoned shmem THP page that was mlocked and
>>>
>>> GUP pinned (FOLL_LONGTERM|FOLL_WRITE), but the split succeeded.
>> Thanks for points this out. Compound pages can be split even if extra refcnt is held. So folio_test_large
>> check is not stable if we hold a refcnt now? Will it introduce some obscure races?
>>
>> Except from that, I think a page cannot become a subpage of a THP when extra refcnt is held now. So below code can be removed.
>> Any thought?
>>
>>     /*
>>      * We're only intended to deal with the non-Compound page here.
>>      * However, the page could have changed compound pages due to
>>      * race window. If this happens, we could try again to hopefully
>>      * handle the page next round.
>>      */
>>     if (PageCompound(p)) {
>>         if (retry) {
>>             ClearPageHWPoison(p);
>>             unlock_page(p);
>>             put_page(p);
>>             flags &= ~MF_COUNT_INCREASED;
>>             retry = false;
>>             goto try_again;
>>         }
>>         res = action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
>>         goto unlock_page;
>>     }
> Not sure of what scenario it was meant to deal with.  How about adding a warning instead of removal? It'll be interesting to see how the warning got triggered. But if after a while nothing happens, then remove it.

This sounds like a good alternative. Thanks.




^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 6/8] mm/memory-failure: Convert memory_failure() to use a folio
  2024-03-13  1:23           ` Jane Chu
  2024-03-14  2:34             ` Miaohe Lin
@ 2024-03-15  8:32             ` Miaohe Lin
  2024-03-15 19:22               ` Jane Chu
  1 sibling, 1 reply; 42+ messages in thread
From: Miaohe Lin @ 2024-03-15  8:32 UTC (permalink / raw)
  To: Jane Chu; +Cc: linux-mm, Naoya Horiguchi, Andrew Morton, ak, Matthew Wilcox

On 2024/3/13 9:23, Jane Chu wrote:
> On 3/12/2024 7:14 AM, Matthew Wilcox wrote:
> 
>> On Tue, Mar 12, 2024 at 03:07:39PM +0800, Miaohe Lin wrote:
>>> On 2024/3/11 20:31, Matthew Wilcox wrote:
>>>> Assuming we have a refcount on this page so it can't be simultaneously
>>>> split/freed/whatever, these three sequences are equivalent:
>>> If page is stable after page refcnt is held, I agree below three sequences are equivalent.
>>>
>>>> 1    if (PageCompound(p))
>>>>
>>>> 2    struct page *head = compound_head(p);
>>>> 2    if (PageHead(head))
>>>>
>>>> 3    struct folio *folio = page_folio(p);
>>>> 3    if (folio_test_large(folio))
>>>>
>>>> .
>>>>
>>> But please see below commit:
>>>
>>> """
>>> commit f37d4298aa7f8b74395aa13c728677e2ed86fdaf
>>> Author: Andi Kleen <ak@linux.intel.com>
>>> Date:   Wed Aug 6 16:06:49 2014 -0700
>>>
>>>      hwpoison: fix race with changing page during offlining
>>>
>>>      When a hwpoison page is locked it could change state due to parallel
>>>      modifications.  The original compound page can be torn down and then
>>>      this 4k page becomes part of a differently-size compound page is is a
>>>      standalone regular page.
>>>
>>>      Check after the lock if the page is still the same compound page.
>> I can't speak to what the rules were ten years ago, but this is not
>> true now.  Compound pages cannot be split if you hold a refcount.
>> Since we don't track a per-page refcount, we wouldn't know which of
>> the split pages to give the excess refcount to.
> 
> I noticed this recently
> 
>  * GUP pin and PG_locked transferred to @page. Rest subpages can be freed if
>  * they are not mapped.
>  *
>  * Returns 0 if the hugepage is split successfully.
>  * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
>  * us.
>  */
> int split_huge_page_to_list(struct page *page, struct list_head *list)
> {
> 
> I have a test case with poisoned shmem THP page that was mlocked and
> 
> GUP pinned (FOLL_LONGTERM|FOLL_WRITE), but the split succeeded.

Could you describe your test case in a bit more detail? There is a check in split_huge_page_to_list():

/* Racy check whether the huge page can be split */
bool can_split_folio(struct folio *folio, int *pextra_pins)
{
	int extra_pins;

	/* Additional pins from page cache */
	if (folio_test_anon(folio))
		extra_pins = folio_test_swapcache(folio) ?
				folio_nr_pages(folio) : 0;
	else
		extra_pins = folio_nr_pages(folio);
	if (pextra_pins)
		*pextra_pins = extra_pins;
	return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1;
}

So a large folio can only be split if exactly one extra page refcnt is held. That means a large folio won't be split from
under us while we hold a page refcnt. Or am I missing something?
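
Spelling out the arithmetic for the case at hand (my reading, assuming an anon
THP that is not in the swapcache, so extra_pins == 0, and that every anon
mapping also holds a folio reference):

  - each mapping adds +1 to both folio_mapcount() and folio_ref_count()
  - the caller of the split holds +1 on folio_ref_count(), which is the "- 1"
  - the extra reference taken by memory-failure adds another +1 to folio_ref_count()

So folio_ref_count() == folio_mapcount() + 2 while the check requires
folio_mapcount() == folio_ref_count() - 1, and the split is refused.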

Thanks.

> 
> thanks,
> 
> -jane
> 
> .



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 6/8] mm/memory-failure: Convert memory_failure() to use a folio
  2024-03-15  8:32             ` Miaohe Lin
@ 2024-03-15 19:22               ` Jane Chu
  2024-03-18  2:28                 ` Miaohe Lin
  0 siblings, 1 reply; 42+ messages in thread
From: Jane Chu @ 2024-03-15 19:22 UTC (permalink / raw)
  To: Miaohe Lin; +Cc: linux-mm, Naoya Horiguchi, Andrew Morton, ak, Matthew Wilcox

On 3/15/2024 1:32 AM, Miaohe Lin wrote:

> On 2024/3/13 9:23, Jane Chu wrote:
>> On 3/12/2024 7:14 AM, Matthew Wilcox wrote:
>>
>>> On Tue, Mar 12, 2024 at 03:07:39PM +0800, Miaohe Lin wrote:
>>>> On 2024/3/11 20:31, Matthew Wilcox wrote:
>>>>> Assuming we have a refcount on this page so it can't be simultaneously
>>>>> split/freed/whatever, these three sequences are equivalent:
>>>> If page is stable after page refcnt is held, I agree below three sequences are equivalent.
>>>>
>>>>> 1    if (PageCompound(p))
>>>>>
>>>>> 2    struct page *head = compound_head(p);
>>>>> 2    if (PageHead(head))
>>>>>
>>>>> 3    struct folio *folio = page_folio(p);
>>>>> 3    if (folio_test_large(folio))
>>>>>
>>>>> .
>>>>>
>>>> But please see below commit:
>>>>
>>>> """
>>>> commit f37d4298aa7f8b74395aa13c728677e2ed86fdaf
>>>> Author: Andi Kleen <ak@linux.intel.com>
>>>> Date:   Wed Aug 6 16:06:49 2014 -0700
>>>>
>>>>       hwpoison: fix race with changing page during offlining
>>>>
>>>>       When a hwpoison page is locked it could change state due to parallel
>>>>       modifications.  The original compound page can be torn down and then
>>>>       this 4k page becomes part of a differently-size compound page is is a
>>>>       standalone regular page.
>>>>
>>>>       Check after the lock if the page is still the same compound page.
>>> I can't speak to what the rules were ten years ago, but this is not
>>> true now.  Compound pages cannot be split if you hold a refcount.
>>> Since we don't track a per-page refcount, we wouldn't know which of
>>> the split pages to give the excess refcount to.
>> I noticed this recently
>>
>>   * GUP pin and PG_locked transferred to @page. Rest subpages can be freed if
>>   * they are not mapped.
>>   *
>>   * Returns 0 if the hugepage is split successfully.
>>   * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
>>   * us.
>>   */
>> int split_huge_page_to_list(struct page *page, struct list_head *list)
>> {
>>
>> I have a test case with poisoned shmem THP page that was mlocked and
>>
>> GUP pinned (FOLL_LONGTERM|FOLL_WRITE), but the split succeeded.
> Can you elaborate your test case a little bit more detail? There is a check in split_huge_page_to_list():
>
> /* Racy check whether the huge page can be split */
> bool can_split_folio(struct folio *folio, int *pextra_pins)
> {
> 	int extra_pins;
>
> 	/* Additional pins from page cache */
> 	if (folio_test_anon(folio))
> 		extra_pins = folio_test_swapcache(folio) ?
> 				folio_nr_pages(folio) : 0;
> 	else
> 		extra_pins = folio_nr_pages(folio);
> 	if (pextra_pins)
> 		*pextra_pins = extra_pins;
> 	return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1;
> }
>
> So a large folio can only be split if only one extra page refcnt is held. It means large folio won't be split from
> under us if we hold an page refcnt. Or am I miss something?
My experiment was with an older kernel, though the can_split check is 
the same.
Also, I was emulating a GUP pin with a hack: in madvise_inject_error(), 
I replaced
get_user_pages_fast(start, 1, 0, &page) with
pin_user_pages_fast(start, 1, FOLL_WRITE|FOLL_LONGTERM, &page).
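In diff form the hack is just this (only the call itself shown; the surrounding
code in madvise_inject_error() was left as it was):

-	get_user_pages_fast(start, 1, 0, &page);
+	pin_user_pages_fast(start, 1, FOLL_WRITE | FOLL_LONGTERM, &page);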
I suspect something might be wrong with my hack; I'm trying to reproduce 
it with a real GUP pin on a newer kernel.
Will keep you informed.
thanks!
-jane


>
> Thanks.
>
>> thanks,
>>
>> -jane
>>
>> .


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 6/8] mm/memory-failure: Convert memory_failure() to use a folio
  2024-03-15 19:22               ` Jane Chu
@ 2024-03-18  2:28                 ` Miaohe Lin
  0 siblings, 0 replies; 42+ messages in thread
From: Miaohe Lin @ 2024-03-18  2:28 UTC (permalink / raw)
  To: Jane Chu; +Cc: linux-mm, Naoya Horiguchi, Andrew Morton, ak, Matthew Wilcox

On 2024/3/16 3:22, Jane Chu wrote:
> On 3/15/2024 1:32 AM, Miaohe Lin wrote:
> 
>> On 2024/3/13 9:23, Jane Chu wrote:
>>> On 3/12/2024 7:14 AM, Matthew Wilcox wrote:
>>>
>>>> On Tue, Mar 12, 2024 at 03:07:39PM +0800, Miaohe Lin wrote:
>>>>> On 2024/3/11 20:31, Matthew Wilcox wrote:
>>>>>> Assuming we have a refcount on this page so it can't be simultaneously
>>>>>> split/freed/whatever, these three sequences are equivalent:
>>>>> If page is stable after page refcnt is held, I agree below three sequences are equivalent.
>>>>>
>>>>>> 1    if (PageCompound(p))
>>>>>>
>>>>>> 2    struct page *head = compound_head(p);
>>>>>> 2    if (PageHead(head))
>>>>>>
>>>>>> 3    struct folio *folio = page_folio(p);
>>>>>> 3    if (folio_test_large(folio))
>>>>>>
>>>>>> .
>>>>>>
>>>>> But please see below commit:
>>>>>
>>>>> """
>>>>> commit f37d4298aa7f8b74395aa13c728677e2ed86fdaf
>>>>> Author: Andi Kleen <ak@linux.intel.com>
>>>>> Date:   Wed Aug 6 16:06:49 2014 -0700
>>>>>
>>>>>       hwpoison: fix race with changing page during offlining
>>>>>
>>>>>       When a hwpoison page is locked it could change state due to parallel
>>>>>       modifications.  The original compound page can be torn down and then
>>>>>       this 4k page becomes part of a differently-size compound page is is a
>>>>>       standalone regular page.
>>>>>
>>>>>       Check after the lock if the page is still the same compound page.
>>>> I can't speak to what the rules were ten years ago, but this is not
>>>> true now.  Compound pages cannot be split if you hold a refcount.
>>>> Since we don't track a per-page refcount, we wouldn't know which of
>>>> the split pages to give the excess refcount to.
>>> I noticed this recently
>>>
>>>   * GUP pin and PG_locked transferred to @page. Rest subpages can be freed if
>>>   * they are not mapped.
>>>   *
>>>   * Returns 0 if the hugepage is split successfully.
>>>   * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
>>>   * us.
>>>   */
>>> int split_huge_page_to_list(struct page *page, struct list_head *list)
>>> {
>>>
>>> I have a test case with poisoned shmem THP page that was mlocked and
>>>
>>> GUP pinned (FOLL_LONGTERM|FOLL_WRITE), but the split succeeded.
>> Can you elaborate your test case a little bit more detail? There is a check in split_huge_page_to_list():
>>
>> /* Racy check whether the huge page can be split */
>> bool can_split_folio(struct folio *folio, int *pextra_pins)
>> {
>>     int extra_pins;
>>
>>     /* Additional pins from page cache */
>>     if (folio_test_anon(folio))
>>         extra_pins = folio_test_swapcache(folio) ?
>>                 folio_nr_pages(folio) : 0;
>>     else
>>         extra_pins = folio_nr_pages(folio);
>>     if (pextra_pins)
>>         *pextra_pins = extra_pins;
>>     return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1;
>> }
>>
>> So a large folio can only be split if only one extra page refcnt is held. It means large folio won't be split from
>> under us if we hold an page refcnt. Or am I miss something?
> My experiment was with an older kernel, though the can_split check is the same.
> Also, I was emulating GUP pin with a hack:  in madvise_inject_error(), replaced
> get_user_pages_fast(start, 1, 0, &page) with
> pin_user_pages_fast(start, 1, FOLL_WRITE|FOLL_LONGTERM, &page)

IIUC, get_user_pages_fast() and pin_user_pages_fast(FOLL_LONGTERM) will both call try_grab_folio() to take an extra page refcnt.
get_user_pages_fast() will have FOLL_GET set while pin_user_pages_fast() will have FOLL_PIN set. It seems they behave the same
for a large folio as far as the page refcnt is concerned.

 *
 *    FOLL_GET: folio's refcount will be incremented by @refs.
 *
 *    FOLL_PIN on large folios: folio's refcount will be incremented by
 *    @refs, and its pincount will be incremented by @refs.
 *
 *    FOLL_PIN on single-page folios: folio's refcount will be incremented by
 *    @refs * GUP_PIN_COUNTING_BIAS.
 *
 * Return: The folio containing @page (with refcount appropriately
 * incremented) for success, or NULL upon failure. If neither FOLL_GET
 * nor FOLL_PIN was set, that's considered failure, and furthermore,
 * a likely bug in the caller, so a warning is also emitted.
 */
struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)

They will both call try_get_folio(page, refs) to take the page refcnt. So your hack emulating a GUP pin does not seem
to work the way you expected. Or am I missing something?
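
For reference, the relevant part of try_grab_folio() boils down to roughly the
following (simplified from my reading of mm/gup.c; error paths and the
zero-page / longterm-pinnable checks are omitted):

	if (flags & FOLL_GET)
		return try_get_folio(page, refs);

	/* FOLL_PIN */
	folio = try_get_folio(page, refs);
	if (!folio)
		return NULL;

	if (folio_test_large(folio))
		atomic_add(refs, &folio->_pincount);
	else
		folio_ref_add(folio, refs * (GUP_PIN_COUNTING_BIAS - 1));

	return folio;

So for a large folio both FOLL_GET and FOLL_PIN raise the refcount by the same
@refs; FOLL_PIN only additionally bumps _pincount, which the refcount check in
can_split_folio() never looks at.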

Thanks.

> I suspect something might be wrong with my hack, I'm trying to reproduce with real GUP pin and on a newer kernel.
> Will keep you informed.
> thanks!
> -jane
> 
> 
>>
>> Thanks.
>>
>>> thanks,
>>>
>>> -jane
>>>
>>> .
> .



^ permalink raw reply	[flat|nested] 42+ messages in thread

* RE: [PATCH 1/8] mm/memory-failure: Remove fsdax_pgoff argument from __add_to_kill
  2024-02-29 21:20 ` [PATCH 1/8] mm/memory-failure: Remove fsdax_pgoff argument from __add_to_kill Matthew Wilcox (Oracle)
  2024-03-04 12:09   ` Miaohe Lin
  2024-03-13  2:07   ` Jane Chu
@ 2024-03-19  0:36   ` Dan Williams
  2 siblings, 0 replies; 42+ messages in thread
From: Dan Williams @ 2024-03-19  0:36 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle), Andrew Morton
  Cc: Matthew Wilcox (Oracle),
	linux-mm, Miaohe Lin, Naoya Horiguchi, Dan Williams

Matthew Wilcox (Oracle) wrote:
> Unify the KSM and DAX codepaths by calculating the addr in
> add_to_kill_fsdax() instead of telling __add_to_kill() to calculate it.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> Cc: Dan Williams <dan.j.williams@intel.com>

Looks good, passes tests.

Reviewed-by: Dan Williams <dan.j.williams@intel.com>


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 4/8] mm/memory-failure: Convert shake_page() to shake_folio()
  2024-03-06  9:31   ` Miaohe Lin
@ 2024-04-08 15:36     ` Matthew Wilcox
  2024-04-08 18:31       ` Jane Chu
  0 siblings, 1 reply; 42+ messages in thread
From: Matthew Wilcox @ 2024-04-08 15:36 UTC (permalink / raw)
  To: Miaohe Lin; +Cc: linux-mm, Naoya Horiguchi, Andrew Morton

On Wed, Mar 06, 2024 at 05:31:19PM +0800, Miaohe Lin wrote:
> > -void shake_page(struct page *p)
> > +void shake_folio(struct folio *folio)
> 
> It might not be a good idea to convert shake_page() into shake_folio(). shake_page() can
> be called without holding page refcnt. So the below race could happen:
> 
>  hwpoison_inject
>   folio = page_folio(p); --> assume p is 4k page [1]
>   shake_folio(folio)
>    folio_test_slab(folio)
>     test_bit(PG_slab, folio_flags(folio, FOLIO_PF_NO_TAIL) [2]
>      VM_BUG_ON_PGFLAGS(PageTail(page), page) in folio_flags
> 
> Between [1] and [2], page can become PageTail of a THP. So the VM_BUG_ON_PGFLAGS will trigger.
> Or am I miss something?

No, you're not missing anything.  This race can happen.  However, I've
removed the VM_BUG_ON for folio_test_slab() with "mm: free up PG_slab".
Now it goes through:

static inline bool PageSlab(const struct page *page)
{
        return folio_test_slab(page_folio(page));
}

static __always_inline bool folio_test_##fname(const struct folio *folio)\
{                                                                       \
        return folio_test_type(folio, PG_##lname);                      \
}                                                                       \

#define folio_test_type(folio, flag)                                    \
        ((folio->page.page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)

which has no assertion that the folio is not a tail page.  Maybe it
should, but until then we'll silently get the wrong result ;-)



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 4/8] mm/memory-failure: Convert shake_page() to shake_folio()
  2024-04-08 15:36     ` Matthew Wilcox
@ 2024-04-08 18:31       ` Jane Chu
  2024-04-10  4:01         ` Miaohe Lin
  0 siblings, 1 reply; 42+ messages in thread
From: Jane Chu @ 2024-04-08 18:31 UTC (permalink / raw)
  To: Matthew Wilcox, Miaohe Lin; +Cc: linux-mm, Naoya Horiguchi, Andrew Morton

On 4/8/2024 8:36 AM, Matthew Wilcox wrote:

> On Wed, Mar 06, 2024 at 05:31:19PM +0800, Miaohe Lin wrote:
>>> -void shake_page(struct page *p)
>>> +void shake_folio(struct folio *folio)
>> It might not be a good idea to convert shake_page() into shake_folio(). shake_page() can
>> be called without holding page refcnt. So the below race could happen:
>>
>>   hwpoison_inject
>>    folio = page_folio(p); --> assume p is 4k page [1]
>>    shake_folio(folio)
>>     folio_test_slab(folio)
>>      test_bit(PG_slab, folio_flags(folio, FOLIO_PF_NO_TAIL) [2]
>>       VM_BUG_ON_PGFLAGS(PageTail(page), page) in folio_flags
>>
>> Between [1] and [2], page can become PageTail of a THP. So the VM_BUG_ON_PGFLAGS will trigger.
>> Or am I miss something?
> No, you're not missing anything.  This race can happen.  However, I've
> removed the VM_BUG_ON for folio_test_slab() with "mm: free up PG_slab".
> Now it goes through:
>
> static inline bool PageSlab(const struct page *page)
> {
>          return folio_test_slab(page_folio(page));
> }
>
> static __always_inline bool folio_test_##fname(const struct folio *folio)\
> {                                                                       \
>          return folio_test_type(folio, PG_##lname);                      \
> }                                                                       \
>
> #define folio_test_type(folio, flag)                                    \
>          ((folio->page.page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
>
> which has no assertion that the folio is not a tail page.  Maybe it
> should, but until then we'll silently get the wrong result ;-)
>
In this case (4k page -> THP), the drain will still be triggered, perhaps 
with nothing to do, so it looks like a harmless 'wrong result' to me.

thanks,

-jane



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 4/8] mm/memory-failure: Convert shake_page() to shake_folio()
  2024-04-08 18:31       ` Jane Chu
@ 2024-04-10  4:01         ` Miaohe Lin
  0 siblings, 0 replies; 42+ messages in thread
From: Miaohe Lin @ 2024-04-10  4:01 UTC (permalink / raw)
  To: Jane Chu, Matthew Wilcox; +Cc: linux-mm, Naoya Horiguchi, Andrew Morton

On 2024/4/9 2:31, Jane Chu wrote:
> On 4/8/2024 8:36 AM, Matthew Wilcox wrote:
> 
>> On Wed, Mar 06, 2024 at 05:31:19PM +0800, Miaohe Lin wrote:
>>>> -void shake_page(struct page *p)
>>>> +void shake_folio(struct folio *folio)
>>> It might not be a good idea to convert shake_page() into shake_folio(). shake_page() can
>>> be called without holding page refcnt. So the below race could happen:
>>>
>>>   hwpoison_inject
>>>    folio = page_folio(p); --> assume p is 4k page [1]
>>>    shake_folio(folio)
>>>     folio_test_slab(folio)
>>>      test_bit(PG_slab, folio_flags(folio, FOLIO_PF_NO_TAIL) [2]
>>>       VM_BUG_ON_PGFLAGS(PageTail(page), page) in folio_flags
>>>
>>> Between [1] and [2], page can become PageTail of a THP. So the VM_BUG_ON_PGFLAGS will trigger.
>>> Or am I miss something?
>> No, you're not missing anything.  This race can happen.  However, I've
>> removed the VM_BUG_ON for folio_test_slab() with "mm: free up PG_slab".
>> Now it goes through:
>>
>> static inline bool PageSlab(const struct page *page)
>> {
>>          return folio_test_slab(page_folio(page));
>> }
>>
>> static __always_inline bool folio_test_##fname(const struct folio *folio)\
>> {                                                                       \
>>          return folio_test_type(folio, PG_##lname);                      \
>> }                                                                       \
>>
>> #define folio_test_type(folio, flag)                                    \
>>          ((folio->page.page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
>>
>> which has no assertion that the folio is not a tail page.  Maybe it
>> should, but until then we'll silently get the wrong result ;-)
>>
> In this case (4k page -> THP), the act of drain will be triggered with perhaps nothing to do, so looks like a harmless 'wrong result' to me.
> 
> thanks,

I tend to agree that with "free up PG_slab", this race should be harmless.
Thanks to you both.
.

> 
> -jane
> 
> .



^ permalink raw reply	[flat|nested] 42+ messages in thread

end of thread, other threads:[~2024-04-10  4:01 UTC | newest]

Thread overview: 42+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-02-29 21:20 [PATCH 0/8] Some cleanups for memory-failure Matthew Wilcox (Oracle)
2024-02-29 21:20 ` [PATCH 1/8] mm/memory-failure: Remove fsdax_pgoff argument from __add_to_kill Matthew Wilcox (Oracle)
2024-03-04 12:09   ` Miaohe Lin
2024-03-13  2:07   ` Jane Chu
2024-03-13  3:23     ` Matthew Wilcox
2024-03-13 18:11       ` Jane Chu
2024-03-14  3:51         ` Matthew Wilcox
2024-03-14 17:54           ` Jane Chu
2024-03-19  0:36   ` Dan Williams
2024-02-29 21:20 ` [PATCH 2/8] mm/memory-failure: Pass addr to __add_to_kill() Matthew Wilcox (Oracle)
2024-03-04 12:10   ` Miaohe Lin
2024-02-29 21:20 ` [PATCH 3/8] mm: Return the address from page_mapped_in_vma() Matthew Wilcox (Oracle)
2024-03-04 12:31   ` Miaohe Lin
2024-03-05 20:09     ` Matthew Wilcox
2024-03-06  8:10       ` Miaohe Lin
2024-03-06  8:17   ` Miaohe Lin
2024-02-29 21:20 ` [PATCH 4/8] mm/memory-failure: Convert shake_page() to shake_folio() Matthew Wilcox (Oracle)
2024-03-06  9:31   ` Miaohe Lin
2024-04-08 15:36     ` Matthew Wilcox
2024-04-08 18:31       ` Jane Chu
2024-04-10  4:01         ` Miaohe Lin
2024-02-29 21:20 ` [PATCH 5/8] mm: Convert hugetlb_page_mapping_lock_write to folio Matthew Wilcox (Oracle)
2024-03-08  8:33   ` Miaohe Lin
2024-02-29 21:20 ` [PATCH 6/8] mm/memory-failure: Convert memory_failure() to use a folio Matthew Wilcox (Oracle)
2024-03-08  8:48   ` Miaohe Lin
2024-03-11 12:31     ` Matthew Wilcox
2024-03-12  7:07       ` Miaohe Lin
2024-03-12 14:14         ` Matthew Wilcox
2024-03-13  1:23           ` Jane Chu
2024-03-14  2:34             ` Miaohe Lin
2024-03-14 18:15               ` Jane Chu
2024-03-15  6:25                 ` Miaohe Lin
2024-03-15  8:32             ` Miaohe Lin
2024-03-15 19:22               ` Jane Chu
2024-03-18  2:28                 ` Miaohe Lin
2024-02-29 21:20 ` [PATCH 7/8] mm/memory-failure: Convert hwpoison_user_mappings to take " Matthew Wilcox (Oracle)
2024-03-11 11:44   ` Miaohe Lin
2024-02-29 21:20 ` [PATCH 8/8] mm/memory-failure: Add some folio conversions to unpoison_memory Matthew Wilcox (Oracle)
2024-03-11 11:29   ` Miaohe Lin
2024-03-01  6:28 ` [PATCH 0/8] Some cleanups for memory-failure Miaohe Lin
2024-03-01 12:40   ` Muhammad Usama Anjum
2024-03-04  1:55     ` Miaohe Lin
