linux-edac.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC] Make the memory failure blast radius more precise
@ 2020-06-23 20:17 Matthew Wilcox
  2020-06-23 21:48 ` Dan Williams
  2020-06-23 22:04 ` Luck, Tony
  0 siblings, 2 replies; 18+ messages in thread
From: Matthew Wilcox @ 2020-06-23 20:17 UTC (permalink / raw)
  To: Tony Luck, Borislav Petkov, Naoya Horiguchi, linux-edac,
	linux-mm, linux-nvdimm, Darrick J. Wong, Jane Chu


Hardware actually tells us the blast radius of the error, but we ignore
it and take out the entire page.  We've had a customer request to know
exactly how much of the page is damaged so they can avoid reconstructing
an entire 2MB page if only a single cacheline is damaged.

This is only a strawman that I did in an hour or two; I'd appreciate
architectural-level feedback.  Should I just convert memory_failure() to
always take an address & granularity?  Should I create a struct to pass
around (page, phys, granularity) instead of reconstructing the missing
pieces in half a dozen functions?  Is this functionality welcome at all,
or is the risk of upsetting applications which expect at least a page
of granularity too high?

I can see places where I've specified a plain PAGE_SHIFT insted of
interrogating a compound page for its size.  I'd probably split this
patch up into two or three pieces for applying.

I've also blindly taken out the call to unmap_mapping_range().  Again,
the customer requested that we not do this.  That deserves to be in its
own patch and properly justified.

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index ce9120c4f740..09978c5b9493 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -463,7 +463,7 @@ static void mce_irq_work_cb(struct irq_work *entry)
  * Check if the address reported by the CPU is in a format we can parse.
  * It would be possible to add code for most other cases, but all would
  * be somewhat complicated (e.g. segment offset would require an instruction
- * parser). So only support physical addresses up to page granuality for now.
+ * parser). So only support physical addresses up to page granularity for now.
  */
 int mce_usable_address(struct mce *m)
 {
@@ -570,7 +570,6 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
 			      void *data)
 {
 	struct mce *mce = (struct mce *)data;
-	unsigned long pfn;
 
 	if (!mce || !mce_usable_address(mce))
 		return NOTIFY_DONE;
@@ -579,9 +578,8 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
 	    mce->severity != MCE_DEFERRED_SEVERITY)
 		return NOTIFY_DONE;
 
-	pfn = mce->addr >> PAGE_SHIFT;
-	if (!memory_failure(pfn, 0)) {
-		set_mce_nospec(pfn, whole_page(mce));
+	if (!__memory_failure(mce->addr, MCI_MISC_ADDR_LSB(mce->misc), 0)) {
+		set_mce_nospec(mce->addr >> PAGE_SHIFT, whole_page(mce));
 		mce->kflags |= MCE_HANDLED_UC;
 	}
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index dc7b87310c10..f2ea6491df28 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3035,7 +3035,14 @@ enum mf_flags {
 	MF_MUST_KILL = 1 << 2,
 	MF_SOFT_OFFLINE = 1 << 3,
 };
-extern int memory_failure(unsigned long pfn, int flags);
+
+int __memory_failure(unsigned long addr, unsigned int granularity, int flags);
+
+static inline int memory_failure(unsigned long pfn, int flags)
+{
+	return __memory_failure(pfn << PAGE_SHIFT, PAGE_SHIFT, flags);
+}
+
 extern void memory_failure_queue(unsigned long pfn, int flags);
 extern void memory_failure_queue_kick(int cpu);
 extern int unpoison_memory(unsigned long pfn);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 47b8ccb1fb9b..775b053dcd98 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -198,7 +198,7 @@ struct to_kill {
 	struct list_head nd;
 	struct task_struct *tsk;
 	unsigned long addr;
-	short size_shift;
+	short granularity;
 };
 
 /*
@@ -209,7 +209,7 @@ struct to_kill {
 static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
 {
 	struct task_struct *t = tk->tsk;
-	short addr_lsb = tk->size_shift;
+	short addr_lsb = tk->granularity;
 	int ret = 0;
 
 	pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
@@ -262,40 +262,6 @@ void shake_page(struct page *p, int access)
 }
 EXPORT_SYMBOL_GPL(shake_page);
 
-static unsigned long dev_pagemap_mapping_shift(struct page *page,
-		struct vm_area_struct *vma)
-{
-	unsigned long address = vma_address(page, vma);
-	pgd_t *pgd;
-	p4d_t *p4d;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-
-	pgd = pgd_offset(vma->vm_mm, address);
-	if (!pgd_present(*pgd))
-		return 0;
-	p4d = p4d_offset(pgd, address);
-	if (!p4d_present(*p4d))
-		return 0;
-	pud = pud_offset(p4d, address);
-	if (!pud_present(*pud))
-		return 0;
-	if (pud_devmap(*pud))
-		return PUD_SHIFT;
-	pmd = pmd_offset(pud, address);
-	if (!pmd_present(*pmd))
-		return 0;
-	if (pmd_devmap(*pmd))
-		return PMD_SHIFT;
-	pte = pte_offset_map(pmd, address);
-	if (!pte_present(*pte))
-		return 0;
-	if (pte_devmap(*pte))
-		return PAGE_SHIFT;
-	return 0;
-}
-
 /*
  * Failure handling: if we can't find or can't kill a process there's
  * not much we can do.	We just print a message and ignore otherwise.
@@ -306,8 +272,8 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page,
  * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
  */
 static void add_to_kill(struct task_struct *tsk, struct page *p,
-		       struct vm_area_struct *vma,
-		       struct list_head *to_kill)
+			unsigned int offset, unsigned int granularity,
+			struct vm_area_struct *vma, struct list_head *to_kill)
 {
 	struct to_kill *tk;
 
@@ -318,27 +284,14 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
 	}
 
 	tk->addr = page_address_in_vma(p, vma);
-	if (is_zone_device_page(p))
-		tk->size_shift = dev_pagemap_mapping_shift(p, vma);
-	else
-		tk->size_shift = page_shift(compound_head(p));
+	tk->granularity = granularity;
 
-	/*
-	 * Send SIGKILL if "tk->addr == -EFAULT". Also, as
-	 * "tk->size_shift" is always non-zero for !is_zone_device_page(),
-	 * so "tk->size_shift == 0" effectively checks no mapping on
-	 * ZONE_DEVICE. Indeed, when a devdax page is mmapped N times
-	 * to a process' address space, it's possible not all N VMAs
-	 * contain mappings for the page, but at least one VMA does.
-	 * Only deliver SIGBUS with payload derived from the VMA that
-	 * has a mapping for the page.
-	 */
+	/* Send SIGKILL if "tk->addr == -EFAULT". */
 	if (tk->addr == -EFAULT) {
 		pr_info("Memory failure: Unable to find user space address %lx in %s\n",
 			page_to_pfn(p), tsk->comm);
-	} else if (tk->size_shift == 0) {
-		kfree(tk);
-		return;
+	} else {
+		tk->addr += offset;
 	}
 
 	get_task_struct(tsk);
@@ -468,7 +421,8 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 			if (!page_mapped_in_vma(page, vma))
 				continue;
 			if (vma->vm_mm == t->mm)
-				add_to_kill(t, page, vma, to_kill);
+				add_to_kill(t, page, 0, PAGE_SHIFT, vma,
+						to_kill);
 		}
 	}
 	read_unlock(&tasklist_lock);
@@ -478,8 +432,8 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 /*
  * Collect processes when the error hit a file mapped page.
  */
-static void collect_procs_file(struct page *page, struct list_head *to_kill,
-				int force_early)
+static void collect_procs_file(struct page *page, unsigned int offset,
+		short granularity, struct list_head *to_kill, int force_early)
 {
 	struct vm_area_struct *vma;
 	struct task_struct *tsk;
@@ -503,7 +457,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
 			 * to be informed of all such data corruptions.
 			 */
 			if (vma->vm_mm == t->mm)
-				add_to_kill(t, page, vma, to_kill);
+				add_to_kill(t, page, 0, PAGE_SHIFT, vma,
+						to_kill);
 		}
 	}
 	read_unlock(&tasklist_lock);
@@ -513,8 +468,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
 /*
  * Collect the processes who have the corrupted page mapped to kill.
  */
-static void collect_procs(struct page *page, struct list_head *tokill,
-				int force_early)
+static void collect_procs(struct page *page, unsigned int offset,
+		short granularity, struct list_head *tokill, int force_early)
 {
 	if (!page->mapping)
 		return;
@@ -522,7 +477,8 @@ static void collect_procs(struct page *page, struct list_head *tokill,
 	if (PageAnon(page))
 		collect_procs_anon(page, tokill, force_early);
 	else
-		collect_procs_file(page, tokill, force_early);
+		collect_procs_file(page, offset, granularity, tokill,
+				force_early);
 }
 
 static const char *action_name[] = {
@@ -1026,7 +982,8 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	 * there's nothing that can be done.
 	 */
 	if (kill)
-		collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
+		collect_procs(hpage, 0, PAGE_SHIFT, &tokill,
+				flags & MF_ACTION_REQUIRED);
 
 	if (!PageHuge(hpage)) {
 		unmap_success = try_to_unmap(hpage, ttu);
@@ -1176,16 +1133,14 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
 	return res;
 }
 
-static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
-		struct dev_pagemap *pgmap)
+static int memory_failure_dev_pagemap(unsigned long addr, short granularity,
+		int flags, struct dev_pagemap *pgmap)
 {
+	unsigned long pfn = addr >> PAGE_SHIFT;
 	struct page *page = pfn_to_page(pfn);
 	const bool unmap_success = true;
-	unsigned long size = 0;
-	struct to_kill *tk;
 	LIST_HEAD(tokill);
 	int rc = -EBUSY;
-	loff_t start;
 	dax_entry_t cookie;
 
 	/*
@@ -1225,21 +1180,8 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
 	 * SIGBUS (i.e. MF_MUST_KILL)
 	 */
 	flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
-	collect_procs(page, &tokill, flags & MF_ACTION_REQUIRED);
-
-	list_for_each_entry(tk, &tokill, nd)
-		if (tk->size_shift)
-			size = max(size, 1UL << tk->size_shift);
-	if (size) {
-		/*
-		 * Unmap the largest mapping to avoid breaking up
-		 * device-dax mappings which are constant size. The
-		 * actual size of the mapping being torn down is
-		 * communicated in siginfo, see kill_proc()
-		 */
-		start = (page->index << PAGE_SHIFT) & ~(size - 1);
-		unmap_mapping_range(page->mapping, start, start + size, 0);
-	}
+	collect_procs(page, offset_in_page(addr), granularity, &tokill,
+			flags & MF_ACTION_REQUIRED);
 	kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
 	rc = 0;
 unlock:
@@ -1252,8 +1194,9 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
 }
 
 /**
- * memory_failure - Handle memory failure of a page.
- * @pfn: Page Number of the corrupted page
+ * __memory_failure - Handle memory failure.
+ * @addr: Physical address of the error.
+ * @granularity: Number of bits in the address which are bad.
  * @flags: fine tune action taken
  *
  * This function is called by the low level machine check code
@@ -1268,8 +1211,9 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
  * Must run in process context (e.g. a work queue) with interrupts
  * enabled and no spinlocks hold.
  */
-int memory_failure(unsigned long pfn, int flags)
+int __memory_failure(unsigned long addr, unsigned int granularity, int flags)
 {
+	unsigned long pfn = addr >> PAGE_SHIFT;
 	struct page *p;
 	struct page *hpage;
 	struct page *orig_head;
@@ -1285,8 +1229,8 @@ int memory_failure(unsigned long pfn, int flags)
 		if (pfn_valid(pfn)) {
 			pgmap = get_dev_pagemap(pfn, NULL);
 			if (pgmap)
-				return memory_failure_dev_pagemap(pfn, flags,
-								  pgmap);
+				return memory_failure_dev_pagemap(addr,
+						granularity, flags, pgmap);
 		}
 		pr_err("Memory failure: %#lx: memory outside kernel control\n",
 			pfn);
@@ -1442,7 +1386,7 @@ int memory_failure(unsigned long pfn, int flags)
 	unlock_page(p);
 	return res;
 }
-EXPORT_SYMBOL_GPL(memory_failure);
+EXPORT_SYMBOL_GPL(__memory_failure);
 
 #define MEMORY_FAILURE_FIFO_ORDER	4
 #define MEMORY_FAILURE_FIFO_SIZE	(1 << MEMORY_FAILURE_FIFO_ORDER)

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [RFC] Make the memory failure blast radius more precise
  2020-06-23 20:17 [RFC] Make the memory failure blast radius more precise Matthew Wilcox
@ 2020-06-23 21:48 ` Dan Williams
  2020-06-23 22:04 ` Luck, Tony
  1 sibling, 0 replies; 18+ messages in thread
From: Dan Williams @ 2020-06-23 21:48 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Tony Luck, Borislav Petkov, Naoya Horiguchi, linux-edac,
	Linux MM, linux-nvdimm, Darrick J. Wong, Jane Chu, david

On Tue, Jun 23, 2020 at 1:18 PM Matthew Wilcox <willy@infradead.org> wrote:
>
>
> Hardware actually tells us the blast radius of the error, but we ignore
> it and take out the entire page.  We've had a customer request to know
> exactly how much of the page is damaged so they can avoid reconstructing
> an entire 2MB page if only a single cacheline is damaged.
>
> This is only a strawman that I did in an hour or two; I'd appreciate
> architectural-level feedback.  Should I just convert memory_failure() to
> always take an address & granularity?  Should I create a struct to pass
> around (page, phys, granularity) instead of reconstructing the missing
> pieces in half a dozen functions?  Is this functionality welcome at all,
> or is the risk of upsetting applications which expect at least a page
> of granularity too high?
>
> I can see places where I've specified a plain PAGE_SHIFT insted of
> interrogating a compound page for its size.  I'd probably split this
> patch up into two or three pieces for applying.
>
> I've also blindly taken out the call to unmap_mapping_range().  Again,
> the customer requested that we not do this.  That deserves to be in its
> own patch and properly justified.

I had been thinking that we could not do much with the legacy
memory-failure reporting model and that applications that want a new
model would need to opt-into it. This topic also dovetails with what
Dave and I had been discussing in terms coordinating memory error
handling with the filesystem which may have more information about
multiple mappings of a DAX page (reflink) [1].

[1]: http://lore.kernel.org/r/20200311063942.GE10776@dread.disaster.area

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] Make the memory failure blast radius more precise
  2020-06-23 20:17 [RFC] Make the memory failure blast radius more precise Matthew Wilcox
  2020-06-23 21:48 ` Dan Williams
@ 2020-06-23 22:04 ` Luck, Tony
  2020-06-23 22:17   ` Matthew Wilcox
  2020-06-24  4:32   ` David Rientjes
  1 sibling, 2 replies; 18+ messages in thread
From: Luck, Tony @ 2020-06-23 22:04 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Borislav Petkov, Naoya Horiguchi, linux-edac, linux-mm,
	linux-nvdimm, Darrick J. Wong, Jane Chu

On Tue, Jun 23, 2020 at 09:17:45PM +0100, Matthew Wilcox wrote:
> 
> Hardware actually tells us the blast radius of the error, but we ignore
> it and take out the entire page.  We've had a customer request to know
> exactly how much of the page is damaged so they can avoid reconstructing
> an entire 2MB page if only a single cacheline is damaged.
> 
> This is only a strawman that I did in an hour or two; I'd appreciate
> architectural-level feedback.  Should I just convert memory_failure() to
> always take an address & granularity?  Should I create a struct to pass
> around (page, phys, granularity) instead of reconstructing the missing
> pieces in half a dozen functions?  Is this functionality welcome at all,
> or is the risk of upsetting applications which expect at least a page
> of granularity too high?

What is the interface to these applications that want finer granularity?

Current code does very poorly with hugetlbfs pages ... user loses the
whole 2 MB or 1GB. That's just silly (though I've been told that it is
hard to fix because allowing a hugetlbfs page to be broken up at an arbitrary
time as the result of a mahcine check means that the kernel needs locking
around a bunch of fas paths that currently assume that a huge page will
stay being a huge page).

For sub-4K page usage, there are different problems. We can't leave the
original page with the poisoned cache line mapped to the user as they may
just access the poison data and trigger another machine check. But if we
map in some different page with all the good bits copied, the user needs
to be aware which parts of the page no longer have their data.

-Tony

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] Make the memory failure blast radius more precise
  2020-06-23 22:04 ` Luck, Tony
@ 2020-06-23 22:17   ` Matthew Wilcox
  2020-06-23 22:26     ` Luck, Tony
  2020-06-24  4:32   ` David Rientjes
  1 sibling, 1 reply; 18+ messages in thread
From: Matthew Wilcox @ 2020-06-23 22:17 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Borislav Petkov, Naoya Horiguchi, linux-edac, linux-mm,
	linux-nvdimm, Darrick J. Wong, Jane Chu

On Tue, Jun 23, 2020 at 03:04:12PM -0700, Luck, Tony wrote:
> On Tue, Jun 23, 2020 at 09:17:45PM +0100, Matthew Wilcox wrote:
> > Hardware actually tells us the blast radius of the error, but we ignore
> > it and take out the entire page.  We've had a customer request to know
> > exactly how much of the page is damaged so they can avoid reconstructing
> > an entire 2MB page if only a single cacheline is damaged.
> > 
> > This is only a strawman that I did in an hour or two; I'd appreciate
> > architectural-level feedback.  Should I just convert memory_failure() to
> > always take an address & granularity?  Should I create a struct to pass
> > around (page, phys, granularity) instead of reconstructing the missing
> > pieces in half a dozen functions?  Is this functionality welcome at all,
> > or is the risk of upsetting applications which expect at least a page
> > of granularity too high?
> 
> What is the interface to these applications that want finer granularity?
> 
> Current code does very poorly with hugetlbfs pages ... user loses the
> whole 2 MB or 1GB. That's just silly (though I've been told that it is
> hard to fix because allowing a hugetlbfs page to be broken up at an arbitrary
> time as the result of a mahcine check means that the kernel needs locking
> around a bunch of fas paths that currently assume that a huge page will
> stay being a huge page).
> 
> For sub-4K page usage, there are different problems. We can't leave the
> original page with the poisoned cache line mapped to the user as they may
> just access the poison data and trigger another machine check. But if we
> map in some different page with all the good bits copied, the user needs
> to be aware which parts of the page no longer have their data.

This is specifically for DAX.  The interface we were hoping to use to
fix the error is to leave the mapping in place and call

	fallocate(fd, FALLOC_FL_ZERO_RANGE, offset, len);

The application will then take care of writing actual data that isn't
zeroes to the file.  Or leave it as zeroes if that's what the application
wants to be its error indicator.

The fallocate path doesn't work quite right today, but there's no point
in trying to fix that up if we can't sort out the delivery of the actual
error range.

It might also be nice to have an madvise() MADV_ZERO option so the
application doesn't have to look up the fd associated with that memory
range, but we haven't floated that idea with the customer yet; I just
thought of it now.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] Make the memory failure blast radius more precise
  2020-06-23 22:17   ` Matthew Wilcox
@ 2020-06-23 22:26     ` Luck, Tony
  2020-06-23 22:40       ` Matthew Wilcox
  0 siblings, 1 reply; 18+ messages in thread
From: Luck, Tony @ 2020-06-23 22:26 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Borislav Petkov, Naoya Horiguchi, linux-edac, linux-mm,
	linux-nvdimm, Darrick J. Wong, Jane Chu

On Tue, Jun 23, 2020 at 11:17:41PM +0100, Matthew Wilcox wrote:
> It might also be nice to have an madvise() MADV_ZERO option so the
> application doesn't have to look up the fd associated with that memory
> range, but we haven't floated that idea with the customer yet; I just
> thought of it now.

So the conversation between OS and kernel goes like this?

1) machine check
2) Kernel unmaps the 4K page surroundinng the poison and sends
   SIGBUS to the application to say that one cache line is gone
3) App says madvise(MADV_ZERO, that cache line)
4) Kernel says ... "oh, you know how to deal with this" and allocates
   a new page, copying the 63 good cache lines from the old page and
   zeroing the missing one. New page is mapped to user.

Do you have folks lined up to use that?  I don't know that many
folks are even catching the SIGBUS :-(

-Tony

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] Make the memory failure blast radius more precise
  2020-06-23 22:26     ` Luck, Tony
@ 2020-06-23 22:40       ` Matthew Wilcox
  2020-06-24  0:01         ` Darrick J. Wong
  2020-06-24 21:22         ` Jane Chu
  0 siblings, 2 replies; 18+ messages in thread
From: Matthew Wilcox @ 2020-06-23 22:40 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Borislav Petkov, Naoya Horiguchi, linux-edac, linux-mm,
	linux-nvdimm, Darrick J. Wong, Jane Chu

On Tue, Jun 23, 2020 at 03:26:58PM -0700, Luck, Tony wrote:
> On Tue, Jun 23, 2020 at 11:17:41PM +0100, Matthew Wilcox wrote:
> > It might also be nice to have an madvise() MADV_ZERO option so the
> > application doesn't have to look up the fd associated with that memory
> > range, but we haven't floated that idea with the customer yet; I just
> > thought of it now.
> 
> So the conversation between OS and kernel goes like this?
> 
> 1) machine check
> 2) Kernel unmaps the 4K page surroundinng the poison and sends
>    SIGBUS to the application to say that one cache line is gone
> 3) App says madvise(MADV_ZERO, that cache line)
> 4) Kernel says ... "oh, you know how to deal with this" and allocates
>    a new page, copying the 63 good cache lines from the old page and
>    zeroing the missing one. New page is mapped to user.

That could be one way of implementing it.  My understanding is that
pmem devices will reallocate bad cachelines on writes, so a better
implementation would be:

1) Kernel receives machine check
2) Kernel sends SIGBUS to the application
3) App send madvise(MADV_ZERO, addr, 1 << granularity)
4) Kernel does special writes to ensure the cacheline is zeroed
5) App does whatever it needs to recover (reconstructs the data or marks
it as gone)

> Do you have folks lined up to use that?  I don't know that many
> folks are even catching the SIGBUS :-(

Had a 75 minute meeting with some people who want to use pmem this
afternoon ...

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] Make the memory failure blast radius more precise
  2020-06-23 22:40       ` Matthew Wilcox
@ 2020-06-24  0:01         ` Darrick J. Wong
  2020-06-24 12:10           ` Matthew Wilcox
  2020-06-24 21:22         ` Jane Chu
  1 sibling, 1 reply; 18+ messages in thread
From: Darrick J. Wong @ 2020-06-24  0:01 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Luck, Tony, Borislav Petkov, Naoya Horiguchi, linux-edac,
	linux-mm, linux-nvdimm, Jane Chu

On Tue, Jun 23, 2020 at 11:40:27PM +0100, Matthew Wilcox wrote:
> On Tue, Jun 23, 2020 at 03:26:58PM -0700, Luck, Tony wrote:
> > On Tue, Jun 23, 2020 at 11:17:41PM +0100, Matthew Wilcox wrote:
> > > It might also be nice to have an madvise() MADV_ZERO option so the
> > > application doesn't have to look up the fd associated with that memory
> > > range, but we haven't floated that idea with the customer yet; I just
> > > thought of it now.
> > 
> > So the conversation between OS and kernel goes like this?
> > 
> > 1) machine check
> > 2) Kernel unmaps the 4K page surroundinng the poison and sends
> >    SIGBUS to the application to say that one cache line is gone
> > 3) App says madvise(MADV_ZERO, that cache line)
> > 4) Kernel says ... "oh, you know how to deal with this" and allocates
> >    a new page, copying the 63 good cache lines from the old page and
> >    zeroing the missing one. New page is mapped to user.
> 
> That could be one way of implementing it.  My understanding is that
> pmem devices will reallocate bad cachelines on writes, so a better
> implementation would be:
> 
> 1) Kernel receives machine check
> 2) Kernel sends SIGBUS to the application
> 3) App send madvise(MADV_ZERO, addr, 1 << granularity)
> 4) Kernel does special writes to ensure the cacheline is zeroed
> 5) App does whatever it needs to recover (reconstructs the data or marks
> it as gone)

Frankly, I've wondered why the filesystem shouldn't just be in charge of
all this--

1. kernel receives machine check
2. kernel tattles to xfs
3. xfs looks up which file(s) own the pmem range
4. xfs zeroes the region, clears the poison, and sets AS_EIO on the
   files
5. xfs sends SIGBUS to any programs that had those files mapped to tell
   them "Your data is gone, we've stabilized the storage you had
   mapped."
6. app does whatever it needs to recover

Apps shouldn't have to do this punch-and-reallocate dance, seeing as
they don't currently do that for SCSI disks and the like.

--D

> > Do you have folks lined up to use that?  I don't know that many
> > folks are even catching the SIGBUS :-(
> 
> Had a 75 minute meeting with some people who want to use pmem this
> afternoon ...

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] Make the memory failure blast radius more precise
  2020-06-23 22:04 ` Luck, Tony
  2020-06-23 22:17   ` Matthew Wilcox
@ 2020-06-24  4:32   ` David Rientjes
  2020-06-24 20:57     ` Jane Chu
  2020-06-25  2:16     ` HORIGUCHI NAOYA(堀口 直也)
  1 sibling, 2 replies; 18+ messages in thread
From: David Rientjes @ 2020-06-24  4:32 UTC (permalink / raw)
  To: Luck, Tony, Mike Kravetz, Dr. David Alan Gilbert, Peter Xu,
	Andrea Arcangeli
  Cc: Matthew Wilcox, Borislav Petkov, Naoya Horiguchi, linux-edac,
	linux-mm, linux-nvdimm, Darrick J. Wong, Jane Chu

On Tue, 23 Jun 2020, Luck, Tony wrote:

> > Hardware actually tells us the blast radius of the error, but we ignore
> > it and take out the entire page.  We've had a customer request to know
> > exactly how much of the page is damaged so they can avoid reconstructing
> > an entire 2MB page if only a single cacheline is damaged.
> > 
> > This is only a strawman that I did in an hour or two; I'd appreciate
> > architectural-level feedback.  Should I just convert memory_failure() to
> > always take an address & granularity?  Should I create a struct to pass
> > around (page, phys, granularity) instead of reconstructing the missing
> > pieces in half a dozen functions?  Is this functionality welcome at all,
> > or is the risk of upsetting applications which expect at least a page
> > of granularity too high?
> 
> What is the interface to these applications that want finer granularity?
> 
> Current code does very poorly with hugetlbfs pages ... user loses the
> whole 2 MB or 1GB. That's just silly (though I've been told that it is
> hard to fix because allowing a hugetlbfs page to be broken up at an arbitrary
> time as the result of a mahcine check means that the kernel needs locking
> around a bunch of fas paths that currently assume that a huge page will
> stay being a huge page).
> 

Thanks for bringing this up, Tony.  Mike Kravetz pointed me to this thread 
(thanks Mike!) so let's add him in explicitly as well as Andrea, Peter, 
and David from Red Hat who we've been discussing an idea with that may 
introduce exactly this needed support but for different purposes :)  The 
timing of this thread is _uncanny_.

To improve the performance of userfaultfd for the purposes of post-copy 
live migration we need to reduce the granularity in which pages are 
migrated; we're looking at this from a 1GB gigantic page perspective but 
the same arguments can likely be had for 2MB hugepages as well.  1GB pages 
are too much of a bottleneck and, as you bring up, 1GB is simply too much 
memory to poison :)  We don't have 1GB thp support so the big idea was to 
introduce thp-like DoubleMap support into hugetlbfs for the purposes of 
post-copy live migration and then I had the idea that this could be 
extended to memory failure as well.

(We don't see the lack of 1GB thp here as a deficiency for anything other 
than these two issues, hugetlb provides strong guarantees.)

I don't want to hijack Matthew's thread which is primarily about DAX, but 
did get intrigued by your concerns about hugetlbfs page poisoning.  We can 
fork the thread off here to discuss only the hugetlb application of this 
if it makes sense to you or you'd like to collaborate on it as well.

The DoubleMap support would allow us to map the 1GB gigantic pages with 
the PUD and the PMDs as well (and, further, the 2MB hugepages with the PMD 
and PTEs) so that we can copy fragments into PMDs or PTEs and we don't 
need to migrate the entire gigantic page.  Any access triggers #PF through 
hugetlb_no_page() -> handle_userfault() which would trigger another 
UFFDIO_COPY and map another fragment.

Assume a world where this DoubleMap support already exists for hugetlb 
pages today and all the invariants including page migration are fixed up 
(since a PTE can now map a hugetlb page and a PMD can now map a gigantic 
hugetlb page).  It *seems* like we'd be able to reduce the blast radius 
here too on a hard memory failure: dissolve the gigantic page in place, 
SIGBUS/SIGKILL on the bad PMD or PTE, and avoid poisoning the head of the 
hugetlb page.  We agree that poisoning this large amount of memory is not 
ideal :)

Anyway, this was some brainstorming that I was doing with Mike and the 
others based on the idea of using DoubleMap support for post-copy live 
migration.  If you would be interested or would like to collaborate on 
it, we'd love to talk.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] Make the memory failure blast radius more precise
  2020-06-24  0:01         ` Darrick J. Wong
@ 2020-06-24 12:10           ` Matthew Wilcox
  2020-06-24 23:21             ` Dan Williams
  0 siblings, 1 reply; 18+ messages in thread
From: Matthew Wilcox @ 2020-06-24 12:10 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Luck, Tony, Borislav Petkov, Naoya Horiguchi, linux-edac,
	linux-mm, linux-nvdimm, Jane Chu

On Tue, Jun 23, 2020 at 05:01:24PM -0700, Darrick J. Wong wrote:
> On Tue, Jun 23, 2020 at 11:40:27PM +0100, Matthew Wilcox wrote:
> > On Tue, Jun 23, 2020 at 03:26:58PM -0700, Luck, Tony wrote:
> > > On Tue, Jun 23, 2020 at 11:17:41PM +0100, Matthew Wilcox wrote:
> > > > It might also be nice to have an madvise() MADV_ZERO option so the
> > > > application doesn't have to look up the fd associated with that memory
> > > > range, but we haven't floated that idea with the customer yet; I just
> > > > thought of it now.
> > > 
> > > So the conversation between OS and kernel goes like this?
> > > 
> > > 1) machine check
> > > 2) Kernel unmaps the 4K page surroundinng the poison and sends
> > >    SIGBUS to the application to say that one cache line is gone
> > > 3) App says madvise(MADV_ZERO, that cache line)
> > > 4) Kernel says ... "oh, you know how to deal with this" and allocates
> > >    a new page, copying the 63 good cache lines from the old page and
> > >    zeroing the missing one. New page is mapped to user.
> > 
> > That could be one way of implementing it.  My understanding is that
> > pmem devices will reallocate bad cachelines on writes, so a better
> > implementation would be:
> > 
> > 1) Kernel receives machine check
> > 2) Kernel sends SIGBUS to the application
> > 3) App send madvise(MADV_ZERO, addr, 1 << granularity)
> > 4) Kernel does special writes to ensure the cacheline is zeroed
> > 5) App does whatever it needs to recover (reconstructs the data or marks
> > it as gone)
> 
> Frankly, I've wondered why the filesystem shouldn't just be in charge of
> all this--
> 
> 1. kernel receives machine check
> 2. kernel tattles to xfs
> 3. xfs looks up which file(s) own the pmem range
> 4. xfs zeroes the region, clears the poison, and sets AS_EIO on the
>    files

... machine reboots, app restarts, gets no notification anything is wrong,
treats zeroed region as good data, launches nuclear missiles.

> 5. xfs sends SIGBUS to any programs that had those files mapped to tell
>    them "Your data is gone, we've stabilized the storage you had
>    mapped."
> 6. app does whatever it needs to recover
> 
> Apps shouldn't have to do this punch-and-reallocate dance, seeing as
> they don't currently do that for SCSI disks and the like.

The SCSI disk retains the error until the sector is rewritten.
I'm not entirely sure whether you're trying to draw an analogy with
error-in-page-cache or error-on-storage-medium.

error-on-medium needs to persist until the app takes an affirmative step
to clear it.  I presume XFS does not write zeroes to sectors with
errors on SCSI disks ...

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] Make the memory failure blast radius more precise
  2020-06-24  4:32   ` David Rientjes
@ 2020-06-24 20:57     ` Jane Chu
  2020-06-24 22:01       ` David Rientjes
  2020-06-25  2:16     ` HORIGUCHI NAOYA(堀口 直也)
  1 sibling, 1 reply; 18+ messages in thread
From: Jane Chu @ 2020-06-24 20:57 UTC (permalink / raw)
  To: David Rientjes, Luck, Tony, Mike Kravetz, Dr. David Alan Gilbert,
	Peter Xu, Andrea Arcangeli
  Cc: Matthew Wilcox, Borislav Petkov, Naoya Horiguchi, linux-edac,
	linux-mm, linux-nvdimm, Darrick J. Wong

Hi, David,

On 6/23/2020 9:32 PM, David Rientjes wrote:
> I don't want to hijack Matthew's thread which is primarily about DAX, but
> did get intrigued by your concerns about hugetlbfs page poisoning.  We can
> fork the thread off here to discuss only the hugetlb application of this
> if it makes sense to you or you'd like to collaborate on it as well.

hugetlbfs is not supported in DAX, are you planning to add support?

thanks,
-jane

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] Make the memory failure blast radius more precise
  2020-06-23 22:40       ` Matthew Wilcox
  2020-06-24  0:01         ` Darrick J. Wong
@ 2020-06-24 21:22         ` Jane Chu
  2020-06-25  0:13           ` Luck, Tony
  1 sibling, 1 reply; 18+ messages in thread
From: Jane Chu @ 2020-06-24 21:22 UTC (permalink / raw)
  To: Matthew Wilcox, Luck, Tony
  Cc: Borislav Petkov, Naoya Horiguchi, linux-edac, linux-mm,
	linux-nvdimm, Darrick J. Wong, David Rientjes, Mike Kravetz,
	Dr. David Alan Gilbert, Peter Xu, Andrea Arcangeli

On 6/23/2020 3:40 PM, Matthew Wilcox wrote:
> On Tue, Jun 23, 2020 at 03:26:58PM -0700, Luck, Tony wrote:
>> On Tue, Jun 23, 2020 at 11:17:41PM +0100, Matthew Wilcox wrote:
>>> It might also be nice to have an madvise() MADV_ZERO option so the
>>> application doesn't have to look up the fd associated with that memory
>>> range, but we haven't floated that idea with the customer yet; I just
>>> thought of it now.
>>
>> So the conversation between OS and kernel goes like this?
>>
>> 1) machine check
>> 2) Kernel unmaps the 4K page surroundinng the poison and sends
>>     SIGBUS to the application to say that one cache line is gone
>> 3) App says madvise(MADV_ZERO, that cache line)
>> 4) Kernel says ... "oh, you know how to deal with this" and allocates
>>     a new page, copying the 63 good cache lines from the old page and
>>     zeroing the missing one. New page is mapped to user.
> 
> That could be one way of implementing it.  My understanding is that
> pmem devices will reallocate bad cachelines on writes, so a better
> implementation would be:
> 
> 1) Kernel receives machine check
> 2) Kernel sends SIGBUS to the application
> 3) App send madvise(MADV_ZERO, addr, 1 << granularity)
> 4) Kernel does special writes to ensure the cacheline is zeroed
> 5) App does whatever it needs to recover (reconstructs the data or marks
> it as gone)

Thanks Matthew!

Both the RFC patch and the above 5-step recovery plan look neat, step 4) 
is nice to carry forward on icelake when a single instruction to clear
poison is available.

Next, what are the preferred ways to deal with the signal handling race 
when multiple processes are sharing the poisoned pmem page?

Also, is it advisable to application to ignore SIGBUS with MCEERR_AO?

> 
>> Do you have folks lined up to use that?  I don't know that many
>> folks are even catching the SIGBUS :-(
> 
> Had a 75 minute meeting with some people who want to use pmem this
> afternoon ...
>
thanks!
-jane

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] Make the memory failure blast radius more precise
  2020-06-24 20:57     ` Jane Chu
@ 2020-06-24 22:01       ` David Rientjes
  0 siblings, 0 replies; 18+ messages in thread
From: David Rientjes @ 2020-06-24 22:01 UTC (permalink / raw)
  To: Jane Chu
  Cc: Luck, Tony, Mike Kravetz, Dr. David Alan Gilbert, Peter Xu,
	Andrea Arcangeli, Matthew Wilcox, Borislav Petkov,
	Naoya Horiguchi, linux-edac, linux-mm, linux-nvdimm,
	Darrick J. Wong

On Wed, 24 Jun 2020, Jane Chu wrote:

> Hi, David,
> 
> On 6/23/2020 9:32 PM, David Rientjes wrote:
> > I don't want to hijack Matthew's thread which is primarily about DAX, but
> > did get intrigued by your concerns about hugetlbfs page poisoning.  We can
> > fork the thread off here to discuss only the hugetlb application of this
> > if it makes sense to you or you'd like to collaborate on it as well.
> 
> hugetlbfs is not supported in DAX, are you planning to add support?
> 

No, sorry, I was replying only about Tony's comment about the current page 
poisoning behavior for hugetlbfs and brainstorming a way that we could 
reduce the memory failure blast radius for hugetlb pages if we had the 
doublemap support that would also be useful for post-copy live migration.

It's independent of DAX so I'm happy to fork this topic off into a 
different thread for the hugetlb discussion.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] Make the memory failure blast radius more precise
  2020-06-24 12:10           ` Matthew Wilcox
@ 2020-06-24 23:21             ` Dan Williams
  2020-06-25  0:17               ` Matthew Wilcox
  0 siblings, 1 reply; 18+ messages in thread
From: Dan Williams @ 2020-06-24 23:21 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Darrick J. Wong, Luck, Tony, Borislav Petkov, Naoya Horiguchi,
	linux-edac, Linux MM, linux-nvdimm

On Wed, Jun 24, 2020 at 5:10 AM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Tue, Jun 23, 2020 at 05:01:24PM -0700, Darrick J. Wong wrote:
> > On Tue, Jun 23, 2020 at 11:40:27PM +0100, Matthew Wilcox wrote:
> > > On Tue, Jun 23, 2020 at 03:26:58PM -0700, Luck, Tony wrote:
> > > > On Tue, Jun 23, 2020 at 11:17:41PM +0100, Matthew Wilcox wrote:
> > > > > It might also be nice to have an madvise() MADV_ZERO option so the
> > > > > application doesn't have to look up the fd associated with that memory
> > > > > range, but we haven't floated that idea with the customer yet; I just
> > > > > thought of it now.
> > > >
> > > > So the conversation between OS and kernel goes like this?
> > > >
> > > > 1) machine check
> > > > 2) Kernel unmaps the 4K page surroundinng the poison and sends
> > > >    SIGBUS to the application to say that one cache line is gone
> > > > 3) App says madvise(MADV_ZERO, that cache line)
> > > > 4) Kernel says ... "oh, you know how to deal with this" and allocates
> > > >    a new page, copying the 63 good cache lines from the old page and
> > > >    zeroing the missing one. New page is mapped to user.
> > >
> > > That could be one way of implementing it.  My understanding is that
> > > pmem devices will reallocate bad cachelines on writes, so a better
> > > implementation would be:
> > >
> > > 1) Kernel receives machine check
> > > 2) Kernel sends SIGBUS to the application
> > > 3) App send madvise(MADV_ZERO, addr, 1 << granularity)
> > > 4) Kernel does special writes to ensure the cacheline is zeroed
> > > 5) App does whatever it needs to recover (reconstructs the data or marks
> > > it as gone)
> >
> > Frankly, I've wondered why the filesystem shouldn't just be in charge of
> > all this--
> >
> > 1. kernel receives machine check
> > 2. kernel tattles to xfs
> > 3. xfs looks up which file(s) own the pmem range
> > 4. xfs zeroes the region, clears the poison, and sets AS_EIO on the
> >    files
>
> ... machine reboots, app restarts, gets no notification anything is wrong,
> treats zeroed region as good data, launches nuclear missiles.

Isn't AS_EIO stored persistently in the file block allocation map?
Even if it isn't today that is included in the proposal that the
filesystem maintains a list of poison that is coordinated with the
pmem driver.

>
> > 5. xfs sends SIGBUS to any programs that had those files mapped to tell
> >    them "Your data is gone, we've stabilized the storage you had
> >    mapped."
> > 6. app does whatever it needs to recover
> >
> > Apps shouldn't have to do this punch-and-reallocate dance, seeing as
> > they don't currently do that for SCSI disks and the like.
>
> The SCSI disk retains the error until the sector is rewritten.
> I'm not entirely sure whether you're trying to draw an analogy with
> error-in-page-cache or error-on-storage-medium.
>
> error-on-medium needs to persist until the app takes an affirmative step
> to clear it.  I presume XFS does not write zeroes to sectors with
> errors on SCSI disks ...

SCSI does not have an async mechanism to retrieve a list of poisoned
blocks from the hardware (that I know of), pmem does. I really think
we should not glom on pmem error handling semantics on top of the same
infrastructure that it has handling volatile / replaceable pages. When
the filesystem is enabled to get involved it should impose a different
model than generic memory error handling especially because generic
memory-error handling has no chance to solve the reflink problem.

If an application wants to survive poison consumption, signals seem
only sufficient for interrupting an application that needs to take
immediate action because one of its instructions was prevented from
making forward progress. The interface for enumerating the extent of
errors for DAX goes beyond what signinfo can reasonably convey, that
piece is where the filesystem can be called to discover which file
extents are impacted by poison.

I like Darrick's idea that the kernel stabilizes the storage by
default, and that the repair mechanism is just a write(2). I assume
"stabilize" means make sure that the file offset is permanently
recorded as poisoned until the next write(2), but read(2) and mmap(2)
return errors so no more machine checks are triggered.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [RFC] Make the memory failure blast radius more precise
  2020-06-24 21:22         ` Jane Chu
@ 2020-06-25  0:13           ` Luck, Tony
  2020-06-25 16:23             ` Jane Chu
  0 siblings, 1 reply; 18+ messages in thread
From: Luck, Tony @ 2020-06-25  0:13 UTC (permalink / raw)
  To: Jane Chu, Matthew Wilcox
  Cc: Borislav Petkov, Naoya Horiguchi, linux-edac, linux-mm,
	linux-nvdimm, Darrick J. Wong, David Rientjes, Mike Kravetz,
	Dr. David Alan Gilbert, Peter Xu, Andrea Arcangeli

> Both the RFC patch and the above 5-step recovery plan look neat, step 4) 
> is nice to carry forward on icelake when a single instruction to clear
> poison is available.

Jane,

Clearing poison has some challenges.

On persistent memory it probably works (as the DIMM is going to remap that address to a different
part of the media to avoid the bad spot).

On DDR memory you'd need to decide whether the problem was transient, so that a simple
overwrite fixes the problem. Or persistent ... in which case the problem will likely come back
with the right data pattern.  To tell that you may need to run some memory test on the affected
area.

If the error was just in a 4K page, I'd be inclined to copy the good data to a new page and
map that in instead. Throwing away one 4K page isn't likely to be painful.

If it is in a 2M/1G page ... perhaps it is worth the effort and risk of trying to clear the poison
in place to avoid the pain of breaking up a large page.

-Tony

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] Make the memory failure blast radius more precise
  2020-06-24 23:21             ` Dan Williams
@ 2020-06-25  0:17               ` Matthew Wilcox
  2020-06-25  1:18                 ` Dan Williams
  0 siblings, 1 reply; 18+ messages in thread
From: Matthew Wilcox @ 2020-06-25  0:17 UTC (permalink / raw)
  To: Dan Williams
  Cc: Darrick J. Wong, Luck, Tony, Borislav Petkov, Naoya Horiguchi,
	linux-edac, Linux MM, linux-nvdimm

On Wed, Jun 24, 2020 at 04:21:24PM -0700, Dan Williams wrote:
> On Wed, Jun 24, 2020 at 5:10 AM Matthew Wilcox <willy@infradead.org> wrote:
> > On Tue, Jun 23, 2020 at 05:01:24PM -0700, Darrick J. Wong wrote:
> > > Frankly, I've wondered why the filesystem shouldn't just be in charge of
> > > all this--
> > >
> > > 1. kernel receives machine check
> > > 2. kernel tattles to xfs
> > > 3. xfs looks up which file(s) own the pmem range
> > > 4. xfs zeroes the region, clears the poison, and sets AS_EIO on the
> > >    files
> >
> > ... machine reboots, app restarts, gets no notification anything is wrong,
> > treats zeroed region as good data, launches nuclear missiles.
> 
> Isn't AS_EIO stored persistently in the file block allocation map?

No.  AS_EIO is in mapping->flags.  Unless Darrick was using "sets AS_EIO"
as shorthand for something else.

> Even if it isn't today that is included in the proposal that the
> filesystem maintains a list of poison that is coordinated with the
> pmem driver.

I'd like to see a concrete proposal here.

> > > Apps shouldn't have to do this punch-and-reallocate dance, seeing as
> > > they don't currently do that for SCSI disks and the like.
> >
> > The SCSI disk retains the error until the sector is rewritten.
> > I'm not entirely sure whether you're trying to draw an analogy with
> > error-in-page-cache or error-on-storage-medium.
> >
> > error-on-medium needs to persist until the app takes an affirmative step
> > to clear it.  I presume XFS does not write zeroes to sectors with
> > errors on SCSI disks ...
> 
> SCSI does not have an async mechanism to retrieve a list of poisoned
> blocks from the hardware (that I know of), pmem does. I really think
> we should not glom on pmem error handling semantics on top of the same
> infrastructure that it has handling volatile / replaceable pages. When

Erm ... commit 6100e34b2526 has your name on it.

> the filesystem is enabled to get involved it should impose a different
> model than generic memory error handling especially because generic
> memory-error handling has no chance to solve the reflink problem.
> 
> If an application wants to survive poison consumption, signals seem
> only sufficient for interrupting an application that needs to take
> immediate action because one of its instructions was prevented from
> making forward progress. The interface for enumerating the extent of
> errors for DAX goes beyond what signinfo can reasonably convey, that
> piece is where the filesystem can be called to discover which file
> extents are impacted by poison.
> 
> I like Darrick's idea that the kernel stabilizes the storage by
> default, and that the repair mechanism is just a write(2). I assume
> "stabilize" means make sure that the file offset is permanently
> recorded as poisoned until the next write(2), but read(2) and mmap(2)
> return errors so no more machine checks are triggered.

That seems like something we'd want to work into the iomap infrastructure,
perhaps.  Add an IOMAP_POISONED to indicate this range needs to be
written before it can be read?


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] Make the memory failure blast radius more precise
  2020-06-25  0:17               ` Matthew Wilcox
@ 2020-06-25  1:18                 ` Dan Williams
  0 siblings, 0 replies; 18+ messages in thread
From: Dan Williams @ 2020-06-25  1:18 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Darrick J. Wong, Luck, Tony, Borislav Petkov, Naoya Horiguchi,
	linux-edac, Linux MM, linux-nvdimm

On Wed, Jun 24, 2020 at 5:17 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Wed, Jun 24, 2020 at 04:21:24PM -0700, Dan Williams wrote:
> > On Wed, Jun 24, 2020 at 5:10 AM Matthew Wilcox <willy@infradead.org> wrote:
> > > On Tue, Jun 23, 2020 at 05:01:24PM -0700, Darrick J. Wong wrote:
> > > > Frankly, I've wondered why the filesystem shouldn't just be in charge of
> > > > all this--
> > > >
> > > > 1. kernel receives machine check
> > > > 2. kernel tattles to xfs
> > > > 3. xfs looks up which file(s) own the pmem range
> > > > 4. xfs zeroes the region, clears the poison, and sets AS_EIO on the
> > > >    files
> > >
> > > ... machine reboots, app restarts, gets no notification anything is wrong,
> > > treats zeroed region as good data, launches nuclear missiles.
> >
> > Isn't AS_EIO stored persistently in the file block allocation map?
>
> No.  AS_EIO is in mapping->flags.  Unless Darrick was using "sets AS_EIO"
> as shorthand for something else.
>
> > Even if it isn't today that is included in the proposal that the
> > filesystem maintains a list of poison that is coordinated with the
> > pmem driver.
>
> I'd like to see a concrete proposal here.

There's still details to work through with respect to reflink. The
latest discussion was that thread I linked about how to solve the
page->index collision [1] for reverse mapping pages to files.

[1]: https://lore.kernel.org/linux-ext4/20200311063942.GE10776@dread.disaster.area/

>
> > > > Apps shouldn't have to do this punch-and-reallocate dance, seeing as
> > > > they don't currently do that for SCSI disks and the like.
> > >
> > > The SCSI disk retains the error until the sector is rewritten.
> > > I'm not entirely sure whether you're trying to draw an analogy with
> > > error-in-page-cache or error-on-storage-medium.
> > >
> > > error-on-medium needs to persist until the app takes an affirmative step
> > > to clear it.  I presume XFS does not write zeroes to sectors with
> > > errors on SCSI disks ...
> >
> > SCSI does not have an async mechanism to retrieve a list of poisoned
> > blocks from the hardware (that I know of), pmem does. I really think
> > we should not glom on pmem error handling semantics on top of the same
> > infrastructure that it has handling volatile / replaceable pages. When
>
> Erm ... commit 6100e34b2526 has your name on it.

Yes, and we're having this conversation because it turns out
mm/memory-failure.c enabling for DAX is insufficient.

>
> > the filesystem is enabled to get involved it should impose a different
> > model than generic memory error handling especially because generic
> > memory-error handling has no chance to solve the reflink problem.
> >
> > If an application wants to survive poison consumption, signals seem
> > only sufficient for interrupting an application that needs to take
> > immediate action because one of its instructions was prevented from
> > making forward progress. The interface for enumerating the extent of
> > errors for DAX goes beyond what signinfo can reasonably convey, that
> > piece is where the filesystem can be called to discover which file
> > extents are impacted by poison.
> >
> > I like Darrick's idea that the kernel stabilizes the storage by
> > default, and that the repair mechanism is just a write(2). I assume
> > "stabilize" means make sure that the file offset is permanently
> > recorded as poisoned until the next write(2), but read(2) and mmap(2)
> > return errors so no more machine checks are triggered.
>
> That seems like something we'd want to work into the iomap infrastructure,
> perhaps.  Add an IOMAP_POISONED to indicate this range needs to be
> written before it can be read?

Yes, an explicit error state for an extent range is needed for the fs
to offload the raw hardware poison list into software tracking.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] Make the memory failure blast radius more precise
  2020-06-24  4:32   ` David Rientjes
  2020-06-24 20:57     ` Jane Chu
@ 2020-06-25  2:16     ` HORIGUCHI NAOYA(堀口 直也)
  1 sibling, 0 replies; 18+ messages in thread
From: HORIGUCHI NAOYA(堀口 直也) @ 2020-06-25  2:16 UTC (permalink / raw)
  To: David Rientjes
  Cc: Luck, Tony, Mike Kravetz, Dr. David Alan Gilbert, Peter Xu,
	Andrea Arcangeli, Matthew Wilcox, Borislav Petkov, linux-edac,
	linux-mm, linux-nvdimm, Darrick J. Wong, Jane Chu

On Tue, Jun 23, 2020 at 09:32:41PM -0700, David Rientjes wrote:
> On Tue, 23 Jun 2020, Luck, Tony wrote:
> 
> > > Hardware actually tells us the blast radius of the error, but we ignore
> > > it and take out the entire page.  We've had a customer request to know
> > > exactly how much of the page is damaged so they can avoid reconstructing
> > > an entire 2MB page if only a single cacheline is damaged.
> > > 
> > > This is only a strawman that I did in an hour or two; I'd appreciate
> > > architectural-level feedback.  Should I just convert memory_failure() to
> > > always take an address & granularity?  Should I create a struct to pass
> > > around (page, phys, granularity) instead of reconstructing the missing
> > > pieces in half a dozen functions?  Is this functionality welcome at all,
> > > or is the risk of upsetting applications which expect at least a page
> > > of granularity too high?
> > 
> > What is the interface to these applications that want finer granularity?
> > 
> > Current code does very poorly with hugetlbfs pages ... user loses the
> > whole 2 MB or 1GB. That's just silly (though I've been told that it is
> > hard to fix because allowing a hugetlbfs page to be broken up at an arbitrary
> > time as the result of a mahcine check means that the kernel needs locking
> > around a bunch of fas paths that currently assume that a huge page will
> > stay being a huge page).
> > 
> 
> Thanks for bringing this up, Tony.  Mike Kravetz pointed me to this thread 
> (thanks Mike!) so let's add him in explicitly as well as Andrea, Peter, 
> and David from Red Hat who we've been discussing an idea with that may 
> introduce exactly this needed support but for different purposes :)  The 
> timing of this thread is _uncanny_.
> 
> To improve the performance of userfaultfd for the purposes of post-copy 
> live migration we need to reduce the granularity in which pages are 
> migrated; we're looking at this from a 1GB gigantic page perspective but 
> the same arguments can likely be had for 2MB hugepages as well.  1GB pages 
> are too much of a bottleneck and, as you bring up, 1GB is simply too much 
> memory to poison :)  We don't have 1GB thp support so the big idea was to 
> introduce thp-like DoubleMap support into hugetlbfs for the purposes of 
> post-copy live migration and then I had the idea that this could be 
> extended to memory failure as well.
> 
> (We don't see the lack of 1GB thp here as a deficiency for anything other 
> than these two issues, hugetlb provides strong guarantees.)
> 
> I don't want to hijack Matthew's thread which is primarily about DAX, but 
> did get intrigued by your concerns about hugetlbfs page poisoning.  We can 
> fork the thread off here to discuss only the hugetlb application of this 
> if it makes sense to you or you'd like to collaborate on it as well.
> 
> The DoubleMap support would allow us to map the 1GB gigantic pages with 
> the PUD and the PMDs as well (and, further, the 2MB hugepages with the PMD 
> and PTEs) so that we can copy fragments into PMDs or PTEs and we don't 
> need to migrate the entire gigantic page.  Any access triggers #PF through 
> hugetlb_no_page() -> handle_userfault() which would trigger another 
> UFFDIO_COPY and map another fragment.
>
> Assume a world where this DoubleMap support already exists for hugetlb 
> pages today and all the invariants including page migration are fixed up 
> (since a PTE can now map a hugetlb page and a PMD can now map a gigantic 
> hugetlb page).  It *seems* like we'd be able to reduce the blast radius 
> here too on a hard memory failure: dissolve the gigantic page in place, 
> SIGBUS/SIGKILL on the bad PMD or PTE, and avoid poisoning the head of the 
> hugetlb page.  We agree that poisoning this large amount of memory is not 
> ideal :)
> 
> Anyway, this was some brainstorming that I was doing with Mike and the 
> others based on the idea of using DoubleMap support for post-copy live 
> migration.  If you would be interested or would like to collaborate on 
> it, we'd love to talk.

Thanks for proposing. I think that DoubleMap support could be a good
solution generally (not only for the usecase of post-copy live migration).
Splitting pud/pmd entry into pmd/pte entry makes smaller impact than migrating
all healthy data to somewhere else.  The implementation could be challenging
but not so as thp splitting because we don't have to consider collapsing.

Dax mapping seems to have similar issue. If we can share pmd mapping and pte
mapping to a dax file and covert the pmd mapping into pte mapping, we could
contain errors in smaller granularity for pmem.

Thanks,
Naoya Horiguchi

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] Make the memory failure blast radius more precise
  2020-06-25  0:13           ` Luck, Tony
@ 2020-06-25 16:23             ` Jane Chu
  0 siblings, 0 replies; 18+ messages in thread
From: Jane Chu @ 2020-06-25 16:23 UTC (permalink / raw)
  To: Luck, Tony, Matthew Wilcox
  Cc: Borislav Petkov, Naoya Horiguchi, linux-edac, linux-mm,
	linux-nvdimm, Darrick J. Wong, David Rientjes, Mike Kravetz,
	Dr. David Alan Gilbert, Peter Xu, Andrea Arcangeli

On 6/24/2020 5:13 PM, Luck, Tony wrote:
>> Both the RFC patch and the above 5-step recovery plan look neat, step 4)
>> is nice to carry forward on icelake when a single instruction to clear
>> poison is available.
> 
> Jane,
> 
> Clearing poison has some challenges.
> 
> On persistent memory it probably works (as the DIMM is going to remap that address to a different
> part of the media to avoid the bad spot).
> 
> On DDR memory you'd need to decide whether the problem was transient, so that a simple
> overwrite fixes the problem. Or persistent ... in which case the problem will likely come back
> with the right data pattern.  To tell that you may need to run some memory test on the affected
> area.
> 
> If the error was just in a 4K page, I'd be inclined to copy the good data to a new page and
> map that in instead. Throwing away one 4K page isn't likely to be painful.
> 
> If it is in a 2M/1G page ... perhaps it is worth the effort and risk of trying to clear the poison
> in place to avoid the pain of breaking up a large page.

Thanks!  Yes I was only thinking about persistent memory, but 
memory_failure_dev_pagemap() applies to DDR as well depends on the 
underlying technology. In our use case, even if the error was just in a 
4K page, we'd like to clear the poison and reuse the page to maintain a 
contiguous 256MB extent in the filesystem.  Perhaps it is better to 
leave that to the filesystem and driver.

Regards,
-jane

> 
> -Tony
> 

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2020-06-25 16:26 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-06-23 20:17 [RFC] Make the memory failure blast radius more precise Matthew Wilcox
2020-06-23 21:48 ` Dan Williams
2020-06-23 22:04 ` Luck, Tony
2020-06-23 22:17   ` Matthew Wilcox
2020-06-23 22:26     ` Luck, Tony
2020-06-23 22:40       ` Matthew Wilcox
2020-06-24  0:01         ` Darrick J. Wong
2020-06-24 12:10           ` Matthew Wilcox
2020-06-24 23:21             ` Dan Williams
2020-06-25  0:17               ` Matthew Wilcox
2020-06-25  1:18                 ` Dan Williams
2020-06-24 21:22         ` Jane Chu
2020-06-25  0:13           ` Luck, Tony
2020-06-25 16:23             ` Jane Chu
2020-06-24  4:32   ` David Rientjes
2020-06-24 20:57     ` Jane Chu
2020-06-24 22:01       ` David Rientjes
2020-06-25  2:16     ` HORIGUCHI NAOYA(堀口 直也)

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).