From: Matthew Wilcox <willy@infradead.org>
To: Tony Luck <tony.luck@intel.com>, Borislav Petkov <bp@alien8.de>,
Naoya Horiguchi <naoya.horiguchi@nec.com>,
linux-edac@vger.kernel.org, linux-mm@kvack.org,
linux-nvdimm@lists.01.org,
"Darrick J. Wong" <darrick.wong@oracle.com>,
Jane Chu <jane.chu@oracle.com>
Subject: [RFC] Make the memory failure blast radius more precise
Date: Tue, 23 Jun 2020 21:17:45 +0100 [thread overview]
Message-ID: <20200623201745.GG21350@casper.infradead.org> (raw)
Hardware actually tells us the blast radius of the error, but we ignore
it and take out the entire page. We've had a customer request to know
exactly how much of the page is damaged so they can avoid reconstructing
an entire 2MB page if only a single cacheline is damaged.
This is only a strawman that I did in an hour or two; I'd appreciate
architectural-level feedback. Should I just convert memory_failure() to
always take an address & granularity? Should I create a struct to pass
around (page, phys, granularity) instead of reconstructing the missing
pieces in half a dozen functions? Is this functionality welcome at all,
or is the risk of upsetting applications which expect at least a page
of granularity too high?
I can see places where I've specified a plain PAGE_SHIFT instead of
interrogating a compound page for its size. I'd probably split this
patch up into two or three pieces for applying.
I've also blindly taken out the call to unmap_mapping_range(). Again,
the customer requested that we not do this. That deserves to be in its
own patch and properly justified.
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index ce9120c4f740..09978c5b9493 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -463,7 +463,7 @@ static void mce_irq_work_cb(struct irq_work *entry)
* Check if the address reported by the CPU is in a format we can parse.
* It would be possible to add code for most other cases, but all would
* be somewhat complicated (e.g. segment offset would require an instruction
- * parser). So only support physical addresses up to page granuality for now.
+ * parser). So only support physical addresses up to page granularity for now.
*/
int mce_usable_address(struct mce *m)
{
@@ -570,7 +570,6 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
struct mce *mce = (struct mce *)data;
- unsigned long pfn;
if (!mce || !mce_usable_address(mce))
return NOTIFY_DONE;
@@ -579,9 +578,8 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
mce->severity != MCE_DEFERRED_SEVERITY)
return NOTIFY_DONE;
- pfn = mce->addr >> PAGE_SHIFT;
- if (!memory_failure(pfn, 0)) {
- set_mce_nospec(pfn, whole_page(mce));
+ if (!__memory_failure(mce->addr, MCI_MISC_ADDR_LSB(mce->misc), 0)) {
+ set_mce_nospec(mce->addr >> PAGE_SHIFT, whole_page(mce));
mce->kflags |= MCE_HANDLED_UC;
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index dc7b87310c10..f2ea6491df28 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3035,7 +3035,14 @@ enum mf_flags {
MF_MUST_KILL = 1 << 2,
MF_SOFT_OFFLINE = 1 << 3,
};
-extern int memory_failure(unsigned long pfn, int flags);
+
+int __memory_failure(unsigned long addr, unsigned int granularity, int flags);
+
+static inline int memory_failure(unsigned long pfn, int flags)
+{
+ return __memory_failure(pfn << PAGE_SHIFT, PAGE_SHIFT, flags);
+}
+
extern void memory_failure_queue(unsigned long pfn, int flags);
extern void memory_failure_queue_kick(int cpu);
extern int unpoison_memory(unsigned long pfn);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 47b8ccb1fb9b..775b053dcd98 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -198,7 +198,7 @@ struct to_kill {
struct list_head nd;
struct task_struct *tsk;
unsigned long addr;
- short size_shift;
+ short granularity;
};
/*
@@ -209,7 +209,7 @@ struct to_kill {
static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
{
struct task_struct *t = tk->tsk;
- short addr_lsb = tk->size_shift;
+ short addr_lsb = tk->granularity;
int ret = 0;
pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
@@ -262,40 +262,6 @@ void shake_page(struct page *p, int access)
}
EXPORT_SYMBOL_GPL(shake_page);
-static unsigned long dev_pagemap_mapping_shift(struct page *page,
- struct vm_area_struct *vma)
-{
- unsigned long address = vma_address(page, vma);
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
- pgd = pgd_offset(vma->vm_mm, address);
- if (!pgd_present(*pgd))
- return 0;
- p4d = p4d_offset(pgd, address);
- if (!p4d_present(*p4d))
- return 0;
- pud = pud_offset(p4d, address);
- if (!pud_present(*pud))
- return 0;
- if (pud_devmap(*pud))
- return PUD_SHIFT;
- pmd = pmd_offset(pud, address);
- if (!pmd_present(*pmd))
- return 0;
- if (pmd_devmap(*pmd))
- return PMD_SHIFT;
- pte = pte_offset_map(pmd, address);
- if (!pte_present(*pte))
- return 0;
- if (pte_devmap(*pte))
- return PAGE_SHIFT;
- return 0;
-}
-
/*
* Failure handling: if we can't find or can't kill a process there's
* not much we can do. We just print a message and ignore otherwise.
@@ -306,8 +272,8 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page,
* Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
*/
static void add_to_kill(struct task_struct *tsk, struct page *p,
- struct vm_area_struct *vma,
- struct list_head *to_kill)
+ unsigned int offset, unsigned int granularity,
+ struct vm_area_struct *vma, struct list_head *to_kill)
{
struct to_kill *tk;
@@ -318,27 +284,14 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
}
tk->addr = page_address_in_vma(p, vma);
- if (is_zone_device_page(p))
- tk->size_shift = dev_pagemap_mapping_shift(p, vma);
- else
- tk->size_shift = page_shift(compound_head(p));
+ tk->granularity = granularity;
- /*
- * Send SIGKILL if "tk->addr == -EFAULT". Also, as
- * "tk->size_shift" is always non-zero for !is_zone_device_page(),
- * so "tk->size_shift == 0" effectively checks no mapping on
- * ZONE_DEVICE. Indeed, when a devdax page is mmapped N times
- * to a process' address space, it's possible not all N VMAs
- * contain mappings for the page, but at least one VMA does.
- * Only deliver SIGBUS with payload derived from the VMA that
- * has a mapping for the page.
- */
+ /* Send SIGKILL if "tk->addr == -EFAULT". */
if (tk->addr == -EFAULT) {
pr_info("Memory failure: Unable to find user space address %lx in %s\n",
page_to_pfn(p), tsk->comm);
- } else if (tk->size_shift == 0) {
- kfree(tk);
- return;
+ } else {
+ tk->addr += offset;
}
get_task_struct(tsk);
@@ -468,7 +421,8 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
if (!page_mapped_in_vma(page, vma))
continue;
if (vma->vm_mm == t->mm)
- add_to_kill(t, page, vma, to_kill);
+ add_to_kill(t, page, 0, PAGE_SHIFT, vma,
+ to_kill);
}
}
read_unlock(&tasklist_lock);
@@ -478,8 +432,8 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
/*
* Collect processes when the error hit a file mapped page.
*/
-static void collect_procs_file(struct page *page, struct list_head *to_kill,
- int force_early)
+static void collect_procs_file(struct page *page, unsigned int offset,
+ short granularity, struct list_head *to_kill, int force_early)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
@@ -503,7 +457,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
* to be informed of all such data corruptions.
*/
if (vma->vm_mm == t->mm)
- add_to_kill(t, page, vma, to_kill);
+ add_to_kill(t, page, 0, PAGE_SHIFT, vma,
+ to_kill);
}
}
read_unlock(&tasklist_lock);
@@ -513,8 +468,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
/*
* Collect the processes who have the corrupted page mapped to kill.
*/
-static void collect_procs(struct page *page, struct list_head *tokill,
- int force_early)
+static void collect_procs(struct page *page, unsigned int offset,
+ short granularity, struct list_head *tokill, int force_early)
{
if (!page->mapping)
return;
@@ -522,7 +477,8 @@ static void collect_procs(struct page *page, struct list_head *tokill,
if (PageAnon(page))
collect_procs_anon(page, tokill, force_early);
else
- collect_procs_file(page, tokill, force_early);
+ collect_procs_file(page, offset, granularity, tokill,
+ force_early);
}
static const char *action_name[] = {
@@ -1026,7 +982,8 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
* there's nothing that can be done.
*/
if (kill)
- collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
+ collect_procs(hpage, 0, PAGE_SHIFT, &tokill,
+ flags & MF_ACTION_REQUIRED);
if (!PageHuge(hpage)) {
unmap_success = try_to_unmap(hpage, ttu);
@@ -1176,16 +1133,14 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
return res;
}
-static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
- struct dev_pagemap *pgmap)
+static int memory_failure_dev_pagemap(unsigned long addr, short granularity,
+ int flags, struct dev_pagemap *pgmap)
{
+ unsigned long pfn = addr >> PAGE_SHIFT;
struct page *page = pfn_to_page(pfn);
const bool unmap_success = true;
- unsigned long size = 0;
- struct to_kill *tk;
LIST_HEAD(tokill);
int rc = -EBUSY;
- loff_t start;
dax_entry_t cookie;
/*
@@ -1225,21 +1180,8 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
* SIGBUS (i.e. MF_MUST_KILL)
*/
flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
- collect_procs(page, &tokill, flags & MF_ACTION_REQUIRED);
-
- list_for_each_entry(tk, &tokill, nd)
- if (tk->size_shift)
- size = max(size, 1UL << tk->size_shift);
- if (size) {
- /*
- * Unmap the largest mapping to avoid breaking up
- * device-dax mappings which are constant size. The
- * actual size of the mapping being torn down is
- * communicated in siginfo, see kill_proc()
- */
- start = (page->index << PAGE_SHIFT) & ~(size - 1);
- unmap_mapping_range(page->mapping, start, start + size, 0);
- }
+ collect_procs(page, offset_in_page(addr), granularity, &tokill,
+ flags & MF_ACTION_REQUIRED);
kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
rc = 0;
unlock:
@@ -1252,8 +1194,9 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
}
/**
- * memory_failure - Handle memory failure of a page.
- * @pfn: Page Number of the corrupted page
+ * __memory_failure - Handle memory failure.
+ * @addr: Physical address of the error.
+ * @granularity: Number of bits in the address which are bad.
* @flags: fine tune action taken
*
* This function is called by the low level machine check code
@@ -1268,8 +1211,9 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
* Must run in process context (e.g. a work queue) with interrupts
* enabled and no spinlocks hold.
*/
-int memory_failure(unsigned long pfn, int flags)
+int __memory_failure(unsigned long addr, unsigned int granularity, int flags)
{
+ unsigned long pfn = addr >> PAGE_SHIFT;
struct page *p;
struct page *hpage;
struct page *orig_head;
@@ -1285,8 +1229,8 @@ int memory_failure(unsigned long pfn, int flags)
if (pfn_valid(pfn)) {
pgmap = get_dev_pagemap(pfn, NULL);
if (pgmap)
- return memory_failure_dev_pagemap(pfn, flags,
- pgmap);
+ return memory_failure_dev_pagemap(addr,
+ granularity, flags, pgmap);
}
pr_err("Memory failure: %#lx: memory outside kernel control\n",
pfn);
@@ -1442,7 +1386,7 @@ int memory_failure(unsigned long pfn, int flags)
unlock_page(p);
return res;
}
-EXPORT_SYMBOL_GPL(memory_failure);
+EXPORT_SYMBOL_GPL(__memory_failure);
#define MEMORY_FAILURE_FIFO_ORDER 4
#define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER)
next reply other threads:[~2020-06-23 21:26 UTC|newest]
Thread overview: 18+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-06-23 20:17 Matthew Wilcox [this message]
2020-06-23 21:48 ` [RFC] Make the memory failure blast radius more precise Dan Williams
2020-06-23 22:04 ` Luck, Tony
2020-06-23 22:17 ` Matthew Wilcox
2020-06-23 22:26 ` Luck, Tony
2020-06-23 22:40 ` Matthew Wilcox
2020-06-24 0:01 ` Darrick J. Wong
2020-06-24 12:10 ` Matthew Wilcox
2020-06-24 23:21 ` Dan Williams
2020-06-25 0:17 ` Matthew Wilcox
2020-06-25 1:18 ` Dan Williams
2020-06-24 21:22 ` Jane Chu
2020-06-25 0:13 ` Luck, Tony
2020-06-25 16:23 ` Jane Chu
2020-06-24 4:32 ` David Rientjes
2020-06-24 20:57 ` Jane Chu
2020-06-24 22:01 ` David Rientjes
2020-06-25 2:16 ` HORIGUCHI NAOYA(堀口 直也)
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20200623201745.GG21350@casper.infradead.org \
--to=willy@infradead.org \
--cc=bp@alien8.de \
--cc=darrick.wong@oracle.com \
--cc=jane.chu@oracle.com \
--cc=linux-edac@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=linux-nvdimm@lists.01.org \
--cc=naoya.horiguchi@nec.com \
--cc=tony.luck@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).