Linux-ARM-Kernel Archive on lore.kernel.org
 help / color / Atom feed
From: James Morse <james.morse@arm.com>
To: linux-acpi@vger.kernel.org
Cc: Rafael Wysocki <rjw@rjwysocki.net>,
	Tony Luck <tony.luck@intel.com>, Xie XiuQi <xiexiuqi@huawei.com>,
	Marc Zyngier <marc.zyngier@arm.com>,
	Catalin Marinas <catalin.marinas@arm.com>,
	Will Deacon <will.deacon@arm.com>,
	Christoffer Dall <christoffer.dall@arm.com>,
	Dongjiu Geng <gengdongjiu@huawei.com>,
	linux-mm@kvack.org, Borislav Petkov <bp@alien8.de>,
	james.morse@arm.com, Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>,
	kvmarm@lists.cs.columbia.edu,
	linux-arm-kernel@lists.infradead.org, Len Brown <lenb@kernel.org>
Subject: [PATCH v8 23/26] ACPI / APEI: Kick the memory_failure() queue for synchronous errors
Date: Tue, 29 Jan 2019 18:48:59 +0000
Message-ID: <20190129184902.102850-24-james.morse@arm.com> (raw)
In-Reply-To: <20190129184902.102850-1-james.morse@arm.com>

memory_failure() offlines or repairs pages of memory that have been
discovered to be corrupt. These may be detected by an external
component, (e.g. the memory controller), and notified via an IRQ.
In this case the work is queued as not all of memory_failure()s work
can happen in IRQ context.

If the error was detected as a result of user-space accessing a
corrupt memory location the CPU may take an abort instead. On arm64
this is a 'synchronous external abort', and on a firmware first
system it is replayed using NOTIFY_SEA.

This notification has NMI like properties, (it can interrupt
IRQ-masked code), so the memory_failure() work is queued. If we
return to user-space before the queued memory_failure() work is
processed, we will take the fault again. This loop may cause platform
firmware to exceed some threshold and reboot when Linux could have
recovered from this error.

For NMIlike notifications keep track of whether memory_failure() work
was queued, and make task_work pending to flush out the queue.
To save memory allocations, the task_work is allocated as part of
the ghes_estatus_node, and free()ing it back to the pool is deferred.

Signed-off-by: James Morse <james.morse@arm.com>

---
current->mm == &init_mm ? I couldn't find a helper for this.
The intent is not to set TIF flags on kernel threads. What happens
if a kernel-thread takes on of these? Its just one of the many
not-handled-very-well cases we have already, as memory_failure()
puts it: "try to be lucky".

I assume that if NOTIFY_NMI is coming from SMM it must suffer from
this problem too.

Changes since v7:
 * Don't allocate memory, stuff it in estatus_node. This means passing
   back a 'queued' flag to ghes_proc_in_irq() which holds the estatus_node.
---
 drivers/acpi/apei/ghes.c | 68 +++++++++++++++++++++++++++++++++-------
 include/acpi/ghes.h      |  3 ++
 2 files changed, 60 insertions(+), 11 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index e6f0d176b245..dfa8f155f964 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -47,6 +47,7 @@
 #include <linux/sched/clock.h>
 #include <linux/uuid.h>
 #include <linux/ras.h>
+#include <linux/task_work.h>
 
 #include <acpi/actbl1.h>
 #include <acpi/ghes.h>
@@ -399,23 +400,46 @@ static void ghes_clear_estatus(struct ghes *ghes,
 		ghes_ack_error(ghes->generic_v2);
 }
 
-static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int sev)
+/*
+ * Called as task_work before returning to user-space.
+ * Ensure any queued work has been done before we return to the context that
+ * triggered the notification.
+ */
+static void ghes_kick_task_work(struct callback_head *head)
+{
+	struct acpi_hest_generic_status *estatus;
+	struct ghes_estatus_node *estatus_node;
+	u32 node_len;
+
+	estatus_node = container_of(head, struct ghes_estatus_node, task_work);
+	memory_failure_queue_kick(estatus_node->task_work_cpu);
+
+	estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
+	node_len = GHES_ESTATUS_NODE_LEN(cper_estatus_len(estatus));
+	gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node, node_len);
+}
+
+static bool ghes_handle_memory_failure(struct ghes *ghes,
+				       struct acpi_hest_generic_data *gdata,
+				       int sev)
 {
-#ifdef CONFIG_ACPI_APEI_MEMORY_FAILURE
 	unsigned long pfn;
 	int flags = -1;
 	int sec_sev = ghes_severity(gdata->error_severity);
 	struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
 
+	if (!IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE))
+		return false;
+
 	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
-		return;
+		return false;
 
 	pfn = mem_err->physical_addr >> PAGE_SHIFT;
 	if (!pfn_valid(pfn)) {
 		pr_warn_ratelimited(FW_WARN GHES_PFX
 		"Invalid address in generic error data: %#llx\n",
 		mem_err->physical_addr);
-		return;
+		return false;
 	}
 
 	/* iff following two events can be handled properly by now */
@@ -425,9 +449,12 @@ static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int
 	if (sev == GHES_SEV_RECOVERABLE && sec_sev == GHES_SEV_RECOVERABLE)
 		flags = 0;
 
-	if (flags != -1)
+	if (flags != -1) {
 		memory_failure_queue(pfn, flags);
-#endif
+		return true;
+	}
+
+	return false;
 }
 
 /*
@@ -475,11 +502,12 @@ static void ghes_handle_aer(struct acpi_hest_generic_data *gdata)
 #endif
 }
 
-static void ghes_do_proc(struct ghes *ghes,
+static bool ghes_do_proc(struct ghes *ghes,
 			 const struct acpi_hest_generic_status *estatus)
 {
 	int sev, sec_sev;
 	struct acpi_hest_generic_data *gdata;
+	bool work_queued = false;
 	guid_t *sec_type;
 	guid_t *fru_id = &NULL_UUID_LE;
 	char *fru_text = "";
@@ -500,7 +528,8 @@ static void ghes_do_proc(struct ghes *ghes,
 			ghes_edac_report_mem_error(sev, mem_err);
 
 			arch_apei_report_mem_error(sev, mem_err);
-			ghes_handle_memory_failure(gdata, sev);
+			if (ghes_handle_memory_failure(ghes, gdata, sev))
+				work_queued = true;
 		}
 		else if (guid_equal(sec_type, &CPER_SEC_PCIE)) {
 			ghes_handle_aer(gdata);
@@ -517,6 +546,8 @@ static void ghes_do_proc(struct ghes *ghes,
 					       gdata->error_data_length);
 		}
 	}
+
+	return work_queued;
 }
 
 static void __ghes_print_estatus(const char *pfx,
@@ -812,7 +843,9 @@ static void ghes_proc_in_irq(struct irq_work *irq_work)
 	struct ghes_estatus_node *estatus_node;
 	struct acpi_hest_generic *generic;
 	struct acpi_hest_generic_status *estatus;
+	bool task_work_pending;
 	u32 len, node_len;
+	int ret;
 
 	llnode = llist_del_all(&ghes_estatus_llist);
 	/*
@@ -827,14 +860,26 @@ static void ghes_proc_in_irq(struct irq_work *irq_work)
 		estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
 		len = cper_estatus_len(estatus);
 		node_len = GHES_ESTATUS_NODE_LEN(len);
-		ghes_do_proc(estatus_node->ghes, estatus);
+		task_work_pending = ghes_do_proc(estatus_node->ghes, estatus);
 		if (!ghes_estatus_cached(estatus)) {
 			generic = estatus_node->generic;
 			if (ghes_print_estatus(NULL, generic, estatus))
 				ghes_estatus_cache_add(generic, estatus);
 		}
-		gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node,
-			      node_len);
+
+		if (task_work_pending && current->mm != &init_mm) {
+			estatus_node->task_work.func = ghes_kick_task_work;
+			estatus_node->task_work_cpu = smp_processor_id();
+			ret = task_work_add(current, &estatus_node->task_work,
+					    true);
+			if (ret)
+				estatus_node->task_work.func = NULL;
+		}
+
+		if (!estatus_node->task_work.func)
+			gen_pool_free(ghes_estatus_pool,
+				      (unsigned long)estatus_node, node_len);
+
 		llnode = next;
 	}
 }
@@ -894,6 +939,7 @@ static int ghes_in_nmi_queue_one_entry(struct ghes *ghes,
 
 	estatus_node->ghes = ghes;
 	estatus_node->generic = ghes->generic;
+	estatus_node->task_work.func = NULL;
 	estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
 
 	if (__ghes_read_estatus(estatus, buf_paddr, fixmap_idx, len)) {
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index e3f1cddb4ac8..517a5231cc1b 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -33,6 +33,9 @@ struct ghes_estatus_node {
 	struct llist_node llnode;
 	struct acpi_hest_generic *generic;
 	struct ghes *ghes;
+
+	int task_work_cpu;
+	struct callback_head task_work;
 };
 
 struct ghes_estatus_cache {
-- 
2.20.1


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

  parent reply index

Thread overview: 36+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-01-29 18:48 [PATCH v8 00/26] APEI in_nmi() rework and SDEI wire-up James Morse
2019-01-29 18:48 ` [PATCH v8 01/26] ACPI / APEI: Don't wait to serialise with oops messages when panic()ing James Morse
2019-01-29 18:48 ` [PATCH v8 02/26] ACPI / APEI: Remove silent flag from ghes_read_estatus() James Morse
2019-01-29 18:48 ` [PATCH v8 03/26] ACPI / APEI: Switch estatus pool to use vmalloc memory James Morse
2019-01-29 18:48 ` [PATCH v8 04/26] ACPI / APEI: Make hest.c manage the estatus memory pool James Morse
2019-02-01 13:20   ` Borislav Petkov
2019-01-29 18:48 ` [PATCH v8 05/26] ACPI / APEI: Make estatus pool allocation a static size James Morse
2019-01-29 18:48 ` [PATCH v8 06/26] ACPI / APEI: Don't store CPER records physical address in struct ghes James Morse
2019-01-29 18:48 ` [PATCH v8 07/26] ACPI / APEI: Remove spurious GHES_TO_CLEAR check James Morse
2019-01-29 18:48 ` [PATCH v8 08/26] ACPI / APEI: Don't update struct ghes' flags in read/clear estatus James Morse
2019-01-29 18:48 ` [PATCH v8 09/26] ACPI / APEI: Generalise the estatus queue's notify code James Morse
2019-02-01 13:46   ` Borislav Petkov
2019-01-29 18:48 ` [PATCH v8 10/26] ACPI / APEI: Don't allow ghes_ack_error() to mask earlier errors James Morse
2019-01-29 18:48 ` [PATCH v8 11/26] ACPI / APEI: Move NOTIFY_SEA between the estatus-queue and NOTIFY_NMI James Morse
2019-01-29 18:48 ` [PATCH v8 12/26] ACPI / APEI: Switch NOTIFY_SEA to use the estatus queue James Morse
2019-01-29 18:48 ` [PATCH v8 13/26] KVM: arm/arm64: Add kvm_ras.h to collect kvm specific RAS plumbing James Morse
2019-01-29 18:48 ` [PATCH v8 14/26] arm64: KVM/mm: Move SEA handling behind a single 'claim' interface James Morse
2019-01-29 18:48 ` [PATCH v8 15/26] ACPI / APEI: Move locking to the notification helper James Morse
2019-01-29 18:48 ` [PATCH v8 16/26] ACPI / APEI: Let the notification helper specify the fixmap slot James Morse
2019-01-29 18:48 ` [PATCH v8 17/26] ACPI / APEI: Pass ghes and estatus separately to avoid a later copy James Morse
2019-01-29 18:48 ` [PATCH v8 18/26] ACPI / APEI: Make GHES estatus header validation more user friendly James Morse
2019-02-01 14:30   ` Borislav Petkov
2019-01-29 18:48 ` [PATCH v8 19/26] ACPI / APEI: Split ghes_read_estatus() to allow a peek at the CPER length James Morse
2019-01-29 18:48 ` [PATCH v8 20/26] ACPI / APEI: Only use queued estatus entry during in_nmi_queue_one_entry() James Morse
2019-01-29 18:48 ` [PATCH v8 21/26] ACPI / APEI: Use separate fixmap pages for arm64 NMI-like notifications James Morse
2019-01-29 18:48 ` [PATCH v8 22/26] mm/memory-failure: Add memory_failure_queue_kick() James Morse
2019-01-29 18:48 ` James Morse [this message]
2019-01-29 18:49 ` [PATCH v8 24/26] arm64: acpi: Make apei_claim_sea() synchronise with APEI's irq work James Morse
2019-01-30  8:56   ` Julien Thierry
2019-01-29 18:49 ` [PATCH v8 25/26] firmware: arm_sdei: Add ACPI GHES registration helper James Morse
2019-01-29 18:49 ` [PATCH v8 26/26] ACPI / APEI: Add support for the SDEI GHES Notification type James Morse
2019-02-08 11:40 ` [PATCH v8 00/26] APEI in_nmi() rework and SDEI wire-up Rafael J. Wysocki
2019-02-08 14:13   ` James Morse
2019-02-11 11:05     ` Rafael J. Wysocki
2019-02-11 18:35       ` James Morse
2019-02-12 22:14         ` Rafael J. Wysocki

Reply instructions:

You may reply publically to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190129184902.102850-24-james.morse@arm.com \
    --to=james.morse@arm.com \
    --cc=bp@alien8.de \
    --cc=catalin.marinas@arm.com \
    --cc=christoffer.dall@arm.com \
    --cc=gengdongjiu@huawei.com \
    --cc=kvmarm@lists.cs.columbia.edu \
    --cc=lenb@kernel.org \
    --cc=linux-acpi@vger.kernel.org \
    --cc=linux-arm-kernel@lists.infradead.org \
    --cc=linux-mm@kvack.org \
    --cc=marc.zyngier@arm.com \
    --cc=n-horiguchi@ah.jp.nec.com \
    --cc=rjw@rjwysocki.net \
    --cc=tony.luck@intel.com \
    --cc=will.deacon@arm.com \
    --cc=xiexiuqi@huawei.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Linux-ARM-Kernel Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-arm-kernel/0 linux-arm-kernel/git/0.git
	git clone --mirror https://lore.kernel.org/linux-arm-kernel/1 linux-arm-kernel/git/1.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-arm-kernel linux-arm-kernel/ https://lore.kernel.org/linux-arm-kernel \
		linux-arm-kernel@lists.infradead.org infradead-linux-arm-kernel@archiver.kernel.org
	public-inbox-index linux-arm-kernel


Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.infradead.lists.linux-arm-kernel


AGPL code for this site: git clone https://public-inbox.org/ public-inbox