linuxppc-dev.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/2] powerpc/mce: Avoid using irq_work_queue() in realmode
@ 2021-11-08  8:38 Ganesh Goudar
  2021-11-08  8:38 ` [PATCH 2/2] pseries/mce: Refactor the pseries mce handling code Ganesh Goudar
                   ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: Ganesh Goudar @ 2021-11-08  8:38 UTC (permalink / raw)
  To: linuxppc-dev, mpe; +Cc: Ganesh Goudar, mahesh, npiggin

In realmode mce handler we use irq_work_queue() to defer
the processing of mce events, irq_work_queue() can only
be called when translation is enabled because it touches
memory outside RMA, hence we enable translation before
calling irq_work_queue and disable on return, though it
is not safe to do in realmode.

To avoid this, program the decrementer and call the event
processing functions from timer handler.

Signed-off-by: Ganesh Goudar <ganeshgr@linux.ibm.com>
---
 arch/powerpc/include/asm/machdep.h       |  2 +
 arch/powerpc/include/asm/mce.h           |  2 +
 arch/powerpc/include/asm/paca.h          |  1 +
 arch/powerpc/kernel/mce.c                | 51 +++++++++++-------------
 arch/powerpc/kernel/time.c               |  3 ++
 arch/powerpc/platforms/pseries/pseries.h |  1 +
 arch/powerpc/platforms/pseries/ras.c     | 31 +-------------
 arch/powerpc/platforms/pseries/setup.c   |  1 +
 8 files changed, 34 insertions(+), 58 deletions(-)

diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index 764f2732a821..c89cc03c0f97 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -103,6 +103,8 @@ struct machdep_calls {
 	/* Called during machine check exception to retrive fixup address. */
 	bool		(*mce_check_early_recovery)(struct pt_regs *regs);
 
+	void            (*machine_check_log_err)(void);
+
 	/* Motherboard/chipset features. This is a kind of general purpose
 	 * hook used to control some machine specific features (like reset
 	 * lines, chip power control, etc...).
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 331d944280b8..187810f13669 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -235,8 +235,10 @@ extern void machine_check_print_event_info(struct machine_check_event *evt,
 unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr);
 extern void mce_common_process_ue(struct pt_regs *regs,
 				  struct mce_error_info *mce_err);
+extern void machine_check_raise_dec_intr(void);
 int mce_register_notifier(struct notifier_block *nb);
 int mce_unregister_notifier(struct notifier_block *nb);
+void mce_run_late_handlers(void);
 #ifdef CONFIG_PPC_BOOK3S_64
 void flush_and_reload_slb(void);
 void flush_erat(void);
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index dc05a862e72a..f49180f8c9be 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -280,6 +280,7 @@ struct paca_struct {
 #endif
 #ifdef CONFIG_PPC_BOOK3S_64
 	struct mce_info *mce_info;
+	atomic_t mces_to_process;
 #endif /* CONFIG_PPC_BOOK3S_64 */
 } ____cacheline_aligned;
 
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index fd829f7f25a4..45baa062ebc0 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -28,19 +28,9 @@
 
 #include "setup.h"
 
-static void machine_check_process_queued_event(struct irq_work *work);
-static void machine_check_ue_irq_work(struct irq_work *work);
 static void machine_check_ue_event(struct machine_check_event *evt);
 static void machine_process_ue_event(struct work_struct *work);
 
-static struct irq_work mce_event_process_work = {
-        .func = machine_check_process_queued_event,
-};
-
-static struct irq_work mce_ue_event_irq_work = {
-	.func = machine_check_ue_irq_work,
-};
-
 static DECLARE_WORK(mce_ue_event_work, machine_process_ue_event);
 
 static BLOCKING_NOTIFIER_HEAD(mce_notifier_list);
@@ -89,6 +79,12 @@ static void mce_set_error_info(struct machine_check_event *mce,
 	}
 }
 
+/* Raise decrementer interrupt */
+void machine_check_raise_dec_intr(void)
+{
+	set_dec(1);
+}
+
 /*
  * Decode and save high level MCE information into per cpu buffer which
  * is an array of machine_check_event structure.
@@ -135,6 +131,8 @@ void save_mce_event(struct pt_regs *regs, long handled,
 	if (mce->error_type == MCE_ERROR_TYPE_UE)
 		mce->u.ue_error.ignore_event = mce_err->ignore_event;
 
+	atomic_inc(&local_paca->mces_to_process);
+
 	if (!addr)
 		return;
 
@@ -217,7 +215,7 @@ void release_mce_event(void)
 	get_mce_event(NULL, true);
 }
 
-static void machine_check_ue_irq_work(struct irq_work *work)
+static void machine_check_ue_work(void)
 {
 	schedule_work(&mce_ue_event_work);
 }
@@ -239,7 +237,7 @@ static void machine_check_ue_event(struct machine_check_event *evt)
 	       evt, sizeof(*evt));
 
 	/* Queue work to process this event later. */
-	irq_work_queue(&mce_ue_event_irq_work);
+	machine_check_raise_dec_intr();
 }
 
 /*
@@ -249,7 +247,6 @@ void machine_check_queue_event(void)
 {
 	int index;
 	struct machine_check_event evt;
-	unsigned long msr;
 
 	if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
 		return;
@@ -263,20 +260,7 @@ void machine_check_queue_event(void)
 	memcpy(&local_paca->mce_info->mce_event_queue[index],
 	       &evt, sizeof(evt));
 
-	/*
-	 * Queue irq work to process this event later. Before
-	 * queuing the work enable translation for non radix LPAR,
-	 * as irq_work_queue may try to access memory outside RMO
-	 * region.
-	 */
-	if (!radix_enabled() && firmware_has_feature(FW_FEATURE_LPAR)) {
-		msr = mfmsr();
-		mtmsr(msr | MSR_IR | MSR_DR);
-		irq_work_queue(&mce_event_process_work);
-		mtmsr(msr);
-	} else {
-		irq_work_queue(&mce_event_process_work);
-	}
+	machine_check_raise_dec_intr();
 }
 
 void mce_common_process_ue(struct pt_regs *regs,
@@ -338,7 +322,7 @@ static void machine_process_ue_event(struct work_struct *work)
  * process pending MCE event from the mce event queue. This function will be
  * called during syscall exit.
  */
-static void machine_check_process_queued_event(struct irq_work *work)
+static void machine_check_process_queued_event(void)
 {
 	int index;
 	struct machine_check_event *evt;
@@ -363,6 +347,17 @@ static void machine_check_process_queued_event(struct irq_work *work)
 	}
 }
 
+void mce_run_late_handlers(void)
+{
+	if (unlikely(atomic_read(&local_paca->mces_to_process))) {
+		if (ppc_md.machine_check_log_err)
+			ppc_md.machine_check_log_err();
+		machine_check_process_queued_event();
+		machine_check_ue_work();
+		atomic_dec(&local_paca->mces_to_process);
+	}
+}
+
 void machine_check_print_event_info(struct machine_check_event *evt,
 				    bool user_mode, bool in_guest)
 {
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 934d8ae66cc6..2dc09d75d77c 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -597,6 +597,9 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(timer_interrupt)
 		irq_work_run();
 	}
 
+#ifdef CONFIG_PPC_BOOK3S_64
+	mce_run_late_handlers();
+#endif
 	now = get_tb();
 	if (now >= *next_tb) {
 		*next_tb = ~(u64)0;
diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
index 3544778e06d0..0dc4f1027b30 100644
--- a/arch/powerpc/platforms/pseries/pseries.h
+++ b/arch/powerpc/platforms/pseries/pseries.h
@@ -21,6 +21,7 @@ struct pt_regs;
 extern int pSeries_system_reset_exception(struct pt_regs *regs);
 extern int pSeries_machine_check_exception(struct pt_regs *regs);
 extern long pseries_machine_check_realmode(struct pt_regs *regs);
+extern void pSeries_machine_check_log_err(void);
 
 #ifdef CONFIG_SMP
 extern void smp_init_pseries(void);
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index 56092dccfdb8..8613f9cc5798 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -23,11 +23,6 @@ static DEFINE_SPINLOCK(ras_log_buf_lock);
 
 static int ras_check_exception_token;
 
-static void mce_process_errlog_event(struct irq_work *work);
-static struct irq_work mce_errlog_process_work = {
-	.func = mce_process_errlog_event,
-};
-
 #define EPOW_SENSOR_TOKEN	9
 #define EPOW_SENSOR_INDEX	0
 
@@ -729,40 +724,16 @@ static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
 	error_type = mce_log->error_type;
 
 	disposition = mce_handle_err_realmode(disposition, error_type);
-
-	/*
-	 * Enable translation as we will be accessing per-cpu variables
-	 * in save_mce_event() which may fall outside RMO region, also
-	 * leave it enabled because subsequently we will be queuing work
-	 * to workqueues where again per-cpu variables accessed, besides
-	 * fwnmi_release_errinfo() crashes when called in realmode on
-	 * pseries.
-	 * Note: All the realmode handling like flushing SLB entries for
-	 *       SLB multihit is done by now.
-	 */
 out:
-	msr = mfmsr();
-	mtmsr(msr | MSR_IR | MSR_DR);
-
 	disposition = mce_handle_err_virtmode(regs, errp, mce_log,
 					      disposition);
-
-	/*
-	 * Queue irq work to log this rtas event later.
-	 * irq_work_queue uses per-cpu variables, so do this in virt
-	 * mode as well.
-	 */
-	irq_work_queue(&mce_errlog_process_work);
-
-	mtmsr(msr);
-
 	return disposition;
 }
 
 /*
  * Process MCE rtas errlog event.
  */
-static void mce_process_errlog_event(struct irq_work *work)
+void pSeries_machine_check_log_err(void)
 {
 	struct rtas_error_log *err;
 
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index f79126f16258..54bd7bdb7e92 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -1085,6 +1085,7 @@ define_machine(pseries) {
 	.system_reset_exception = pSeries_system_reset_exception,
 	.machine_check_early	= pseries_machine_check_realmode,
 	.machine_check_exception = pSeries_machine_check_exception,
+	.machine_check_log_err	= pSeries_machine_check_log_err,
 #ifdef CONFIG_KEXEC_CORE
 	.machine_kexec          = pSeries_machine_kexec,
 	.kexec_cpu_down         = pseries_kexec_cpu_down,
-- 
2.26.2


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 2/2] pseries/mce: Refactor the pseries mce handling code
  2021-11-08  8:38 [PATCH 1/2] powerpc/mce: Avoid using irq_work_queue() in realmode Ganesh Goudar
@ 2021-11-08  8:38 ` Ganesh Goudar
  2021-11-08 14:19 ` [PATCH 1/2] powerpc/mce: Avoid using irq_work_queue() in realmode Nicholas Piggin
  2021-11-12  7:12 ` Daniel Axtens
  2 siblings, 0 replies; 6+ messages in thread
From: Ganesh Goudar @ 2021-11-08  8:38 UTC (permalink / raw)
  To: linuxppc-dev, mpe; +Cc: Ganesh Goudar, mahesh, npiggin

Now that we are no longer switching on the mmu in realmode
mce handler, Revert the commit 4ff753feab02("powerpc/pseries:
Avoid using addr_to_pfn in real mode") partially, which
introduced functions mce_handle_err_virtmode/realmode() to
separate mce handler code which needed translation to enabled.

Signed-off-by: Ganesh Goudar <ganeshgr@linux.ibm.com>
---
 arch/powerpc/platforms/pseries/ras.c | 122 +++++++++++----------------
 1 file changed, 49 insertions(+), 73 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index 8613f9cc5798..62e1519b8355 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -511,58 +511,17 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
 	return 0; /* need to perform reset */
 }
 
-static int mce_handle_err_realmode(int disposition, u8 error_type)
-{
-#ifdef CONFIG_PPC_BOOK3S_64
-	if (disposition == RTAS_DISP_NOT_RECOVERED) {
-		switch (error_type) {
-		case	MC_ERROR_TYPE_ERAT:
-			flush_erat();
-			disposition = RTAS_DISP_FULLY_RECOVERED;
-			break;
-		case	MC_ERROR_TYPE_SLB:
-			/*
-			 * Store the old slb content in paca before flushing.
-			 * Print this when we go to virtual mode.
-			 * There are chances that we may hit MCE again if there
-			 * is a parity error on the SLB entry we trying to read
-			 * for saving. Hence limit the slb saving to single
-			 * level of recursion.
-			 */
-			if (local_paca->in_mce == 1)
-				slb_save_contents(local_paca->mce_faulty_slbs);
-			flush_and_reload_slb();
-			disposition = RTAS_DISP_FULLY_RECOVERED;
-			break;
-		default:
-			break;
-		}
-	} else if (disposition == RTAS_DISP_LIMITED_RECOVERY) {
-		/* Platform corrected itself but could be degraded */
-		pr_err("MCE: limited recovery, system may be degraded\n");
-		disposition = RTAS_DISP_FULLY_RECOVERED;
-	}
-#endif
-	return disposition;
-}
-
-static int mce_handle_err_virtmode(struct pt_regs *regs,
-				   struct rtas_error_log *errp,
-				   struct pseries_mc_errorlog *mce_log,
-				   int disposition)
+static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
 {
 	struct mce_error_info mce_err = { 0 };
+	unsigned long eaddr = 0, paddr = 0;
+	struct pseries_errorlog *pseries_log;
+	struct pseries_mc_errorlog *mce_log;
+	int disposition = rtas_error_disposition(errp);
 	int initiator = rtas_error_initiator(errp);
 	int severity = rtas_error_severity(errp);
-	unsigned long eaddr = 0, paddr = 0;
 	u8 error_type, err_sub_type;
 
-	if (!mce_log)
-		goto out;
-
-	error_type = mce_log->error_type;
-	err_sub_type = rtas_mc_error_sub_type(mce_log);
-
 	if (initiator == RTAS_INITIATOR_UNKNOWN)
 		mce_err.initiator = MCE_INITIATOR_UNKNOWN;
 	else if (initiator == RTAS_INITIATOR_CPU)
@@ -588,6 +547,8 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
 		mce_err.severity = MCE_SEV_SEVERE;
 	else if (severity == RTAS_SEVERITY_ERROR)
 		mce_err.severity = MCE_SEV_SEVERE;
+	else if (severity == RTAS_SEVERITY_FATAL)
+		mce_err.severity = MCE_SEV_FATAL;
 	else
 		mce_err.severity = MCE_SEV_FATAL;
 
@@ -599,7 +560,18 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
 	mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN;
 	mce_err.error_class = MCE_ECLASS_UNKNOWN;
 
-	switch (error_type) {
+	if (!rtas_error_extended(errp))
+		goto out;
+
+	pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
+	if (!pseries_log)
+		goto out;
+
+	mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
+	error_type = mce_log->error_type;
+	err_sub_type = rtas_mc_error_sub_type(mce_log);
+
+	switch (mce_log->error_type) {
 	case MC_ERROR_TYPE_UE:
 		mce_err.error_type = MCE_ERROR_TYPE_UE;
 		mce_common_process_ue(regs, &mce_err);
@@ -692,41 +664,45 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
 		mce_err.error_type = MCE_ERROR_TYPE_DCACHE;
 		break;
 	case MC_ERROR_TYPE_I_CACHE:
-		mce_err.error_type = MCE_ERROR_TYPE_ICACHE;
+		mce_err.error_type = MCE_ERROR_TYPE_DCACHE;
 		break;
 	case MC_ERROR_TYPE_UNKNOWN:
 	default:
 		mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN;
 		break;
 	}
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	if (disposition == RTAS_DISP_NOT_RECOVERED) {
+		switch (error_type) {
+		case	MC_ERROR_TYPE_SLB:
+		case	MC_ERROR_TYPE_ERAT:
+			/*
+			 * Store the old slb content in paca before flushing.
+			 * Print this when we go to virtual mode.
+			 * There are chances that we may hit MCE again if there
+			 * is a parity error on the SLB entry we trying to read
+			 * for saving. Hence limit the slb saving to single
+			 * level of recursion.
+			 */
+			if (local_paca->in_mce == 1)
+				slb_save_contents(local_paca->mce_faulty_slbs);
+			flush_and_reload_slb();
+			disposition = RTAS_DISP_FULLY_RECOVERED;
+			break;
+		default:
+			break;
+		}
+	} else if (disposition == RTAS_DISP_LIMITED_RECOVERY) {
+		/* Platform corrected itself but could be degraded */
+		pr_err("MCE: limited recovery, system may be degraded\n");
+		disposition = RTAS_DISP_FULLY_RECOVERED;
+	}
+#endif
 out:
 	save_mce_event(regs, disposition == RTAS_DISP_FULLY_RECOVERED,
-		       &mce_err, regs->nip, eaddr, paddr);
-	return disposition;
-}
+			&mce_err, regs->nip, eaddr, paddr);
 
-static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
-{
-	struct pseries_errorlog *pseries_log;
-	struct pseries_mc_errorlog *mce_log = NULL;
-	int disposition = rtas_error_disposition(errp);
-	unsigned long msr;
-	u8 error_type;
-
-	if (!rtas_error_extended(errp))
-		goto out;
-
-	pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
-	if (!pseries_log)
-		goto out;
-
-	mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
-	error_type = mce_log->error_type;
-
-	disposition = mce_handle_err_realmode(disposition, error_type);
-out:
-	disposition = mce_handle_err_virtmode(regs, errp, mce_log,
-					      disposition);
 	return disposition;
 }
 
-- 
2.26.2


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH 1/2] powerpc/mce: Avoid using irq_work_queue() in realmode
  2021-11-08  8:38 [PATCH 1/2] powerpc/mce: Avoid using irq_work_queue() in realmode Ganesh Goudar
  2021-11-08  8:38 ` [PATCH 2/2] pseries/mce: Refactor the pseries mce handling code Ganesh Goudar
@ 2021-11-08 14:19 ` Nicholas Piggin
  2021-11-18  9:51   ` Ganesh
  2021-11-12  7:12 ` Daniel Axtens
  2 siblings, 1 reply; 6+ messages in thread
From: Nicholas Piggin @ 2021-11-08 14:19 UTC (permalink / raw)
  To: Ganesh Goudar, linuxppc-dev, mpe; +Cc: mahesh

Excerpts from Ganesh Goudar's message of November 8, 2021 6:38 pm:
> In realmode mce handler we use irq_work_queue() to defer
> the processing of mce events, irq_work_queue() can only
> be called when translation is enabled because it touches
> memory outside RMA, hence we enable translation before
> calling irq_work_queue and disable on return, though it
> is not safe to do in realmode.
> 
> To avoid this, program the decrementer and call the event
> processing functions from timer handler.
> 
> Signed-off-by: Ganesh Goudar <ganeshgr@linux.ibm.com>
> ---
>  arch/powerpc/include/asm/machdep.h       |  2 +
>  arch/powerpc/include/asm/mce.h           |  2 +
>  arch/powerpc/include/asm/paca.h          |  1 +
>  arch/powerpc/kernel/mce.c                | 51 +++++++++++-------------
>  arch/powerpc/kernel/time.c               |  3 ++
>  arch/powerpc/platforms/pseries/pseries.h |  1 +
>  arch/powerpc/platforms/pseries/ras.c     | 31 +-------------
>  arch/powerpc/platforms/pseries/setup.c   |  1 +
>  8 files changed, 34 insertions(+), 58 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
> index 764f2732a821..c89cc03c0f97 100644
> --- a/arch/powerpc/include/asm/machdep.h
> +++ b/arch/powerpc/include/asm/machdep.h
> @@ -103,6 +103,8 @@ struct machdep_calls {
>  	/* Called during machine check exception to retrive fixup address. */
>  	bool		(*mce_check_early_recovery)(struct pt_regs *regs);
>  
> +	void            (*machine_check_log_err)(void);
> +
>  	/* Motherboard/chipset features. This is a kind of general purpose
>  	 * hook used to control some machine specific features (like reset
>  	 * lines, chip power control, etc...).
> diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
> index 331d944280b8..187810f13669 100644
> --- a/arch/powerpc/include/asm/mce.h
> +++ b/arch/powerpc/include/asm/mce.h
> @@ -235,8 +235,10 @@ extern void machine_check_print_event_info(struct machine_check_event *evt,
>  unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr);
>  extern void mce_common_process_ue(struct pt_regs *regs,
>  				  struct mce_error_info *mce_err);
> +extern void machine_check_raise_dec_intr(void);

No new externs on function declarations, they tell me.

>  int mce_register_notifier(struct notifier_block *nb);
>  int mce_unregister_notifier(struct notifier_block *nb);
> +void mce_run_late_handlers(void);
>  #ifdef CONFIG_PPC_BOOK3S_64
>  void flush_and_reload_slb(void);
>  void flush_erat(void);
> diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
> index dc05a862e72a..f49180f8c9be 100644
> --- a/arch/powerpc/include/asm/paca.h
> +++ b/arch/powerpc/include/asm/paca.h
> @@ -280,6 +280,7 @@ struct paca_struct {
>  #endif
>  #ifdef CONFIG_PPC_BOOK3S_64
>  	struct mce_info *mce_info;
> +	atomic_t mces_to_process;
>  #endif /* CONFIG_PPC_BOOK3S_64 */
>  } ____cacheline_aligned;
>  
> diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
> index fd829f7f25a4..45baa062ebc0 100644
> --- a/arch/powerpc/kernel/mce.c
> +++ b/arch/powerpc/kernel/mce.c
> @@ -28,19 +28,9 @@
>  
>  #include "setup.h"
>  
> -static void machine_check_process_queued_event(struct irq_work *work);
> -static void machine_check_ue_irq_work(struct irq_work *work);
>  static void machine_check_ue_event(struct machine_check_event *evt);
>  static void machine_process_ue_event(struct work_struct *work);
>  
> -static struct irq_work mce_event_process_work = {
> -        .func = machine_check_process_queued_event,
> -};
> -
> -static struct irq_work mce_ue_event_irq_work = {
> -	.func = machine_check_ue_irq_work,
> -};
> -
>  static DECLARE_WORK(mce_ue_event_work, machine_process_ue_event);
>  
>  static BLOCKING_NOTIFIER_HEAD(mce_notifier_list);
> @@ -89,6 +79,12 @@ static void mce_set_error_info(struct machine_check_event *mce,
>  	}
>  }
>  
> +/* Raise decrementer interrupt */
> +void machine_check_raise_dec_intr(void)
> +{
> +	set_dec(1);
> +}

The problem here is a timer can be scheduled (e.g., by an external 
interrupt if it gets taken before the decrementer, then uses a
timer) and that set decr > 1. See logic in decrementer_set_next_event.

I _think_ the way to get around this would be to have the machine check
just use arch_irq_work_raise.

Then you could also only call the mce handler inside the
test_irq_work_pending() check and avoid the added function call on every 
timer. That test should also be marked unlikely come to think of it, but
that's a side patchlet.

> +
>  /*
>   * Decode and save high level MCE information into per cpu buffer which
>   * is an array of machine_check_event structure.
> @@ -135,6 +131,8 @@ void save_mce_event(struct pt_regs *regs, long handled,
>  	if (mce->error_type == MCE_ERROR_TYPE_UE)
>  		mce->u.ue_error.ignore_event = mce_err->ignore_event;
>  
> +	atomic_inc(&local_paca->mces_to_process);
> +
>  	if (!addr)
>  		return;
>  
> @@ -217,7 +215,7 @@ void release_mce_event(void)
>  	get_mce_event(NULL, true);
>  }
>  
> -static void machine_check_ue_irq_work(struct irq_work *work)
> +static void machine_check_ue_work(void)
>  {
>  	schedule_work(&mce_ue_event_work);
>  }
> @@ -239,7 +237,7 @@ static void machine_check_ue_event(struct machine_check_event *evt)
>  	       evt, sizeof(*evt));
>  
>  	/* Queue work to process this event later. */
> -	irq_work_queue(&mce_ue_event_irq_work);
> +	machine_check_raise_dec_intr();
>  }
>  
>  /*
> @@ -249,7 +247,6 @@ void machine_check_queue_event(void)
>  {
>  	int index;
>  	struct machine_check_event evt;
> -	unsigned long msr;
>  
>  	if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
>  		return;
> @@ -263,20 +260,7 @@ void machine_check_queue_event(void)
>  	memcpy(&local_paca->mce_info->mce_event_queue[index],
>  	       &evt, sizeof(evt));
>  
> -	/*
> -	 * Queue irq work to process this event later. Before
> -	 * queuing the work enable translation for non radix LPAR,
> -	 * as irq_work_queue may try to access memory outside RMO
> -	 * region.
> -	 */
> -	if (!radix_enabled() && firmware_has_feature(FW_FEATURE_LPAR)) {
> -		msr = mfmsr();
> -		mtmsr(msr | MSR_IR | MSR_DR);
> -		irq_work_queue(&mce_event_process_work);
> -		mtmsr(msr);
> -	} else {
> -		irq_work_queue(&mce_event_process_work);
> -	}

Getting rid of these things would be very nice.

> +	machine_check_raise_dec_intr();
>  }
>  
>  void mce_common_process_ue(struct pt_regs *regs,
> @@ -338,7 +322,7 @@ static void machine_process_ue_event(struct work_struct *work)
>   * process pending MCE event from the mce event queue. This function will be
>   * called during syscall exit.
>   */
> -static void machine_check_process_queued_event(struct irq_work *work)
> +static void machine_check_process_queued_event(void)
>  {
>  	int index;
>  	struct machine_check_event *evt;
> @@ -363,6 +347,17 @@ static void machine_check_process_queued_event(struct irq_work *work)
>  	}
>  }
>  
> +void mce_run_late_handlers(void)
> +{
> +	if (unlikely(atomic_read(&local_paca->mces_to_process))) {
> +		if (ppc_md.machine_check_log_err)
> +			ppc_md.machine_check_log_err();
> +		machine_check_process_queued_event();
> +		machine_check_ue_work();
> +		atomic_dec(&local_paca->mces_to_process);
> +	}
> +}
> +
>  void machine_check_print_event_info(struct machine_check_event *evt,
>  				    bool user_mode, bool in_guest)
>  {
> diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
> index 934d8ae66cc6..2dc09d75d77c 100644
> --- a/arch/powerpc/kernel/time.c
> +++ b/arch/powerpc/kernel/time.c
> @@ -597,6 +597,9 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(timer_interrupt)
>  		irq_work_run();
>  	}
>  
> +#ifdef CONFIG_PPC_BOOK3S_64
> +	mce_run_late_handlers();
> +#endif

It looks like if (IS_ENABLED(CONFIG_PPC_BOOK3S_64)) should work here?

>  	now = get_tb();
>  	if (now >= *next_tb) {
>  		*next_tb = ~(u64)0;
> diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
> index 3544778e06d0..0dc4f1027b30 100644
> --- a/arch/powerpc/platforms/pseries/pseries.h
> +++ b/arch/powerpc/platforms/pseries/pseries.h
> @@ -21,6 +21,7 @@ struct pt_regs;
>  extern int pSeries_system_reset_exception(struct pt_regs *regs);
>  extern int pSeries_machine_check_exception(struct pt_regs *regs);
>  extern long pseries_machine_check_realmode(struct pt_regs *regs);
> +extern void pSeries_machine_check_log_err(void);

extern can be removed.

Thanks,
Nick

>  
>  #ifdef CONFIG_SMP
>  extern void smp_init_pseries(void);
> diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
> index 56092dccfdb8..8613f9cc5798 100644
> --- a/arch/powerpc/platforms/pseries/ras.c
> +++ b/arch/powerpc/platforms/pseries/ras.c
> @@ -23,11 +23,6 @@ static DEFINE_SPINLOCK(ras_log_buf_lock);
>  
>  static int ras_check_exception_token;
>  
> -static void mce_process_errlog_event(struct irq_work *work);
> -static struct irq_work mce_errlog_process_work = {
> -	.func = mce_process_errlog_event,
> -};
> -
>  #define EPOW_SENSOR_TOKEN	9
>  #define EPOW_SENSOR_INDEX	0
>  
> @@ -729,40 +724,16 @@ static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
>  	error_type = mce_log->error_type;
>  
>  	disposition = mce_handle_err_realmode(disposition, error_type);
> -
> -	/*
> -	 * Enable translation as we will be accessing per-cpu variables
> -	 * in save_mce_event() which may fall outside RMO region, also
> -	 * leave it enabled because subsequently we will be queuing work
> -	 * to workqueues where again per-cpu variables accessed, besides
> -	 * fwnmi_release_errinfo() crashes when called in realmode on
> -	 * pseries.
> -	 * Note: All the realmode handling like flushing SLB entries for
> -	 *       SLB multihit is done by now.
> -	 */
>  out:
> -	msr = mfmsr();
> -	mtmsr(msr | MSR_IR | MSR_DR);
> -
>  	disposition = mce_handle_err_virtmode(regs, errp, mce_log,
>  					      disposition);
> -
> -	/*
> -	 * Queue irq work to log this rtas event later.
> -	 * irq_work_queue uses per-cpu variables, so do this in virt
> -	 * mode as well.
> -	 */
> -	irq_work_queue(&mce_errlog_process_work);
> -
> -	mtmsr(msr);
> -
>  	return disposition;
>  }
>  
>  /*
>   * Process MCE rtas errlog event.
>   */
> -static void mce_process_errlog_event(struct irq_work *work)
> +void pSeries_machine_check_log_err(void)
>  {
>  	struct rtas_error_log *err;
>  
> diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
> index f79126f16258..54bd7bdb7e92 100644
> --- a/arch/powerpc/platforms/pseries/setup.c
> +++ b/arch/powerpc/platforms/pseries/setup.c
> @@ -1085,6 +1085,7 @@ define_machine(pseries) {
>  	.system_reset_exception = pSeries_system_reset_exception,
>  	.machine_check_early	= pseries_machine_check_realmode,
>  	.machine_check_exception = pSeries_machine_check_exception,
> +	.machine_check_log_err	= pSeries_machine_check_log_err,
>  #ifdef CONFIG_KEXEC_CORE
>  	.machine_kexec          = pSeries_machine_kexec,
>  	.kexec_cpu_down         = pseries_kexec_cpu_down,
> -- 
> 2.26.2
> 
> 

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 1/2] powerpc/mce: Avoid using irq_work_queue() in realmode
  2021-11-08  8:38 [PATCH 1/2] powerpc/mce: Avoid using irq_work_queue() in realmode Ganesh Goudar
  2021-11-08  8:38 ` [PATCH 2/2] pseries/mce: Refactor the pseries mce handling code Ganesh Goudar
  2021-11-08 14:19 ` [PATCH 1/2] powerpc/mce: Avoid using irq_work_queue() in realmode Nicholas Piggin
@ 2021-11-12  7:12 ` Daniel Axtens
  2021-11-18  9:44   ` Ganesh
  2 siblings, 1 reply; 6+ messages in thread
From: Daniel Axtens @ 2021-11-12  7:12 UTC (permalink / raw)
  To: Ganesh Goudar, linuxppc-dev, mpe; +Cc: Ganesh Goudar, mahesh, npiggin

Hi Ganesh,

> In realmode mce handler we use irq_work_queue() to defer
> the processing of mce events, irq_work_queue() can only
> be called when translation is enabled because it touches
> memory outside RMA, hence we enable translation before
> calling irq_work_queue and disable on return, though it
> is not safe to do in realmode.
>
> To avoid this, program the decrementer and call the event
> processing functions from timer handler.
>

This is an interesting approach, and it would indeed be nice to clear up
the MCE handling a bit.

I have a few questions:

> diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
> index 331d944280b8..187810f13669 100644
> --- a/arch/powerpc/include/asm/mce.h
> +++ b/arch/powerpc/include/asm/mce.h
> @@ -235,8 +235,10 @@ extern void machine_check_print_event_info(struct machine_check_event *evt,
>  unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr);
>  extern void mce_common_process_ue(struct pt_regs *regs,
>  				  struct mce_error_info *mce_err);
> +extern void machine_check_raise_dec_intr(void);

Does this need an extern? I think that's the default...?

>  int mce_register_notifier(struct notifier_block *nb);
>  int mce_unregister_notifier(struct notifier_block *nb);
> +void mce_run_late_handlers(void);
>  #ifdef CONFIG_PPC_BOOK3S_64
>  void flush_and_reload_slb(void);
>  void flush_erat(void);
> diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
> index dc05a862e72a..f49180f8c9be 100644
> --- a/arch/powerpc/include/asm/paca.h
> +++ b/arch/powerpc/include/asm/paca.h
> @@ -280,6 +280,7 @@ struct paca_struct {
>  #endif
>  #ifdef CONFIG_PPC_BOOK3S_64
>  	struct mce_info *mce_info;
> +	atomic_t mces_to_process;

This is in the PACA, which is supposed to be a per-cpu structure: hey
does it need to be atomic_t? Isn't there only one CPU accessing it?

Does this variable provide anything new compared to mce_nest_count or
mce_queue_count + mce_ue_count? It looks like
machine_check_process_queued_event will clear a queue based on value of
mce_queue_count and machine_check_process_ue_event will clear a queue
based on mce_ue_count...

I think (absent nested interrupts which I talk about below) it should be
the case that mces_to_process == mce_queue_count + mce_ue_count but I
might be wrong?

>  #endif /* CONFIG_PPC_BOOK3S_64 */
>  } ____cacheline_aligned;

  
>  /*
>   * Decode and save high level MCE information into per cpu buffer which
>   * is an array of machine_check_event structure.
> @@ -135,6 +131,8 @@ void save_mce_event(struct pt_regs *regs, long handled,
>  	if (mce->error_type == MCE_ERROR_TYPE_UE)
>  		mce->u.ue_error.ignore_event = mce_err->ignore_event;
>  
> +	atomic_inc(&local_paca->mces_to_process);
> +

Is there any chance the decrementer will fire between when you do this
atomic_inc() and when you finish adding all the information to the mce
data structure in the remainder of save_mce_event? (e.g. filling in the
tlb_errror.effective_address field)?

(Or does save_mce_event get called with interrupts masked? I find it
very hard to remember what parts of the MCE code path happen under what
circumstances!)

>  	if (!addr)
>  		return;
>  


> @@ -338,7 +322,7 @@ static void machine_process_ue_event(struct work_struct *work)
>   * process pending MCE event from the mce event queue. This function will be
>   * called during syscall exit.

Is this comment still accurate if this patch is applied?

>   */
> -static void machine_check_process_queued_event(struct irq_work *work)
> +static void machine_check_process_queued_event(void)
>  {
>  	int index;
>  	struct machine_check_event *evt;
> @@ -363,6 +347,17 @@ static void machine_check_process_queued_event(struct irq_work *work)
>  	}
>  }
>  
> +void mce_run_late_handlers(void)
> +{
> +	if (unlikely(atomic_read(&local_paca->mces_to_process))) {
> +		if (ppc_md.machine_check_log_err)
> +			ppc_md.machine_check_log_err();
> +		machine_check_process_queued_event();
> +		machine_check_ue_work();
> +		atomic_dec(&local_paca->mces_to_process);
> +	}
> +}

What happens if you get a nested MCE between log_err() and
process_queued_event()? If my very foggy memory of the MCE handling is
correct, we enable nested MCEs very early in the process because the
alternative is that a nested MCE will checkstop the box. So I think this
might be possible, albeit probably unlikely.

It looks like process_queued_event clears the entire MCE queue as
determined by the mce_queue_count. So, consider the following sequence
of events:

1. Take MCE 1. Save to queue, increment mce_queue_count, increment
   mces_to_process, set decrementer to fire.

2. Decrementer fires. mce_run_late_handlers is called.

3. mces_to_process = 1, so we call machine_check_log_err(), which prints
   (on pseries) the info for MCE 1.

4. Take MCE 2. This is saved to the queue, mce_queue_count is
   incremented, mces_to_process is incremented, and the decrementer is
   armed again.

5. We then leave the MCE interrupt context and return to the decrementer
   handling context. The next thing we do is we call
   m_c_e_process_queued_event(), which clears the entire queue (that is,
   MCEs 1 and 2):

	while (local_paca->mce_info->mce_queue_count > 0) {
		index = local_paca->mce_info->mce_queue_count - 1;
		evt = &local_paca->mce_info->mce_event_queue[index];

		if (evt->error_type == MCE_ERROR_TYPE_UE &&
		    evt->u.ue_error.ignore_event) {
			local_paca->mce_info->mce_queue_count--;
			continue;
		}
		machine_check_print_event_info(evt, false, false);
		local_paca->mce_info->mce_queue_count--;
	}

 6. We finish mce_run_late_handlers() and decrement mces_to_process,
    so it's now 1.

 7. The decrementer fires again, mces_to_process is 1, so we start
    processing again.

 8. We call machine_check_log_err again, it will now call the FWNMI code
    again and possibly print error 2.

 9. process_queued_event will be called again but mce_queue_count will
    be 0 so it it will bail out early.

I _think_ the worst that can happen - at least so long as pseries is the
only implementaion of machine_check_log_err - is that we will handle
MCE 2 before we query the firmware about it. That's probably benign, but
I am still concerned with the overall interaction around nested
interrupts.

>  void machine_check_print_event_info(struct machine_check_event *evt,
>  				    bool user_mode, bool in_guest)
>  {
> diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
> index 934d8ae66cc6..2dc09d75d77c 100644
> --- a/arch/powerpc/kernel/time.c
> +++ b/arch/powerpc/kernel/time.c
> @@ -597,6 +597,9 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(timer_interrupt)
>  		irq_work_run();
>  	}
>  
> +#ifdef CONFIG_PPC_BOOK3S_64
> +	mce_run_late_handlers();
> +#endif
>
So we're now branching to a function in a different file and doing an
atomic read in every timer interrupt. Is this a hot path? Is there any
speed implication to doing this?

>  	now = get_tb();
>  	if (now >= *next_tb) {
>  		*next_tb = ~(u64)0;


> @@ -729,40 +724,16 @@ static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
>  	error_type = mce_log->error_type;
>  
>  	disposition = mce_handle_err_realmode(disposition, error_type);
> -
> -	/*
> -	 * Enable translation as we will be accessing per-cpu variables
> -	 * in save_mce_event() which may fall outside RMO region, also
> -	 * leave it enabled because subsequently we will be queuing work
> -	 * to workqueues where again per-cpu variables accessed, besides
> -	 * fwnmi_release_errinfo() crashes when called in realmode on
> -	 * pseries.
> -	 * Note: All the realmode handling like flushing SLB entries for
> -	 *       SLB multihit is done by now.
> -	 */
>  out:
> -	msr = mfmsr();
> -	mtmsr(msr | MSR_IR | MSR_DR);
> -
>  	disposition = mce_handle_err_virtmode(regs, errp, mce_log,
>  					      disposition);

Now you are not in virtual mode/translations on when you are calling
mce_handle_err_virtmode(). From the name, I thought that
mce_handle_err_virtmode() would assume that you are in virtual mode?
Does the function assume that? If so is it safe to call it in real mode?
If not, should we rename it as part of this patch?

> -
> -	/*
> -	 * Queue irq work to log this rtas event later.
> -	 * irq_work_queue uses per-cpu variables, so do this in virt
> -	 * mode as well.
> -	 */
> -	irq_work_queue(&mce_errlog_process_work);
> -
> -	mtmsr(msr);
> -
>  	return disposition;
>  }

Kind regards,
Daniel

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 1/2] powerpc/mce: Avoid using irq_work_queue() in realmode
  2021-11-12  7:12 ` Daniel Axtens
@ 2021-11-18  9:44   ` Ganesh
  0 siblings, 0 replies; 6+ messages in thread
From: Ganesh @ 2021-11-18  9:44 UTC (permalink / raw)
  To: Daniel Axtens, linuxppc-dev, mpe; +Cc: mahesh, npiggin

[-- Attachment #1: Type: text/plain, Size: 9814 bytes --]

On 11/12/21 12:42, Daniel Axtens wrote:

>> In realmode mce handler we use irq_work_queue() to defer
>> the processing of mce events, irq_work_queue() can only
>> be called when translation is enabled because it touches
>> memory outside RMA, hence we enable translation before
>> calling irq_work_queue and disable on return, though it
>> is not safe to do in realmode.
>>
>> To avoid this, program the decrementer and call the event
>> processing functions from timer handler.
>>
> This is an interesting approach, and it would indeed be nice to clear up
> the MCE handling a bit.
>
> I have a few questions:
>
>> diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
>> index 331d944280b8..187810f13669 100644
>> --- a/arch/powerpc/include/asm/mce.h
>> +++ b/arch/powerpc/include/asm/mce.h
>> @@ -235,8 +235,10 @@ extern void machine_check_print_event_info(struct machine_check_event *evt,
>>   unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr);
>>   extern void mce_common_process_ue(struct pt_regs *regs,
>>   				  struct mce_error_info *mce_err);
>> +extern void machine_check_raise_dec_intr(void);
> Does this need an extern? I think that's the default...?

Not required, I will remove it.

>>   int mce_register_notifier(struct notifier_block *nb);
>>   int mce_unregister_notifier(struct notifier_block *nb);
>> +void mce_run_late_handlers(void);
>>   #ifdef CONFIG_PPC_BOOK3S_64
>>   void flush_and_reload_slb(void);
>>   void flush_erat(void);
>> diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
>> index dc05a862e72a..f49180f8c9be 100644
>> --- a/arch/powerpc/include/asm/paca.h
>> +++ b/arch/powerpc/include/asm/paca.h
>> @@ -280,6 +280,7 @@ struct paca_struct {
>>   #endif
>>   #ifdef CONFIG_PPC_BOOK3S_64
>>   	struct mce_info *mce_info;
>> +	atomic_t mces_to_process;
> This is in the PACA, which is supposed to be a per-cpu structure: hey
> does it need to be atomic_t? Isn't there only one CPU accessing it?

Yes it need not be atomic, got confused with some scenario and
made it atomic.

> Does this variable provide anything new compared to mce_nest_count or
> mce_queue_count + mce_ue_count? It looks like
> machine_check_process_queued_event will clear a queue based on value of
> mce_queue_count and machine_check_process_ue_event will clear a queue
> based on mce_ue_count...
>
> I think (absent nested interrupts which I talk about below) it should be
> the case that mces_to_process == mce_queue_count + mce_ue_count but I
> might be wrong?

If we hit exception in process context, we may not increment mce_queue_count,
so the equation need not be true all time.

>>   #endif /* CONFIG_PPC_BOOK3S_64 */
>>   } ____cacheline_aligned;
>    
>>   /*
>>    * Decode and save high level MCE information into per cpu buffer which
>>    * is an array of machine_check_event structure.
>> @@ -135,6 +131,8 @@ void save_mce_event(struct pt_regs *regs, long handled,
>>   	if (mce->error_type == MCE_ERROR_TYPE_UE)
>>   		mce->u.ue_error.ignore_event = mce_err->ignore_event;
>>   
>> +	atomic_inc(&local_paca->mces_to_process);
>> +
> Is there any chance the decrementer will fire between when you do this
> atomic_inc() and when you finish adding all the information to the mce
> data structure in the remainder of save_mce_event? (e.g. filling in the
> tlb_errror.effective_address field)?
>
> (Or does save_mce_event get called with interrupts masked? I find it
> very hard to remember what parts of the MCE code path happen under what
> circumstances!)

Yes, Interrupts will be disabled, I mean MSR[EE]=0 when mce is being handled.

>>   	if (!addr)
>>   		return;
>>   
>
>> @@ -338,7 +322,7 @@ static void machine_process_ue_event(struct work_struct *work)
>>    * process pending MCE event from the mce event queue. This function will be
>>    * called during syscall exit.
> Is this comment still accurate if this patch is applied?

No, mpe has also pointed this out, we will clean it in a different patch.

>
>>    */
>> -static void machine_check_process_queued_event(struct irq_work *work)
>> +static void machine_check_process_queued_event(void)
>>   {
>>   	int index;
>>   	struct machine_check_event *evt;
>> @@ -363,6 +347,17 @@ static void machine_check_process_queued_event(struct irq_work *work)
>>   	}
>>   }
>>   
>> +void mce_run_late_handlers(void)
>> +{
>> +	if (unlikely(atomic_read(&local_paca->mces_to_process))) {
>> +		if (ppc_md.machine_check_log_err)
>> +			ppc_md.machine_check_log_err();
>> +		machine_check_process_queued_event();
>> +		machine_check_ue_work();
>> +		atomic_dec(&local_paca->mces_to_process);
>> +	}
>> +}
> What happens if you get a nested MCE between log_err() and
> process_queued_event()? If my very foggy memory of the MCE handling is
> correct, we enable nested MCEs very early in the process because the
> alternative is that a nested MCE will checkstop the box. So I think this
> might be possible, albeit probably unlikely.
>
> It looks like process_queued_event clears the entire MCE queue as
> determined by the mce_queue_count. So, consider the following sequence
> of events:
>
> 1. Take MCE 1. Save to queue, increment mce_queue_count, increment
>     mces_to_process, set decrementer to fire.
>
> 2. Decrementer fires. mce_run_late_handlers is called.
>
> 3. mces_to_process = 1, so we call machine_check_log_err(), which prints
>     (on pseries) the info for MCE 1.
>
> 4. Take MCE 2. This is saved to the queue, mce_queue_count is
>     incremented, mces_to_process is incremented, and the decrementer is
>     armed again.
>
> 5. We then leave the MCE interrupt context and return to the decrementer
>     handling context. The next thing we do is we call
>     m_c_e_process_queued_event(), which clears the entire queue (that is,
>     MCEs 1 and 2):
>
> 	while (local_paca->mce_info->mce_queue_count > 0) {
> 		index = local_paca->mce_info->mce_queue_count - 1;
> 		evt = &local_paca->mce_info->mce_event_queue[index];
>
> 		if (evt->error_type == MCE_ERROR_TYPE_UE &&
> 		    evt->u.ue_error.ignore_event) {
> 			local_paca->mce_info->mce_queue_count--;
> 			continue;
> 		}
> 		machine_check_print_event_info(evt, false, false);
> 		local_paca->mce_info->mce_queue_count--;
> 	}
>
>   6. We finish mce_run_late_handlers() and decrement mces_to_process,
>      so it's now 1.
>
>   7. The decrementer fires again, mces_to_process is 1, so we start
>      processing again.
>
>   8. We call machine_check_log_err again, it will now call the FWNMI code
>      again and possibly print error 2.
>
>   9. process_queued_event will be called again but mce_queue_count will
>      be 0 so it it will bail out early.
>
> I _think_ the worst that can happen - at least so long as pseries is the
> only implementaion of machine_check_log_err - is that we will handle
> MCE 2 before we query the firmware about it. That's probably benign, but
> I am still concerned with the overall interaction around nested
> interrupts.

The only problem we have here is overwriting mce_data_buf in case of nested
mce, and about "handle MCE 2 before we query the firmware about it" It is not
possible, isn't it?

Assume we take MCE 2 while we are in the middle of mce_run_late_handlers(),
before the MCE handler relinquishes the CPU to timer handler, we will have
everything in place, right? or am I missing something obvious.

>>   void machine_check_print_event_info(struct machine_check_event *evt,
>>   				    bool user_mode, bool in_guest)
>>   {
>> diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
>> index 934d8ae66cc6..2dc09d75d77c 100644
>> --- a/arch/powerpc/kernel/time.c
>> +++ b/arch/powerpc/kernel/time.c
>> @@ -597,6 +597,9 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(timer_interrupt)
>>   		irq_work_run();
>>   	}
>>   
>> +#ifdef CONFIG_PPC_BOOK3S_64
>> +	mce_run_late_handlers();
>> +#endif
>>
> So we're now branching to a function in a different file and doing an
> atomic read in every timer interrupt. Is this a hot path? Is there any
> speed implication to doing this?

Nick has suggested me to use test_irq_work_pending() and I will remove the
atomic read, with v2 we may not have any serious time implications.

>>   	now = get_tb();
>>   	if (now >= *next_tb) {
>>   		*next_tb = ~(u64)0;
>
>> @@ -729,40 +724,16 @@ static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
>>   	error_type = mce_log->error_type;
>>   
>>   	disposition = mce_handle_err_realmode(disposition, error_type);
>> -
>> -	/*
>> -	 * Enable translation as we will be accessing per-cpu variables
>> -	 * in save_mce_event() which may fall outside RMO region, also
>> -	 * leave it enabled because subsequently we will be queuing work
>> -	 * to workqueues where again per-cpu variables accessed, besides
>> -	 * fwnmi_release_errinfo() crashes when called in realmode on
>> -	 * pseries.
>> -	 * Note: All the realmode handling like flushing SLB entries for
>> -	 *       SLB multihit is done by now.
>> -	 */
>>   out:
>> -	msr = mfmsr();
>> -	mtmsr(msr | MSR_IR | MSR_DR);
>> -
>>   	disposition = mce_handle_err_virtmode(regs, errp, mce_log,
>>   					      disposition);
> Now you are not in virtual mode/translations on when you are calling
> mce_handle_err_virtmode(). From the name, I thought that
> mce_handle_err_virtmode() would assume that you are in virtual mode?
> Does the function assume that? If so is it safe to call it in real mode?
> If not, should we rename it as part of this patch?

patch 2/2, refactors this.

>> -
>> -	/*
>> -	 * Queue irq work to log this rtas event later.
>> -	 * irq_work_queue uses per-cpu variables, so do this in virt
>> -	 * mode as well.
>> -	 */
>> -	irq_work_queue(&mce_errlog_process_work);
>> -
>> -	mtmsr(msr);
>> -
>>   	return disposition;
>>   }

Thanks for the review :) .
Ganesh

[-- Attachment #2: Type: text/html, Size: 12699 bytes --]

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 1/2] powerpc/mce: Avoid using irq_work_queue() in realmode
  2021-11-08 14:19 ` [PATCH 1/2] powerpc/mce: Avoid using irq_work_queue() in realmode Nicholas Piggin
@ 2021-11-18  9:51   ` Ganesh
  0 siblings, 0 replies; 6+ messages in thread
From: Ganesh @ 2021-11-18  9:51 UTC (permalink / raw)
  To: Nicholas Piggin, linuxppc-dev, mpe; +Cc: mahesh

[-- Attachment #1: Type: text/plain, Size: 11637 bytes --]

On 11/8/21 19:49, Nicholas Piggin wrote:

> Excerpts from Ganesh Goudar's message of November 8, 2021 6:38 pm:
>> In realmode mce handler we use irq_work_queue() to defer
>> the processing of mce events, irq_work_queue() can only
>> be called when translation is enabled because it touches
>> memory outside RMA, hence we enable translation before
>> calling irq_work_queue and disable on return, though it
>> is not safe to do in realmode.
>>
>> To avoid this, program the decrementer and call the event
>> processing functions from timer handler.
>>
>> Signed-off-by: Ganesh Goudar<ganeshgr@linux.ibm.com>
>> ---
>>   arch/powerpc/include/asm/machdep.h       |  2 +
>>   arch/powerpc/include/asm/mce.h           |  2 +
>>   arch/powerpc/include/asm/paca.h          |  1 +
>>   arch/powerpc/kernel/mce.c                | 51 +++++++++++-------------
>>   arch/powerpc/kernel/time.c               |  3 ++
>>   arch/powerpc/platforms/pseries/pseries.h |  1 +
>>   arch/powerpc/platforms/pseries/ras.c     | 31 +-------------
>>   arch/powerpc/platforms/pseries/setup.c   |  1 +
>>   8 files changed, 34 insertions(+), 58 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
>> index 764f2732a821..c89cc03c0f97 100644
>> --- a/arch/powerpc/include/asm/machdep.h
>> +++ b/arch/powerpc/include/asm/machdep.h
>> @@ -103,6 +103,8 @@ struct machdep_calls {
>>   	/* Called during machine check exception to retrive fixup address. */
>>   	bool		(*mce_check_early_recovery)(struct pt_regs *regs);
>>   
>> +	void            (*machine_check_log_err)(void);
>> +
>>   	/* Motherboard/chipset features. This is a kind of general purpose
>>   	 * hook used to control some machine specific features (like reset
>>   	 * lines, chip power control, etc...).
>> diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
>> index 331d944280b8..187810f13669 100644
>> --- a/arch/powerpc/include/asm/mce.h
>> +++ b/arch/powerpc/include/asm/mce.h
>> @@ -235,8 +235,10 @@ extern void machine_check_print_event_info(struct machine_check_event *evt,
>>   unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr);
>>   extern void mce_common_process_ue(struct pt_regs *regs,
>>   				  struct mce_error_info *mce_err);
>> +extern void machine_check_raise_dec_intr(void);
> No new externs on function declarations, they tell me.

ok.

>>   int mce_register_notifier(struct notifier_block *nb);
>>   int mce_unregister_notifier(struct notifier_block *nb);
>> +void mce_run_late_handlers(void);
>>   #ifdef CONFIG_PPC_BOOK3S_64
>>   void flush_and_reload_slb(void);
>>   void flush_erat(void);
>> diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
>> index dc05a862e72a..f49180f8c9be 100644
>> --- a/arch/powerpc/include/asm/paca.h
>> +++ b/arch/powerpc/include/asm/paca.h
>> @@ -280,6 +280,7 @@ struct paca_struct {
>>   #endif
>>   #ifdef CONFIG_PPC_BOOK3S_64
>>   	struct mce_info *mce_info;
>> +	atomic_t mces_to_process;
>>   #endif /* CONFIG_PPC_BOOK3S_64 */
>>   } ____cacheline_aligned;
>>   
>> diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
>> index fd829f7f25a4..45baa062ebc0 100644
>> --- a/arch/powerpc/kernel/mce.c
>> +++ b/arch/powerpc/kernel/mce.c
>> @@ -28,19 +28,9 @@
>>   
>>   #include "setup.h"
>>   
>> -static void machine_check_process_queued_event(struct irq_work *work);
>> -static void machine_check_ue_irq_work(struct irq_work *work);
>>   static void machine_check_ue_event(struct machine_check_event *evt);
>>   static void machine_process_ue_event(struct work_struct *work);
>>   
>> -static struct irq_work mce_event_process_work = {
>> -        .func = machine_check_process_queued_event,
>> -};
>> -
>> -static struct irq_work mce_ue_event_irq_work = {
>> -	.func = machine_check_ue_irq_work,
>> -};
>> -
>>   static DECLARE_WORK(mce_ue_event_work, machine_process_ue_event);
>>   
>>   static BLOCKING_NOTIFIER_HEAD(mce_notifier_list);
>> @@ -89,6 +79,12 @@ static void mce_set_error_info(struct machine_check_event *mce,
>>   	}
>>   }
>>   
>> +/* Raise decrementer interrupt */
>> +void machine_check_raise_dec_intr(void)
>> +{
>> +	set_dec(1);
>> +}
> The problem here is a timer can be scheduled (e.g., by an external
> interrupt if it gets taken before the decrementer, then uses a
> timer) and that set decr > 1. See logic in decrementer_set_next_event.
>
> I _think_ the way to get around this would be to have the machine check
> just use arch_irq_work_raise.
>
> Then you could also only call the mce handler inside the
> test_irq_work_pending() check and avoid the added function call on every
> timer. That test should also be marked unlikely come to think of it, but
> that's a side patchlet.

Sure, I will use arch_irq_work_raise() and test_irq_work_pending().

>
>> +
>>   /*
>>    * Decode and save high level MCE information into per cpu buffer which
>>    * is an array of machine_check_event structure.
>> @@ -135,6 +131,8 @@ void save_mce_event(struct pt_regs *regs, long handled,
>>   	if (mce->error_type == MCE_ERROR_TYPE_UE)
>>   		mce->u.ue_error.ignore_event = mce_err->ignore_event;
>>   
>> +	atomic_inc(&local_paca->mces_to_process);
>> +
>>   	if (!addr)
>>   		return;
>>   
>> @@ -217,7 +215,7 @@ void release_mce_event(void)
>>   	get_mce_event(NULL, true);
>>   }
>>   
>> -static void machine_check_ue_irq_work(struct irq_work *work)
>> +static void machine_check_ue_work(void)
>>   {
>>   	schedule_work(&mce_ue_event_work);
>>   }
>> @@ -239,7 +237,7 @@ static void machine_check_ue_event(struct machine_check_event *evt)
>>   	       evt, sizeof(*evt));
>>   
>>   	/* Queue work to process this event later. */
>> -	irq_work_queue(&mce_ue_event_irq_work);
>> +	machine_check_raise_dec_intr();
>>   }
>>   
>>   /*
>> @@ -249,7 +247,6 @@ void machine_check_queue_event(void)
>>   {
>>   	int index;
>>   	struct machine_check_event evt;
>> -	unsigned long msr;
>>   
>>   	if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
>>   		return;
>> @@ -263,20 +260,7 @@ void machine_check_queue_event(void)
>>   	memcpy(&local_paca->mce_info->mce_event_queue[index],
>>   	       &evt, sizeof(evt));
>>   
>> -	/*
>> -	 * Queue irq work to process this event later. Before
>> -	 * queuing the work enable translation for non radix LPAR,
>> -	 * as irq_work_queue may try to access memory outside RMO
>> -	 * region.
>> -	 */
>> -	if (!radix_enabled() && firmware_has_feature(FW_FEATURE_LPAR)) {
>> -		msr = mfmsr();
>> -		mtmsr(msr | MSR_IR | MSR_DR);
>> -		irq_work_queue(&mce_event_process_work);
>> -		mtmsr(msr);
>> -	} else {
>> -		irq_work_queue(&mce_event_process_work);
>> -	}
> Getting rid of these things would be very nice.
>
>> +	machine_check_raise_dec_intr();
>>   }
>>   
>>   void mce_common_process_ue(struct pt_regs *regs,
>> @@ -338,7 +322,7 @@ static void machine_process_ue_event(struct work_struct *work)
>>    * process pending MCE event from the mce event queue. This function will be
>>    * called during syscall exit.
>>    */
>> -static void machine_check_process_queued_event(struct irq_work *work)
>> +static void machine_check_process_queued_event(void)
>>   {
>>   	int index;
>>   	struct machine_check_event *evt;
>> @@ -363,6 +347,17 @@ static void machine_check_process_queued_event(struct irq_work *work)
>>   	}
>>   }
>>   
>> +void mce_run_late_handlers(void)
>> +{
>> +	if (unlikely(atomic_read(&local_paca->mces_to_process))) {
>> +		if (ppc_md.machine_check_log_err)
>> +			ppc_md.machine_check_log_err();
>> +		machine_check_process_queued_event();
>> +		machine_check_ue_work();
>> +		atomic_dec(&local_paca->mces_to_process);
>> +	}
>> +}
>> +
>>   void machine_check_print_event_info(struct machine_check_event *evt,
>>   				    bool user_mode, bool in_guest)
>>   {
>> diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
>> index 934d8ae66cc6..2dc09d75d77c 100644
>> --- a/arch/powerpc/kernel/time.c
>> +++ b/arch/powerpc/kernel/time.c
>> @@ -597,6 +597,9 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(timer_interrupt)
>>   		irq_work_run();
>>   	}
>>   
>> +#ifdef CONFIG_PPC_BOOK3S_64
>> +	mce_run_late_handlers();
>> +#endif
> It looks like if (IS_ENABLED(CONFIG_PPC_BOOK3S_64)) should work here?

sure.

>>   	now = get_tb();
>>   	if (now >= *next_tb) {
>>   		*next_tb = ~(u64)0;
>> diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
>> index 3544778e06d0..0dc4f1027b30 100644
>> --- a/arch/powerpc/platforms/pseries/pseries.h
>> +++ b/arch/powerpc/platforms/pseries/pseries.h
>> @@ -21,6 +21,7 @@ struct pt_regs;
>>   extern int pSeries_system_reset_exception(struct pt_regs *regs);
>>   extern int pSeries_machine_check_exception(struct pt_regs *regs);
>>   extern long pseries_machine_check_realmode(struct pt_regs *regs);
>> +extern void pSeries_machine_check_log_err(void);
> extern can be removed.

sure, thanks.

> Thanks,
> Nick
>
>>   
>>   #ifdef CONFIG_SMP
>>   extern void smp_init_pseries(void);
>> diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
>> index 56092dccfdb8..8613f9cc5798 100644
>> --- a/arch/powerpc/platforms/pseries/ras.c
>> +++ b/arch/powerpc/platforms/pseries/ras.c
>> @@ -23,11 +23,6 @@ static DEFINE_SPINLOCK(ras_log_buf_lock);
>>   
>>   static int ras_check_exception_token;
>>   
>> -static void mce_process_errlog_event(struct irq_work *work);
>> -static struct irq_work mce_errlog_process_work = {
>> -	.func = mce_process_errlog_event,
>> -};
>> -
>>   #define EPOW_SENSOR_TOKEN	9
>>   #define EPOW_SENSOR_INDEX	0
>>   
>> @@ -729,40 +724,16 @@ static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
>>   	error_type = mce_log->error_type;
>>   
>>   	disposition = mce_handle_err_realmode(disposition, error_type);
>> -
>> -	/*
>> -	 * Enable translation as we will be accessing per-cpu variables
>> -	 * in save_mce_event() which may fall outside RMO region, also
>> -	 * leave it enabled because subsequently we will be queuing work
>> -	 * to workqueues where again per-cpu variables accessed, besides
>> -	 * fwnmi_release_errinfo() crashes when called in realmode on
>> -	 * pseries.
>> -	 * Note: All the realmode handling like flushing SLB entries for
>> -	 *       SLB multihit is done by now.
>> -	 */
>>   out:
>> -	msr = mfmsr();
>> -	mtmsr(msr | MSR_IR | MSR_DR);
>> -
>>   	disposition = mce_handle_err_virtmode(regs, errp, mce_log,
>>   					      disposition);
>> -
>> -	/*
>> -	 * Queue irq work to log this rtas event later.
>> -	 * irq_work_queue uses per-cpu variables, so do this in virt
>> -	 * mode as well.
>> -	 */
>> -	irq_work_queue(&mce_errlog_process_work);
>> -
>> -	mtmsr(msr);
>> -
>>   	return disposition;
>>   }
>>   
>>   /*
>>    * Process MCE rtas errlog event.
>>    */
>> -static void mce_process_errlog_event(struct irq_work *work)
>> +void pSeries_machine_check_log_err(void)
>>   {
>>   	struct rtas_error_log *err;
>>   
>> diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
>> index f79126f16258..54bd7bdb7e92 100644
>> --- a/arch/powerpc/platforms/pseries/setup.c
>> +++ b/arch/powerpc/platforms/pseries/setup.c
>> @@ -1085,6 +1085,7 @@ define_machine(pseries) {
>>   	.system_reset_exception = pSeries_system_reset_exception,
>>   	.machine_check_early	= pseries_machine_check_realmode,
>>   	.machine_check_exception = pSeries_machine_check_exception,
>> +	.machine_check_log_err	= pSeries_machine_check_log_err,
>>   #ifdef CONFIG_KEXEC_CORE
>>   	.machine_kexec          = pSeries_machine_kexec,
>>   	.kexec_cpu_down         = pseries_kexec_cpu_down,
>> -- 
>> 2.26.2
>>
>>

[-- Attachment #2: Type: text/html, Size: 12603 bytes --]

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2021-11-18 10:42 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-11-08  8:38 [PATCH 1/2] powerpc/mce: Avoid using irq_work_queue() in realmode Ganesh Goudar
2021-11-08  8:38 ` [PATCH 2/2] pseries/mce: Refactor the pseries mce handling code Ganesh Goudar
2021-11-08 14:19 ` [PATCH 1/2] powerpc/mce: Avoid using irq_work_queue() in realmode Nicholas Piggin
2021-11-18  9:51   ` Ganesh
2021-11-12  7:12 ` Daniel Axtens
2021-11-18  9:44   ` Ganesh

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).