linuxppc-dev.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed
* [RFC PATCH 0/2] NMI IPI work in progress for Linux and OPAL
@ 2017-09-12 16:05 Nicholas Piggin
  2017-09-12 16:05 ` [RFC PATCH 1/2] core: implement OPAL_SIGNAL_SYSTEM_RESET with POWER9 scoms Nicholas Piggin
  2017-09-12 16:05 ` [RFC PATCH 2/2] powerpc/powernv: implement NMI IPIs with OPAL_SIGNAL_SYSTEM_RESET Nicholas Piggin
  0 siblings, 2 replies; 11+ messages in thread
From: Nicholas Piggin @ 2017-09-12 16:05 UTC (permalink / raw)
  To: linuxppc-dev, skiboot
  Cc: Nicholas Piggin, Benjamin Herrenschmidt, Alistair Popple

Hi,

I have Linux and OPAL patches that make the NMI IPI facility work on
a POWER9 DD1 here, lightly tested. It works for threads that are running
and are in stop (at least the stop0_lite state enabled in DD1).

Comments on the OPAL patch in particular would be good. Next step will
be more testing with DD2 and different idle states.

This was taken mostly from pdbg but I have ended up changing a few
things (e.g., to make it work in idle state).

Thanks,
Nick

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [RFC PATCH 1/2] core: implement OPAL_SIGNAL_SYSTEM_RESET with POWER9 scoms
  2017-09-12 16:05 [RFC PATCH 0/2] NMI IPI work in progress for Linux and OPAL Nicholas Piggin
@ 2017-09-12 16:05 ` Nicholas Piggin
  2017-09-12 23:18   ` Benjamin Herrenschmidt
  2017-09-12 16:05 ` [RFC PATCH 2/2] powerpc/powernv: implement NMI IPIs with OPAL_SIGNAL_SYSTEM_RESET Nicholas Piggin
  1 sibling, 1 reply; 11+ messages in thread
From: Nicholas Piggin @ 2017-09-12 16:05 UTC (permalink / raw)
  To: linuxppc-dev, skiboot
  Cc: Nicholas Piggin, Benjamin Herrenschmidt, Alistair Popple

This implements a way to raise system reset interrupts on other
cores. This has not yet been tested on DD2 or with deeper sleep
states.
---
 core/Makefile.inc       |   1 +
 core/sreset.c           | 237 ++++++++++++++++++++++++++++++++++++++++++++++++
 hw/xscom.c              |   2 +
 include/skiboot.h       |   3 +
 platforms/mambo/mambo.c |   3 +-
 5 files changed, 245 insertions(+), 1 deletion(-)
 create mode 100644 core/sreset.c

diff --git a/core/Makefile.inc b/core/Makefile.inc
index f2de2f64..16204978 100644
--- a/core/Makefile.inc
+++ b/core/Makefile.inc
@@ -9,6 +9,7 @@ CORE_OBJS += vpd.o hostservices.o platform.o nvram.o nvram-format.o hmi.o
 CORE_OBJS += console-log.o ipmi.o time-utils.o pel.o pool.o errorlog.o
 CORE_OBJS += timer.o i2c.o rtc.o flash.o sensor.o ipmi-opal.o
 CORE_OBJS += flash-subpartition.o bitmap.o buddy.o pci-quirk.o powercap.o psr.o
+CORE_OBJS += sreset.o
 
 ifeq ($(SKIBOOT_GCOV),1)
 CORE_OBJS += gcov-profiling.o
diff --git a/core/sreset.c b/core/sreset.c
new file mode 100644
index 00000000..ff20fe71
--- /dev/null
+++ b/core/sreset.c
@@ -0,0 +1,237 @@
+/* Copyright 2017 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * 	http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <skiboot.h>
+#include <cpu.h>
+#include <fsp.h>
+#include <psi.h>
+#include <opal.h>
+#include <xscom.h>
+#include <interrupts.h>
+#include <cec.h>
+#include <timebase.h>
+#include <pci.h>
+#include <chip.h>
+#include <chiptod.h>
+#include <ipmi.h>
+
+#define P9_RAS_STATUS			0x10a02
+#define P9_RSTAT_QUIESCED(t)		PPC_BITMASK(0 + 8*(t), 3 + 8*(t))
+#define P9_RAS_MODEREG			0x10a9d
+#define P9_DIRECT_CONTROLS		0x10a9c
+#define P9_DCTL_STOP(t)			PPC_BIT(7 + 8*(t))
+#define P9_DCTL_CONT(t)			PPC_BIT(6 + 8*(t))
+#define P9_DCTL_SRESET(t)		PPC_BIT(4 + 8*(t))
+#define P9_DCTL_PWR(t)			PPC_BIT(32 + 8*(t))
+
+#define P9_CORE_THREAD_STATE		0x10ab3
+#define P9_CTS_STOP(t)			PPC_BIT(56 + (t))
+
+#define PPM_GPMMR			0xf0100
+#define PPM_SPWKUP_OTR			0xf010a
+#define SPECIAL_WKUP_DONE		PPC_BIT(1)
+
+/* Request special wakeup for cpu's core; returns 0 once GPMMR reports it done, else OPAL_HARDWARE. */
+static int core_set_special_wakeup(struct cpu_thread *cpu)
+{
+	uint32_t chip_id = pir_to_chip_id(cpu->pir);
+	uint32_t core_id = pir_to_core_id(cpu->pir);
+	uint32_t swake_addr;
+	uint32_t gpmmr_addr;
+	uint64_t val;
+	int i;
+
+	swake_addr = XSCOM_ADDR_P9_EC(core_id, PPM_SPWKUP_OTR);
+	gpmmr_addr = XSCOM_ADDR_P9_EC(core_id, PPM_GPMMR);
+	/* NOTE(review): the SPWKUP_OTR reads discard val - presumably done for ordering; confirm. */
+	xscom_read(chip_id, swake_addr, &val);
+	if (xscom_write(chip_id, swake_addr, PPC_BIT(0))) {
+		prlog(PR_WARNING, "SRESET: Unable to write SPWKUP_OTR register\n");
+		return OPAL_HARDWARE;
+	}
+	xscom_read(chip_id, swake_addr, &val);
+	/* Poll GPMMR for SPECIAL_WKUP_DONE: at most 100 reads, 1us apart. */
+	for (i = 0; i < 100; i++) {
+		if (xscom_read(chip_id, gpmmr_addr, &val)) {
+			prlog(PR_WARNING, "SRESET: Unable to read GPMMR register\n");
+			return OPAL_HARDWARE;
+		}
+		if (val & SPECIAL_WKUP_DONE)
+			return 0;
+
+		time_wait_us(1);
+	}
+	/* Timed out: write 0 back to SPWKUP_OTR before reporting failure. */
+	xscom_read(chip_id, swake_addr, &val);
+	xscom_write(chip_id, swake_addr, 0);
+	xscom_read(chip_id, swake_addr, &val);
+
+	prlog(PR_WARNING, "SRESET: Special wakeup mode could not be set.\n");
+	return OPAL_HARDWARE;
+}
+/* Withdraw the special wakeup request for cpu's core by writing 0 to SPWKUP_OTR; xscom errors are ignored. */
+static void core_clear_special_wakeup(struct cpu_thread *cpu)
+{
+	uint32_t chip_id = pir_to_chip_id(cpu->pir);
+	uint32_t core_id = pir_to_core_id(cpu->pir);
+	uint32_t swake_addr;
+	uint64_t val;
+
+	swake_addr = XSCOM_ADDR_P9_EC(core_id, PPM_SPWKUP_OTR);
+
+	/* De-assert special wakeup bit */
+	xscom_read(chip_id, swake_addr, &val);
+	xscom_write(chip_id, swake_addr, 0);
+	xscom_read(chip_id, swake_addr, &val);
+}
+/* Returns 1 if all four RAS_STATUS quiesce bits of cpu's thread are set, 0 if not, OPAL_HARDWARE if unreadable. */
+static int thread_quiesced(struct cpu_thread *cpu)
+{
+	uint32_t chip_id = pir_to_chip_id(cpu->pir);
+	uint32_t core_id = pir_to_core_id(cpu->pir);
+	uint32_t thread_id = pir_to_thread_id(cpu->pir);
+	uint32_t ras_addr;
+	uint64_t ras_status;
+
+	ras_addr = XSCOM_ADDR_P9_EC(core_id, P9_RAS_STATUS);
+	if (xscom_read(chip_id, ras_addr, &ras_status)) {
+		prlog(PR_WARNING, "SRESET: Unable to read status register\n");
+		return OPAL_HARDWARE;
+	}
+
+	if ((ras_status & P9_RSTAT_QUIESCED(thread_id))
+		   	== P9_RSTAT_QUIESCED(thread_id))
+		return 1;
+
+	return 0;
+}
+/* Write the STOP direct control for cpu's thread and poll for quiesce; returns 0, or OPAL_HARDWARE after undoing the stop. */
+static int stop_thread(struct cpu_thread *cpu)
+{
+	uint32_t chip_id = pir_to_chip_id(cpu->pir);
+	uint32_t core_id = pir_to_core_id(cpu->pir);
+	uint32_t thread_id = pir_to_thread_id(cpu->pir);
+	uint32_t dctl_addr;
+	int i;
+
+	dctl_addr = XSCOM_ADDR_P9_EC(core_id, P9_DIRECT_CONTROLS);
+
+	xscom_write(chip_id, dctl_addr, P9_DCTL_STOP(thread_id));
+	/* NOTE(review): the STOP write result is unchecked and the 100 polls below run without any delay. */
+	for (i = 0; i < 100; i++) {
+		int rc = thread_quiesced(cpu);
+		if (rc < 0)
+			break;
+		if (rc)
+			return 0;
+	}
+	/* Not quiesced, or status unreadable: write the CONT control to undo the stop request. */
+	xscom_write(chip_id, dctl_addr, P9_DCTL_CONT(thread_id));
+	prlog(PR_WARNING, "SRESET: Could not quiesce thread\n");
+	return OPAL_HARDWARE;
+}
+/* Write the SRESET direct control for cpu's thread; returns 0, or OPAL_HARDWARE if a checked xscom access fails. */
+static int sreset_thread(struct cpu_thread *cpu)
+{
+	uint32_t chip_id = pir_to_chip_id(cpu->pir);
+	uint32_t core_id = pir_to_core_id(cpu->pir);
+	uint32_t thread_id = pir_to_thread_id(cpu->pir);
+	uint32_t dctl_addr;
+	uint32_t cts_addr;
+	uint64_t cts_val;
+
+	dctl_addr = XSCOM_ADDR_P9_EC(core_id, P9_DIRECT_CONTROLS);
+	cts_addr = XSCOM_ADDR_P9_EC(core_id, P9_CORE_THREAD_STATE);
+	/* When CORE_THREAD_STATE does not flag this thread as in stop, the PWR control is written first. */
+	if (xscom_read(chip_id, cts_addr, &cts_val)) {
+		prlog(PR_WARNING, "SRESET: Unable to read CORE_THREAD_STATE register\n");
+		return OPAL_HARDWARE;
+	}
+	if (!(cts_val & P9_CTS_STOP(thread_id))) {
+		/* Clear SRR1[46:47] */
+		if (xscom_write(chip_id, dctl_addr, P9_DCTL_PWR(thread_id))) {
+			prlog(PR_WARNING, "SRESET: Unable to set power saving mode\n");
+			return OPAL_HARDWARE;
+		}
+	}
+
+	if (xscom_write(chip_id, dctl_addr, P9_DCTL_SRESET(thread_id))) {
+		prlog(PR_WARNING, "SRESET: Unable to write DIRECT_CONTROLS register\n");
+		return OPAL_HARDWARE;
+	}
+
+	return 0;
+}
+
+// static struct lock sreset_lock = LOCK_UNLOCKED;
+
+static int64_t sreset_cpu(struct cpu_thread *cpu)
+{
+	int rc;
+
+	if (this_cpu() == cpu) {
+		prlog(PR_WARNING, "SRESET: Unable to reset self\n");
+		return OPAL_UNSUPPORTED;
+	}
+	if (this_cpu()->primary == cpu->primary) {
+		prlog(PR_WARNING, "SRESET: Unable to reset threads on same core\n");
+		return OPAL_PARTIAL;
+	}
+
+	rc = thread_quiesced(cpu);
+	if (rc < 0)
+		return rc;
+	if (rc) {
+		prlog(PR_WARNING, "SRESET: Thread is quiesced already\n");
+		return OPAL_WRONG_STATE;
+	}
+
+	rc = core_set_special_wakeup(cpu);
+	if (rc)
+		return rc;
+
+	rc = stop_thread(cpu);
+	if (rc) {
+		core_clear_special_wakeup(cpu);
+		return rc;
+	}
+
+	rc = sreset_thread(cpu);
+
+	core_clear_special_wakeup(cpu);
+
+	return 0;
+}
+/* OPAL call body: POWER9 only. cpu_nr >= 0 resets that one server; any other value returns OPAL_PARTIAL. */
+int64_t signal_system_reset(int cpu_nr)
+{
+	struct cpu_thread *cpu;
+
+	if (proc_gen != proc_gen_p9)
+		return OPAL_UNSUPPORTED;
+
+	/* Reset a single CPU */
+	if (cpu_nr >= 0) {
+		cpu = find_cpu_by_server(cpu_nr);
+		if (!cpu) {
+			printf("SRESET: could not find cpu by server %d\n", cpu_nr);
+			return OPAL_PARAMETER;
+		}
+		return sreset_cpu(cpu);
+	}
+	printf("SRESET: unsupported %d\n", cpu_nr);
+	return OPAL_PARTIAL;
+}
diff --git a/hw/xscom.c b/hw/xscom.c
index 7bd78bf9..f3e04291 100644
--- a/hw/xscom.c
+++ b/hw/xscom.c
@@ -705,6 +705,8 @@ static void xscom_init_chip_info(struct proc_chip *chip)
 		printf("P9 DD%i.%i%d detected\n", 0xf & (chip->ec_level >> 4),
 		       chip->ec_level & 0xf, rev);
 		chip->ec_rev = rev;
+
+		opal_register(OPAL_SIGNAL_SYSTEM_RESET, signal_system_reset, 1);
 	}
 }
 
diff --git a/include/skiboot.h b/include/skiboot.h
index 4b7d5197..37fd774f 100644
--- a/include/skiboot.h
+++ b/include/skiboot.h
@@ -198,6 +198,9 @@ extern char __sym_map_end[];
 extern unsigned long get_symbol(unsigned long addr,
 				char **sym, char **sym_end);
 
+/* System reset */
+extern int64_t signal_system_reset(int cpu_nr);
+
 /* Fast reboot support */
 extern void disable_fast_reboot(const char *reason);
 extern void fast_reboot(void);
diff --git a/platforms/mambo/mambo.c b/platforms/mambo/mambo.c
index cb6e103c..e306ba5c 100644
--- a/platforms/mambo/mambo.c
+++ b/platforms/mambo/mambo.c
@@ -259,7 +259,8 @@ static int64_t mambo_signal_system_reset(int32_t cpu_nr)
 
 static void mambo_sreset_init(void)
 {
-	opal_register(OPAL_SIGNAL_SYSTEM_RESET, mambo_signal_system_reset, 1);
+	if (0)
+		opal_register(OPAL_SIGNAL_SYSTEM_RESET, mambo_signal_system_reset, 1);
 }
 
 static void mambo_platform_init(void)
-- 
2.13.3

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [RFC PATCH 2/2] powerpc/powernv: implement NMI IPIs with OPAL_SIGNAL_SYSTEM_RESET
  2017-09-12 16:05 [RFC PATCH 0/2] NMI IPI work in progress for Linux and OPAL Nicholas Piggin
  2017-09-12 16:05 ` [RFC PATCH 1/2] core: implement OPAL_SIGNAL_SYSTEM_RESET with POWER9 scoms Nicholas Piggin
@ 2017-09-12 16:05 ` Nicholas Piggin
  2017-09-13 13:13   ` Nicholas Piggin
  2017-09-14 11:26   ` Nicholas Piggin
  1 sibling, 2 replies; 11+ messages in thread
From: Nicholas Piggin @ 2017-09-12 16:05 UTC (permalink / raw)
  To: linuxppc-dev, skiboot
  Cc: Nicholas Piggin, Benjamin Herrenschmidt, Alistair Popple

There are two complications. The first is that sresets from stop states
come in with SRR1 set to do a powersave wakeup, with an sreset reason
encoded.

The second is that threads on the same core can't be signalled directly
so we must designate a bounce CPU to reflect the IPI back.
---
 arch/powerpc/include/asm/opal-api.h            |   1 +
 arch/powerpc/include/asm/opal.h                |   2 +
 arch/powerpc/kernel/irq.c                      |  13 +++
 arch/powerpc/platforms/powernv/opal-wrappers.S |   1 +
 arch/powerpc/platforms/powernv/powernv.h       |   1 +
 arch/powerpc/platforms/powernv/setup.c         |   3 +
 arch/powerpc/platforms/powernv/smp.c           | 111 +++++++++++++++++++++++++
 7 files changed, 132 insertions(+)

diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h
index 450a60b81d2a..bd9d1f2b3584 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -188,6 +188,7 @@
 #define OPAL_XIVE_DUMP				142
 #define OPAL_XIVE_RESERVED3			143
 #define OPAL_XIVE_RESERVED4			144
+#define OPAL_SIGNAL_SYSTEM_RESET 		145
 #define OPAL_NPU_INIT_CONTEXT			146
 #define OPAL_NPU_DESTROY_CONTEXT		147
 #define OPAL_NPU_MAP_LPAR			148
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 726c23304a57..7d7613c49f2b 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -281,6 +281,8 @@ int opal_get_power_shift_ratio(u32 handle, int token, u32 *psr);
 int opal_set_power_shift_ratio(u32 handle, int token, u32 psr);
 int opal_sensor_group_clear(u32 group_hndl, int token);
 
+int64_t opal_signal_system_reset(int32_t cpu);
+
 /* Internal functions */
 extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
 				   int depth, void *data);
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 4e65bf82f5e0..3276e05cb53f 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -407,10 +407,23 @@ static const u8 srr1_to_lazyirq[0x10] = {
 	PACA_IRQ_HMI,
 	0, 0, 0, 0, 0 };
 
+static noinline void system_reset(void)
+{
+	struct pt_regs regs;
+	ppc_save_regs(&regs);
+	/* Run the system reset handler on the registers captured here, with paca->in_nmi set for the call. */
+	get_paca()->in_nmi = 1;
+	system_reset_exception(&regs);
+	get_paca()->in_nmi = 0;
+}
+
 void irq_set_pending_from_srr1(unsigned long srr1)
 {
 	unsigned int idx = (srr1 & SRR1_WAKEMASK_P8) >> 18;
 
+	if (unlikely(idx == 2 || idx == 4))
+		system_reset();
+
 	/*
 	 * The 0 index (SRR1[42:45]=b0000) must always evaluate to 0,
 	 * so this can be called unconditionally with srr1 wake reason.
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index 8c1ede2d3f7e..37cd170201a2 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -307,6 +307,7 @@ OPAL_CALL(opal_xive_get_vp_info,		OPAL_XIVE_GET_VP_INFO);
 OPAL_CALL(opal_xive_set_vp_info,		OPAL_XIVE_SET_VP_INFO);
 OPAL_CALL(opal_xive_sync,			OPAL_XIVE_SYNC);
 OPAL_CALL(opal_xive_dump,			OPAL_XIVE_DUMP);
+OPAL_CALL(opal_signal_system_reset,		OPAL_SIGNAL_SYSTEM_RESET);
 OPAL_CALL(opal_npu_init_context,		OPAL_NPU_INIT_CONTEXT);
 OPAL_CALL(opal_npu_destroy_context,		OPAL_NPU_DESTROY_CONTEXT);
 OPAL_CALL(opal_npu_map_lpar,			OPAL_NPU_MAP_LPAR);
diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h
index a159d48573d7..49add2037e0d 100644
--- a/arch/powerpc/platforms/powernv/powernv.h
+++ b/arch/powerpc/platforms/powernv/powernv.h
@@ -3,6 +3,7 @@
 
 #ifdef CONFIG_SMP
 extern void pnv_smp_init(void);
+extern int pnv_system_reset_exception(struct pt_regs *regs);
 #else
 static inline void pnv_smp_init(void) { }
 #endif
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index 897aa1400eb8..4fdaa1d7c4cd 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -282,6 +282,9 @@ static void __init pnv_setup_machdep_opal(void)
 	ppc_md.restart = pnv_restart;
 	pm_power_off = pnv_power_off;
 	ppc_md.halt = pnv_halt;
+#ifdef CONFIG_SMP
+	ppc_md.system_reset_exception = pnv_system_reset_exception;
+#endif
 	ppc_md.machine_check_exception = opal_machine_check;
 	ppc_md.mce_check_early_recovery = opal_mce_check_early_recovery;
 	ppc_md.hmi_exception_early = opal_hmi_exception_early;
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index c17f81e433f7..45b1c191e3c8 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -290,6 +290,112 @@ static void __init pnv_smp_probe(void)
 	}
 }
 
+static int nmi_ipi_bounce_cpu;
+static int nmi_ipi_bounce_cpu_done;
+static int nmi_ipi_bounce_target_core;
+static int nmi_ipi_bounce_target_exclude;
+/* System reset hook: the designated bounce CPU first relays the reset to the target core's online threads. */
+int pnv_system_reset_exception(struct pt_regs *regs)
+{
+	smp_mb();
+	if (nmi_ipi_bounce_cpu == smp_processor_id()) {
+		int64_t rc;
+		int c;
+
+		nmi_ipi_bounce_cpu = -1;
+		smp_mb();
+		for_each_online_cpu(c) {
+			if (!cpumask_test_cpu(c, cpu_sibling_mask(nmi_ipi_bounce_target_core)))
+				continue;
+			if (c == nmi_ipi_bounce_target_exclude)
+				continue;
+			rc = opal_signal_system_reset(get_hard_smp_processor_id(c));
+			if (rc != OPAL_SUCCESS) {
+				nmi_ipi_bounce_cpu_done = -1;
+				return 1;
+			}
+		}
+		nmi_ipi_bounce_cpu_done = 1;
+	}
+	/* Unless a relay failed above, the reset is then offered to smp_handle_nmi_ipi(). */
+	if (smp_handle_nmi_ipi(regs))
+		return 1;
+	return 0;
+}
+/* Send an NMI IPI through OPAL. Returns 1 on success, 0 on failure; cpu < 0 takes the bounce path below. */
+static int pnv_cause_nmi_ipi(int cpu)
+{
+	int64_t rc;
+
+	if (cpu >= 0) {
+		rc = opal_signal_system_reset(get_hard_smp_processor_id(cpu));
+		if (rc == OPAL_SUCCESS)
+			return 1;
+		return 0;
+	} else {
+		/*
+		 * Test bounce behavior with broadcast IPI.
+		 */
+		rc = OPAL_PARTIAL;
+	}
+	if (rc == OPAL_PARTIAL) {
+		int c;
+
+		/*
+		 * Some platforms can not send NMI to sibling threads in
+		 * the same core. We can designate one inter-core target
+		 * to bounce NMIs back to our sibling threads.
+		 */
+		/* NOTE(review): the cpu >= 0 branch below is unreachable - the unicast case has already returned. */
+		if (cpu >= 0) {
+			/*
+			 * Don't support bouncing unicast NMIs yet (because
+			 * that would have to raise an NMI on an unrelated
+			 * CPU). Revisit this if callers start using unicast.
+			 */
+			printk("CPU:%d pnv_cause_nmi_ipi can not bounce unicast IPIs!\n", smp_processor_id());
+			return 0;
+		}
+
+		nmi_ipi_bounce_cpu = -1;
+		nmi_ipi_bounce_cpu_done = 0;
+		nmi_ipi_bounce_target_core = -1;
+		nmi_ipi_bounce_target_exclude = -1;
+		/* First online CPU outside our core becomes the bouncer; the other such CPUs are signalled directly. */
+		for_each_online_cpu(c) {
+			if (cpumask_test_cpu(c, cpu_sibling_mask(smp_processor_id())))
+				continue;
+
+			if (nmi_ipi_bounce_cpu == -1) {
+				nmi_ipi_bounce_cpu = c;
+				nmi_ipi_bounce_target_core = smp_processor_id();
+				if (cpu == NMI_IPI_ALL_OTHERS)
+					nmi_ipi_bounce_target_exclude = smp_processor_id();
+				smp_mb();
+			} else {
+				rc = opal_signal_system_reset(get_hard_smp_processor_id(c));
+				if (rc != OPAL_SUCCESS)
+					return 0; /* NOTE(review): nmi_ipi_bounce_cpu stays set on this exit */
+			}
+		}
+
+		if (nmi_ipi_bounce_cpu == -1)
+			return 0; /* could not find a bouncer */
+
+		rc = opal_signal_system_reset(get_hard_smp_processor_id(nmi_ipi_bounce_cpu));
+		if (rc != OPAL_SUCCESS)
+			return 0;
+		/* Spin until pnv_system_reset_exception() on the bouncer stores 1 (relayed) or -1 (failed). */
+		while (!nmi_ipi_bounce_cpu_done)
+			cpu_relax();
+
+		if (nmi_ipi_bounce_cpu_done == 1)
+			return 1; /* bounce worked */
+	}
+
+	return 0;
+}
+
 static struct smp_ops_t pnv_smp_ops = {
 	.message_pass	= NULL, /* Use smp_muxed_ipi_message_pass */
 	.cause_ipi	= NULL,	/* Filled at runtime by pnv_smp_probe() */
@@ -308,6 +414,11 @@ static struct smp_ops_t pnv_smp_ops = {
 /* This is called very early during platform setup_arch */
 void __init pnv_smp_init(void)
 {
+	if (opal_check_token(OPAL_SIGNAL_SYSTEM_RESET)) {
+		printk("OPAL_SIGNAL_SYSTEM_RESET available\n");
+		pnv_smp_ops.cause_nmi_ipi = pnv_cause_nmi_ipi;
+	} else
+		printk("OPAL_SIGNAL_SYSTEM_RESET NOT available\n");
 	smp_ops = &pnv_smp_ops;
 
 #ifdef CONFIG_HOTPLUG_CPU
-- 
2.13.3

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH 1/2] core: implement OPAL_SIGNAL_SYSTEM_RESET with POWER9 scoms
  2017-09-12 16:05 ` [RFC PATCH 1/2] core: implement OPAL_SIGNAL_SYSTEM_RESET with POWER9 scoms Nicholas Piggin
@ 2017-09-12 23:18   ` Benjamin Herrenschmidt
  2017-09-13 13:27     ` Nicholas Piggin
  0 siblings, 1 reply; 11+ messages in thread
From: Benjamin Herrenschmidt @ 2017-09-12 23:18 UTC (permalink / raw)
  To: Nicholas Piggin, linuxppc-dev, skiboot; +Cc: Alistair Popple

On Wed, 2017-09-13 at 02:05 +1000, Nicholas Piggin wrote:
> This implements a way to raise system reset interrupts on other
> cores. This has not yet been tested on DD2 or with deeper sleep
> states.

Reminds me, we need to work around a bug with XSCOMs on P9.

PSCOMs to the core in the range 20010A80-20010AB8 (list below) can fail
occasionally with an error of 4 (PCB_ADDRESS_ERROR). We need to
(silently) retry up to 32 times.

> 0000000020010A80 EXP.EC.CC.PCC0.COMMON.SPR_COMMON.SCOMC
> 0000000020010A81 EXP.EC.CC.PCC0.COMMON.SPR_COMMON.SCOMD
> 0000000020010A82 EXP.EC.CC.PCC0.COMMON.SPR_COMMON.OCC_SCOMC
> 0000000020010A83 EXP.EC.CC.PCC0.COMMON.SPR_COMMON.OCC_SCOMD
> 0000000020010A84 EXP.EC.CC.PCC0.COMMON.SPR_COMMON.SPR_MODE
> 0000000020010A85 EXP.EC.CC.PCC0.COMMON.SPR_COMMON.CTRL
> 0000000020010A86 EXP.EC.CC.PCC0.COMMON.SPR_COMMON.SCR0
> 0000000020010A87 EXP.EC.CC.PCC0.COMMON.SPR_COMMON.SCR1
> 0000000020010A88 EXP.EC.CC.PCC0.COMMON.SPR_COMMON.SCR2
> 0000000020010A89 EXP.EC.CC.PCC0.COMMON.SPR_COMMON.SCR3
> 0000000020010A8E EXP.EC.CC.PCC0.COMMON.SPR_COMMON.V0_HMER
> 0000000020010A92 EXP.EC.CC.PCC0.COMMON.SPR_COMMON.V0_HMER
> 0000000020010A8F EXP.EC.CC.PCC0.COMMON.SPR_COMMON.V1_HMER
> 0000000020010A93 EXP.EC.CC.PCC0.COMMON.SPR_COMMON.V1_HMER
> 0000000020010A90 EXP.EC.CC.PCC0.COMMON.SPR_COMMON.V2_HMER
> 0000000020010A94 EXP.EC.CC.PCC0.COMMON.SPR_COMMON.V2_HMER
> 0000000020010A91 EXP.EC.CC.PCC0.COMMON.SPR_COMMON.V3_HMER
> 0000000020010A95 EXP.EC.CC.PCC0.COMMON.SPR_COMMON.V3_HMER
> 0000000020010A96 EXP.EC.CC.PCC0.COMMON.SPR_COMMON.HMEER
> 0000000020010A97 EXP.EC.CC.PCC0.COMMON.SPR_COMMON.SPATTN
> 0000000020010A98 EXP.EC.CC.PCC0.COMMON.SPR_COMMON.SPATTN
> 0000000020010A99 EXP.EC.CC.PCC0.COMMON.SPR_COMMON.SPATTN
> 0000000020010A9A EXP.EC.CC.PCC0.COMMON.SPR_COMMON.SPATTN_MASK
> 0000000020010A9B EXP.EC.CC.PCC0.PMC.THREAD_INFO
> 0000000020010A9C EXP.EC.CC.PCC0.PMC.DIRECT_CONTROLS
> 0000000020010A9D ECP.PC.PMU.SPR_CORE.RAS_MODEREG
> 0000000020010A9E EXP.EC.CC.PCC0.COMMON.POW.THROTTLE_CONTROL
> 0000000020010A9F EXP.EC.CC.PCC0.TFDP.TFP.SPURR_FREQ_DETECT_CYC_CNT
> 0000000020010AA0 EXP.EC.CC.PCC0.TFDP.TFP.SPURR_FREQ_SCALE
> 0000000020010AA1 EXP.EC.CC.PCC0.TFDP.TFP.SPURR_FREQ_REF
> 0000000020010AA2 EXP.EC.CC.PCC0.TFDP.TFP.PWM_EVENTS
> 0000000020010AA3 EXP.EC.CC.PCC0.TOD_READ
> 0000000020010AA3 EXP.EC.CC.PCC0.TOD_SYNC000
> 0000000020010AA3 EXP.EC.CC.PCC0.TOD_SYNC001
> 0000000020010AA3 EXP.EC.CC.PCC0.TOD_SYNC010
> 0000000020010AA3 EXP.EC.CC.PCC0.TOD_SYNC011
> 0000000020010AA3 EXP.EC.CC.PCC0.TOD_SYNC100
> 0000000020010AA3 EXP.EC.CC.PCC0.TOD_SYNC101
> 0000000020010AA3 EXP.EC.CC.PCC0.TOD_SYNC110
> 0000000020010AA3 EXP.EC.CC.PCC0.TOD_SYNC111
> 0000000020010AA4 EXP.EC.CC.PCC0.COMMON.TFC.TOD_STEP_CHECK
> 0000000020010AA5 ECP.PC.PMU.SPR_CORE.SHID0
> 0000000020010AA6 ECP.PC.PMU.SPR_CORE.HV_STATE
> 0000000020010AA7 ECP.PC.PMU.SPR_CORE.CORE_FUSES
> 0000000020010AA8 ECP.PC.IMA.IMA_EVENT_MASK
> 0000000020010AA9 ECP.PC.IMA.IMA_TRACE
> 0000000020010AAA ECP.PC.T0_PMU_SCOM
> 0000000020010AAB ECP.PC.T1_PMU_SCOM
> 0000000020010AAC ECP.PC.T2_PMU_SCOM
> 0000000020010AAD ECP.PC.T3_PMU_SCOM
> 0000000020010AAE ECP.PC.PMU.PMUC.SIER_MASK
> 0000000020010AAF ECP.PC.PMU.PMUC.SRC_MASK
> 0000000020010AB0 ECP.PC.PMU.SPR_CORE.PMU_SCOMC
> 0000000020010AB2 ECP.PC.PMU.SPR_CORE.PMU_SCOMC_EN
> 0000000020010AB3 EXP.EC.CC.PCC0.PMC.CORE_THREAD_STATE
> 0000000020010AB4 ECP.PC.PMU.SPR_CORE.INV_ERATE
> 0000000020010AB5 ECP.PC.PMU.SPR_CORE.SPR_CORE_HOLD_OUT
> 0000000020010AB6 ECP.PC.PMU.SPR_CORE.PMU_HOLD_OUT
> 0000000020010AB7 EXP.EC.CC.PCC0.COMMON.SPR_COMMON.TFAC_HOLD_OUT
> 0000000020010AB8 EXP.EC.CC.PCC0.COMMON.SPR_COMMON.SPR_COMMON_HOLD_OUT 
> ---
>  core/Makefile.inc       |   1 +
>  core/sreset.c           | 237 ++++++++++++++++++++++++++++++++++++++++++++++++
>  hw/xscom.c              |   2 +
>  include/skiboot.h       |   3 +
>  platforms/mambo/mambo.c |   3 +-
>  5 files changed, 245 insertions(+), 1 deletion(-)
>  create mode 100644 core/sreset.c
> 
> diff --git a/core/Makefile.inc b/core/Makefile.inc
> index f2de2f64..16204978 100644
> --- a/core/Makefile.inc
> +++ b/core/Makefile.inc
> @@ -9,6 +9,7 @@ CORE_OBJS += vpd.o hostservices.o platform.o nvram.o nvram-format.o hmi.o
>  CORE_OBJS += console-log.o ipmi.o time-utils.o pel.o pool.o errorlog.o
>  CORE_OBJS += timer.o i2c.o rtc.o flash.o sensor.o ipmi-opal.o
>  CORE_OBJS += flash-subpartition.o bitmap.o buddy.o pci-quirk.o powercap.o psr.o
> +CORE_OBJS += sreset.o
>  
>  ifeq ($(SKIBOOT_GCOV),1)
>  CORE_OBJS += gcov-profiling.o
> diff --git a/core/sreset.c b/core/sreset.c
> new file mode 100644
> index 00000000..ff20fe71
> --- /dev/null
> +++ b/core/sreset.c
> @@ -0,0 +1,237 @@
> +/* Copyright 2017 IBM Corp.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at
> + *
> + * 	http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> + * implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#include <skiboot.h>
> +#include <cpu.h>
> +#include <fsp.h>
> +#include <psi.h>
> +#include <opal.h>
> +#include <xscom.h>
> +#include <interrupts.h>
> +#include <cec.h>
> +#include <timebase.h>
> +#include <pci.h>
> +#include <chip.h>
> +#include <chiptod.h>
> +#include <ipmi.h>
> +
> +#define P9_RAS_STATUS			0x10a02
> +#define P9_RSTAT_QUIESCED(t)		PPC_BITMASK(0 + 8*(t), 3 + 8*(t))
> +#define P9_RAS_MODEREG			0x10a9d
> +#define P9_DIRECT_CONTROLS		0x10a9c
> +#define P9_DCTL_STOP(t)			PPC_BIT(7 + 8*(t))
> +#define P9_DCTL_CONT(t)			PPC_BIT(6 + 8*(t))
> +#define P9_DCTL_SRESET(t)		PPC_BIT(4 + 8*(t))
> +#define P9_DCTL_PWR(t)			PPC_BIT(32 + 8*(t))
> +
> +#define P9_CORE_THREAD_STATE		0x10ab3
> +#define P9_CTS_STOP(t)			PPC_BIT(56 + (t))
> +
> +#define PPM_GPMMR			0xf0100
> +#define PPM_SPWKUP_OTR			0xf010a
> +#define SPECIAL_WKUP_DONE		PPC_BIT(1)
> +
> +
> +static int core_set_special_wakeup(struct cpu_thread *cpu)
> +{
> +	uint32_t chip_id = pir_to_chip_id(cpu->pir);
> +	uint32_t core_id = pir_to_core_id(cpu->pir);
> +	uint32_t swake_addr;
> +	uint32_t gpmmr_addr;
> +	uint64_t val;
> +	int i;
> +
> +	swake_addr = XSCOM_ADDR_P9_EC(core_id, PPM_SPWKUP_OTR);
> +	gpmmr_addr = XSCOM_ADDR_P9_EC(core_id, PPM_GPMMR);
> +
> +	xscom_read(chip_id, swake_addr, &val);
> +	if (xscom_write(chip_id, swake_addr, PPC_BIT(0))) {
> +		prlog(PR_WARNING, "SRESET: Unable to write SPWKUP_OTR register\n");
> +		return OPAL_HARDWARE;
> +	}
> +	xscom_read(chip_id, swake_addr, &val);
> +
> +	for (i = 0; i < 100; i++) {
> +		if (xscom_read(chip_id, gpmmr_addr, &val)) {
> +			prlog(PR_WARNING, "SRESET: Unable to read GPMMR register\n");
> +			return OPAL_HARDWARE;
> +		}
> +		if (val & SPECIAL_WKUP_DONE)
> +			return 0;
> +
> +		time_wait_us(1);
> +	}
> +
> +	xscom_read(chip_id, swake_addr, &val);
> +	xscom_write(chip_id, swake_addr, 0);
> +	xscom_read(chip_id, swake_addr, &val);
> +
> +	prlog(PR_WARNING, "SRESET: Special wakeup mode could not be set.\n");
> +	return OPAL_HARDWARE;
> +}
> +
> +static void core_clear_special_wakeup(struct cpu_thread *cpu)
> +{
> +	uint32_t chip_id = pir_to_chip_id(cpu->pir);
> +	uint32_t core_id = pir_to_core_id(cpu->pir);
> +	uint32_t swake_addr;
> +	uint64_t val;
> +
> +	swake_addr = XSCOM_ADDR_P9_EC(core_id, PPM_SPWKUP_OTR);
> +
> +	/* De-assert special wakeup bit */
> +	xscom_read(chip_id, swake_addr, &val);
> +	xscom_write(chip_id, swake_addr, 0);
> +	xscom_read(chip_id, swake_addr, &val);
> +}
> +
> +static int thread_quiesced(struct cpu_thread *cpu)
> +{
> +	uint32_t chip_id = pir_to_chip_id(cpu->pir);
> +	uint32_t core_id = pir_to_core_id(cpu->pir);
> +	uint32_t thread_id = pir_to_thread_id(cpu->pir);
> +	uint32_t ras_addr;
> +	uint64_t ras_status;
> +
> +	ras_addr = XSCOM_ADDR_P9_EC(core_id, P9_RAS_STATUS);
> +	if (xscom_read(chip_id, ras_addr, &ras_status)) {
> +		prlog(PR_WARNING, "SRESET: Unable to read status register\n");
> +		return OPAL_HARDWARE;
> +	}
> +
> +	if ((ras_status & P9_RSTAT_QUIESCED(thread_id))
> +		   	== P9_RSTAT_QUIESCED(thread_id))
> +		return 1;
> +
> +	return 0;
> +}
> +
> +static int stop_thread(struct cpu_thread *cpu)
> +{
> +	uint32_t chip_id = pir_to_chip_id(cpu->pir);
> +	uint32_t core_id = pir_to_core_id(cpu->pir);
> +	uint32_t thread_id = pir_to_thread_id(cpu->pir);
> +	uint32_t dctl_addr;
> +	int i;
> +
> +	dctl_addr = XSCOM_ADDR_P9_EC(core_id, P9_DIRECT_CONTROLS);
> +
> +	xscom_write(chip_id, dctl_addr, P9_DCTL_STOP(thread_id));
> +
> +	for (i = 0; i < 100; i++) {
> +		int rc = thread_quiesced(cpu);
> +		if (rc < 0)
> +			break;
> +		if (rc)
> +			return 0;
> +	}
> +
> +	xscom_write(chip_id, dctl_addr, P9_DCTL_CONT(thread_id));
> +	prlog(PR_WARNING, "SRESET: Could not quiesce thread\n");
> +	return OPAL_HARDWARE;
> +}
> +
> +static int sreset_thread(struct cpu_thread *cpu)
> +{
> +	uint32_t chip_id = pir_to_chip_id(cpu->pir);
> +	uint32_t core_id = pir_to_core_id(cpu->pir);
> +	uint32_t thread_id = pir_to_thread_id(cpu->pir);
> +	uint32_t dctl_addr;
> +	uint32_t cts_addr;
> +	uint64_t cts_val;
> +
> +	dctl_addr = XSCOM_ADDR_P9_EC(core_id, P9_DIRECT_CONTROLS);
> +	cts_addr = XSCOM_ADDR_P9_EC(core_id, P9_CORE_THREAD_STATE);
> +
> +	if (xscom_read(chip_id, cts_addr, &cts_val)) {
> +		prlog(PR_WARNING, "SRESET: Unable to read CORE_THREAD_STATE register\n");
> +		return OPAL_HARDWARE;
> +	}
> +	if (!(cts_val & P9_CTS_STOP(thread_id))) {
> +		/* Clear SRR1[46:47] */
> +		if (xscom_write(chip_id, dctl_addr, P9_DCTL_PWR(thread_id))) {
> +			prlog(PR_WARNING, "SRESET: Unable to set power saving mode\n");
> +			return OPAL_HARDWARE;
> +		}
> +	}
> +
> +	if (xscom_write(chip_id, dctl_addr, P9_DCTL_SRESET(thread_id))) {
> +		prlog(PR_WARNING, "SRESET: Unable to write DIRECT_CONTROLS register\n");
> +		return OPAL_HARDWARE;
> +	}
> +
> +	return 0;
> +}
> +
> +// static struct lock sreset_lock = LOCK_UNLOCKED;
> +
> +static int64_t sreset_cpu(struct cpu_thread *cpu)
> +{
> +	int rc;
> +
> +	if (this_cpu() == cpu) {
> +		prlog(PR_WARNING, "SRESET: Unable to reset self\n");
> +		return OPAL_UNSUPPORTED;
> +	}
> +	if (this_cpu()->primary == cpu->primary) {
> +		prlog(PR_WARNING, "SRESET: Unable to reset threads on same core\n");
> +		return OPAL_PARTIAL;
> +	}
> +
> +	rc = thread_quiesced(cpu);
> +	if (rc < 0)
> +		return rc;
> +	if (rc) {
> +		prlog(PR_WARNING, "SRESET: Thread is quiesced already\n");
> +		return OPAL_WRONG_STATE;
> +	}
> +
> +	rc = core_set_special_wakeup(cpu);
> +	if (rc)
> +		return rc;
> +
> +	rc = stop_thread(cpu);
> +	if (rc) {
> +		core_clear_special_wakeup(cpu);
> +		return rc;
> +	}
> +
> +	rc = sreset_thread(cpu);
> +
> +	core_clear_special_wakeup(cpu);
> +
> +	return 0;
> +}
> +
> +int64_t signal_system_reset(int cpu_nr)
> +{
> +	struct cpu_thread *cpu;
> +
> +	if (proc_gen != proc_gen_p9)
> +		return OPAL_UNSUPPORTED;
> +
> +	/* Reset a single CPU */
> +	if (cpu_nr >= 0) {
> +		cpu = find_cpu_by_server(cpu_nr);
> +		if (!cpu) {
> +			printf("SRESET: could not find cpu by server %d\n", cpu_nr);
> +			return OPAL_PARAMETER;
> +		}
> +		return sreset_cpu(cpu);
> +	}
> +	printf("SRESET: unsupported %d\n", cpu_nr);
> +	return OPAL_PARTIAL;
> +}
> diff --git a/hw/xscom.c b/hw/xscom.c
> index 7bd78bf9..f3e04291 100644
> --- a/hw/xscom.c
> +++ b/hw/xscom.c
> @@ -705,6 +705,8 @@ static void xscom_init_chip_info(struct proc_chip *chip)
>  		printf("P9 DD%i.%i%d detected\n", 0xf & (chip->ec_level >> 4),
>  		       chip->ec_level & 0xf, rev);
>  		chip->ec_rev = rev;
> +
> +		opal_register(OPAL_SIGNAL_SYSTEM_RESET, signal_system_reset, 1);
>  	}
>  }
>  
> diff --git a/include/skiboot.h b/include/skiboot.h
> index 4b7d5197..37fd774f 100644
> --- a/include/skiboot.h
> +++ b/include/skiboot.h
> @@ -198,6 +198,9 @@ extern char __sym_map_end[];
>  extern unsigned long get_symbol(unsigned long addr,
>  				char **sym, char **sym_end);
>  
> +/* System reset */
> +extern int64_t signal_system_reset(int cpu_nr);
> +
>  /* Fast reboot support */
>  extern void disable_fast_reboot(const char *reason);
>  extern void fast_reboot(void);
> diff --git a/platforms/mambo/mambo.c b/platforms/mambo/mambo.c
> index cb6e103c..e306ba5c 100644
> --- a/platforms/mambo/mambo.c
> +++ b/platforms/mambo/mambo.c
> @@ -259,7 +259,8 @@ static int64_t mambo_signal_system_reset(int32_t cpu_nr)
>  
>  static void mambo_sreset_init(void)
>  {
> -	opal_register(OPAL_SIGNAL_SYSTEM_RESET, mambo_signal_system_reset, 1);
> +	if (0)
> +		opal_register(OPAL_SIGNAL_SYSTEM_RESET, mambo_signal_system_reset, 1);
>  }
>  
>  static void mambo_platform_init(void)

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH 2/2] powerpc/powernv: implement NMI IPIs with OPAL_SIGNAL_SYSTEM_RESET
  2017-09-12 16:05 ` [RFC PATCH 2/2] powerpc/powernv: implement NMI IPIs with OPAL_SIGNAL_SYSTEM_RESET Nicholas Piggin
@ 2017-09-13 13:13   ` Nicholas Piggin
  2017-09-14  2:24     ` Benjamin Herrenschmidt
  2017-09-14 11:26   ` Nicholas Piggin
  1 sibling, 1 reply; 11+ messages in thread
From: Nicholas Piggin @ 2017-09-13 13:13 UTC (permalink / raw)
  To: linuxppc-dev, skiboot; +Cc: Benjamin Herrenschmidt, Alistair Popple

On Wed, 13 Sep 2017 02:05:53 +1000
Nicholas Piggin <npiggin@gmail.com> wrote:

> There are two complications. The first is that sreset from stop states
> come in with SRR1 set to do a powersave wakeup, with an sreset reason
> encoded.
> 
> The second is that threads on the same core can't be signalled directly
> so we must designate a bounce CPU to reflect the IPI back.

Here is an updated Linux patch for the latest OPAL patch. It also has
a few assorted fixes to make it work nicely; I have rolled them into
one patch here to make it easy to apply when testing the OPAL patch.

Thanks,
Nick

---
 arch/powerpc/include/asm/opal-api.h            |  1 +
 arch/powerpc/include/asm/opal.h                |  2 +
 arch/powerpc/kernel/irq.c                      | 18 ++++++
 arch/powerpc/kernel/watchdog.c                 | 30 +++++++--
 arch/powerpc/platforms/powernv/opal-wrappers.S |  1 +
 arch/powerpc/platforms/powernv/powernv.h       |  1 +
 arch/powerpc/platforms/powernv/setup.c         |  3 +
 arch/powerpc/platforms/powernv/smp.c           | 89 ++++++++++++++++++++++++++
 arch/powerpc/xmon/xmon.c                       | 17 +++--
 9 files changed, 151 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h
index 450a60b81d2a..e39f4236b413 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -188,6 +188,7 @@
 #define OPAL_XIVE_DUMP				142
 #define OPAL_XIVE_RESERVED3			143
 #define OPAL_XIVE_RESERVED4			144
+#define OPAL_SIGNAL_SYSTEM_RESET                145
 #define OPAL_NPU_INIT_CONTEXT			146
 #define OPAL_NPU_DESTROY_CONTEXT		147
 #define OPAL_NPU_MAP_LPAR			148
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 726c23304a57..7d7613c49f2b 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -281,6 +281,8 @@ int opal_get_power_shift_ratio(u32 handle, int token, u32 *psr);
 int opal_set_power_shift_ratio(u32 handle, int token, u32 psr);
 int opal_sensor_group_clear(u32 group_hndl, int token);
 
+int64_t opal_signal_system_reset(int32_t cpu);
+
 /* Internal functions */
 extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
 				   int depth, void *data);
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 4e65bf82f5e0..5f2c0367bab2 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -407,10 +407,28 @@ static const u8 srr1_to_lazyirq[0x10] = {
 	PACA_IRQ_HMI,
 	0, 0, 0, 0, 0 };
 
+/*
+ * System reset does not have to wait for Linux interrupts
+ * to be re-enabled, so just replay it now.
+ */
+static noinline void replay_system_reset(void)
+{
+	struct pt_regs regs;
+
+	ppc_save_regs(&regs);
+
+	get_paca()->in_nmi = 1;
+	system_reset_exception(&regs);
+	get_paca()->in_nmi = 0;
+}
+
 void irq_set_pending_from_srr1(unsigned long srr1)
 {
 	unsigned int idx = (srr1 & SRR1_WAKEMASK_P8) >> 18;
 
+	if (unlikely(idx == 4))
+		replay_system_reset();
+
 	/*
 	 * The 0 index (SRR1[42:45]=b0000) must always evaluate to 0,
 	 * so this can be called unconditionally with srr1 wake reason.
diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index 2f6eadd9408d..a6aa85b0cdeb 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -61,6 +61,7 @@ static DEFINE_PER_CPU(u64, wd_timer_tb);
  */
 static unsigned long __wd_smp_lock;
 static cpumask_t wd_smp_cpus_pending;
+static cpumask_t wd_smp_cpus_stuck_tmp;
 static cpumask_t wd_smp_cpus_stuck;
 static u64 wd_smp_last_reset_tb;
 
@@ -97,8 +98,7 @@ static void wd_lockup_ipi(struct pt_regs *regs)
 	else
 		dump_stack();
 
-	if (hardlockup_panic)
-		nmi_panic(regs, "Hard LOCKUP");
+	/* Do not panic from here because that can recurse into NMI IPI layer */
 }
 
 static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
@@ -136,16 +136,29 @@ static void watchdog_smp_panic(int cpu, u64 tb)
 
 	/*
 	 * Try to trigger the stuck CPUs.
+	 *
+	 * There is a bit of a hack for OPAL here because it can not
+	 * signal sibling threads. Don't try to signal those or mark
+	 * them stuck, in the hope that another core will notice.
 	 */
+	cpumask_clear(&wd_smp_cpus_stuck_tmp);
 	for_each_cpu(c, &wd_smp_cpus_pending) {
 		if (c == cpu)
 			continue;
-		smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
+		if (firmware_has_feature(FW_FEATURE_OPAL)) {
+			if (cpumask_test_cpu(c, cpu_sibling_mask(cpu)))
+				continue;
+		}
+		cpumask_set_cpu(c, &wd_smp_cpus_stuck_tmp);
+		if (!sysctl_hardlockup_all_cpu_backtrace)
+			smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
 	}
-	smp_flush_nmi_ipi(1000000);
 
 	/* Take the stuck CPUs out of the watch group */
-	set_cpumask_stuck(&wd_smp_cpus_pending, tb);
+	set_cpumask_stuck(&wd_smp_cpus_stuck_tmp, tb);
+
+	if (!sysctl_hardlockup_all_cpu_backtrace)
+		smp_flush_nmi_ipi(1000000);
 
 	wd_smp_unlock(&flags);
 
@@ -275,9 +288,12 @@ void arch_touch_nmi_watchdog(void)
 {
 	unsigned long ticks = tb_ticks_per_usec * wd_timer_period_ms * 1000;
 	int cpu = smp_processor_id();
+	u64 tb = get_tb();
 
-	if (get_tb() - per_cpu(wd_timer_tb, cpu) >= ticks)
-		watchdog_timer_interrupt(cpu);
+	if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) {
+		per_cpu(wd_timer_tb, cpu) = tb;
+		wd_smp_clear_cpu_pending(cpu, tb);
+	}
 }
 EXPORT_SYMBOL(arch_touch_nmi_watchdog);
 
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index 8c1ede2d3f7e..37cd170201a2 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -307,6 +307,7 @@ OPAL_CALL(opal_xive_get_vp_info,		OPAL_XIVE_GET_VP_INFO);
 OPAL_CALL(opal_xive_set_vp_info,		OPAL_XIVE_SET_VP_INFO);
 OPAL_CALL(opal_xive_sync,			OPAL_XIVE_SYNC);
 OPAL_CALL(opal_xive_dump,			OPAL_XIVE_DUMP);
+OPAL_CALL(opal_signal_system_reset,		OPAL_SIGNAL_SYSTEM_RESET);
 OPAL_CALL(opal_npu_init_context,		OPAL_NPU_INIT_CONTEXT);
 OPAL_CALL(opal_npu_destroy_context,		OPAL_NPU_DESTROY_CONTEXT);
 OPAL_CALL(opal_npu_map_lpar,			OPAL_NPU_MAP_LPAR);
diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h
index a159d48573d7..49add2037e0d 100644
--- a/arch/powerpc/platforms/powernv/powernv.h
+++ b/arch/powerpc/platforms/powernv/powernv.h
@@ -3,6 +3,7 @@
 
 #ifdef CONFIG_SMP
 extern void pnv_smp_init(void);
+extern int pnv_system_reset_exception(struct pt_regs *regs);
 #else
 static inline void pnv_smp_init(void) { }
 #endif
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index 897aa1400eb8..4fdaa1d7c4cd 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -282,6 +282,9 @@ static void __init pnv_setup_machdep_opal(void)
 	ppc_md.restart = pnv_restart;
 	pm_power_off = pnv_power_off;
 	ppc_md.halt = pnv_halt;
+#ifdef CONFIG_SMP
+	ppc_md.system_reset_exception = pnv_system_reset_exception;
+#endif
 	ppc_md.machine_check_exception = opal_machine_check;
 	ppc_md.mce_check_early_recovery = opal_mce_check_early_recovery;
 	ppc_md.hmi_exception_early = opal_hmi_exception_early;
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index c17f81e433f7..9da97962c93a 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -290,6 +290,93 @@ static void __init pnv_smp_probe(void)
 	}
 }
 
+static int nmi_ipi_bounce_cpu;
+static int nmi_ipi_bounce_target_core;
+static int nmi_ipi_bounce_target_exclude;
+
+int pnv_system_reset_exception(struct pt_regs *regs)
+{
+	if (nmi_ipi_bounce_cpu == smp_processor_id()) {
+		int c;
+		nmi_ipi_bounce_cpu = -1;
+		for_each_online_cpu(c) {
+			if (!cpumask_test_cpu(c, cpu_sibling_mask(
+						nmi_ipi_bounce_target_core)))
+				continue;
+			if (c == nmi_ipi_bounce_target_exclude)
+				continue;
+			opal_signal_system_reset(
+					get_hard_smp_processor_id(c));
+			/* can't do much with failure here */
+		}
+	}
+
+	if (smp_handle_nmi_ipi(regs))
+		return 1;
+	return 0;
+}
+
+static int pnv_cause_nmi_ipi(int cpu)
+{
+	int64_t rc;
+
+	if (cpu >= 0) {
+		rc = opal_signal_system_reset(get_hard_smp_processor_id(cpu));
+		if (rc == OPAL_SUCCESS)
+			return 1;
+		return 0;
+	} else {
+		int c;
+
+		/*
+		 * Some platforms can not send NMI to sibling threads in
+		 * the same core. We can designate one inter-core target
+		 * to bounce NMIs back to our sibling threads.
+		 */
+
+		if (cpu >= 0) {
+			/*
+			 * Don't support bouncing unicast NMIs yet (because
+			 * that would have to raise an NMI on an unrelated
+			 * CPU. Revisit this if callers start using unicast.
+			 */
+			return 0;
+		}
+
+		nmi_ipi_bounce_cpu = -1;
+		nmi_ipi_bounce_target_core = -1;
+		nmi_ipi_bounce_target_exclude = -1;
+
+		for_each_online_cpu(c) {
+			if (cpumask_test_cpu(c, cpu_sibling_mask(smp_processor_id())))
+				continue;
+
+			if (nmi_ipi_bounce_cpu == -1) {
+				nmi_ipi_bounce_cpu = c;
+				nmi_ipi_bounce_target_core = smp_processor_id();
+				if (cpu == NMI_IPI_ALL_OTHERS)
+					nmi_ipi_bounce_target_exclude = smp_processor_id();
+				smp_mb();
+			} else {
+				rc = opal_signal_system_reset(
+						get_hard_smp_processor_id(c));
+				if (rc != OPAL_SUCCESS)
+					return 0;
+			}
+		}
+
+		if (nmi_ipi_bounce_cpu == -1)
+			return 0; /* could not find a bouncer */
+		rc = opal_signal_system_reset(
+				get_hard_smp_processor_id(nmi_ipi_bounce_cpu));
+		if (rc != OPAL_SUCCESS)
+			return 0;
+		return 1;
+	}
+
+	return 0;
+}
+
 static struct smp_ops_t pnv_smp_ops = {
 	.message_pass	= NULL, /* Use smp_muxed_ipi_message_pass */
 	.cause_ipi	= NULL,	/* Filled at runtime by pnv_smp_probe() */
@@ -308,6 +395,8 @@ static struct smp_ops_t pnv_smp_ops = {
 /* This is called very early during platform setup_arch */
 void __init pnv_smp_init(void)
 {
+	if (opal_check_token(OPAL_SIGNAL_SYSTEM_RESET))
+		pnv_smp_ops.cause_nmi_ipi = pnv_cause_nmi_ipi;
 	smp_ops = &pnv_smp_ops;
 
 #ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 33351c6704b1..d9a12102b111 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -530,14 +530,19 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
 
  waiting:
 	secondary = 1;
+	spin_begin();
 	while (secondary && !xmon_gate) {
 		if (in_xmon == 0) {
-			if (fromipi)
+			if (fromipi) {
+				spin_end();
 				goto leave;
+			}
 			secondary = test_and_set_bit(0, &in_xmon);
 		}
-		barrier();
+		spin_cpu_relax();
+		touch_nmi_watchdog();
 	}
+	spin_end();
 
 	if (!secondary && !xmon_gate) {
 		/* we are the first cpu to come in */
@@ -568,21 +573,25 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
 		mb();
 		xmon_gate = 1;
 		barrier();
+		touch_nmi_watchdog();
 	}
 
  cmdloop:
 	while (in_xmon) {
 		if (secondary) {
+			spin_begin();
 			if (cpu == xmon_owner) {
 				if (!test_and_set_bit(0, &xmon_taken)) {
 					secondary = 0;
+					spin_end();
 					continue;
 				}
 				/* missed it */
 				while (cpu == xmon_owner)
-					barrier();
+					spin_cpu_relax();
 			}
-			barrier();
+			spin_cpu_relax();
+			touch_nmi_watchdog();
 		} else {
 			cmd = cmds(regs);
 			if (cmd != 0) {
-- 
2.13.3

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH 1/2] core: implement OPAL_SIGNAL_SYSTEM_RESET with POWER9 scoms
  2017-09-12 23:18   ` Benjamin Herrenschmidt
@ 2017-09-13 13:27     ` Nicholas Piggin
  2017-09-14  2:27       ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 11+ messages in thread
From: Nicholas Piggin @ 2017-09-13 13:27 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev, skiboot, Alistair Popple

On Wed, 13 Sep 2017 09:18:34 +1000
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:

> On Wed, 2017-09-13 at 02:05 +1000, Nicholas Piggin wrote:
> > This implements a way to raise system reset interrupts on other
> > cores. This has not yet been tested on DD2 or with deeper sleep
> > states.  
> 
> Reminds me, we need to workaround a bug with XSCOMs on P9
> 
> PSCOMs to core in the range 20010A80-20010Ab8 (list below) can fail
> occasionally with an error of 4 (PCB_ADDRESS_ERROR). We need to
> (silently) retry up to 32 times.

[snip]

So, just put a loop into xscom_read and xscom_write for those
addresses for P9 chips?

Thanks,
Nick

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH 2/2] powerpc/powernv: implement NMI IPIs with OPAL_SIGNAL_SYSTEM_RESET
  2017-09-13 13:13   ` Nicholas Piggin
@ 2017-09-14  2:24     ` Benjamin Herrenschmidt
  2017-09-14  6:32       ` Nicholas Piggin
  0 siblings, 1 reply; 11+ messages in thread
From: Benjamin Herrenschmidt @ 2017-09-14  2:24 UTC (permalink / raw)
  To: Nicholas Piggin, linuxppc-dev, skiboot; +Cc: Alistair Popple

On Wed, 2017-09-13 at 23:13 +1000, Nicholas Piggin wrote:
> On Wed, 13 Sep 2017 02:05:53 +1000
> Nicholas Piggin <npiggin@gmail.com> wrote:
> 
> > There are two complications. The first is that sreset from stop states
> > come in with SRR1 set to do a powersave wakeup, with an sreset reason
> > encoded.
> > 
> > The second is that threads on the same core can't be signalled directly
> > so we must designate a bounce CPU to reflect the IPI back.
> 
> Here is an updated Linux patch for the latest OPAL patch. This has
> a few assorted fixes as well to make it work nicely, I roll them into
> one patch here to make it easy to apply for testing the OPAL patch.

Why can't you sreset threads of the same core on P9 ?

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH 1/2] core: implement OPAL_SIGNAL_SYSTEM_RESET with POWER9 scoms
  2017-09-13 13:27     ` Nicholas Piggin
@ 2017-09-14  2:27       ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 11+ messages in thread
From: Benjamin Herrenschmidt @ 2017-09-14  2:27 UTC (permalink / raw)
  To: Nicholas Piggin; +Cc: linuxppc-dev, skiboot, Alistair Popple

On Wed, 2017-09-13 at 23:27 +1000, Nicholas Piggin wrote:
> On Wed, 13 Sep 2017 09:18:34 +1000
> Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> 
> > On Wed, 2017-09-13 at 02:05 +1000, Nicholas Piggin wrote:
> > > This implements a way to raise system reset interrupts on other
> > > cores. This has not yet been tested on DD2 or with deeper sleep
> > > states.  
> > 
> > Reminds me, we need to workaround a bug with XSCOMs on P9
> > 
> > PSCOMs to core in the range 20010A80-20010Ab8 (list below) can fail
> > occasionally with an error of 4 (PCB_ADDRESS_ERROR). We need to
> > (silently) retry up to 32 times.
> 
> [snip]
> 
> So, just put a loop into xscom_read and xscom_write for those
> addresses for P9 chips?

Right. Well, the top byte of the address needs filtering since it's the
target core, i.e., 0x20 is core 0, 0x21 is core 1, etc., up to 0x37.

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH 2/2] powerpc/powernv: implement NMI IPIs with OPAL_SIGNAL_SYSTEM_RESET
  2017-09-14  2:24     ` Benjamin Herrenschmidt
@ 2017-09-14  6:32       ` Nicholas Piggin
  2017-09-14  6:43         ` Alistair Popple
  0 siblings, 1 reply; 11+ messages in thread
From: Nicholas Piggin @ 2017-09-14  6:32 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev, skiboot, Alistair Popple

On Thu, 14 Sep 2017 12:24:49 +1000
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:

> On Wed, 2017-09-13 at 23:13 +1000, Nicholas Piggin wrote:
> > On Wed, 13 Sep 2017 02:05:53 +1000
> > Nicholas Piggin <npiggin@gmail.com> wrote:
> >   
> > > There are two complications. The first is that sreset from stop states
> > > come in with SRR1 set to do a powersave wakeup, with an sreset reason
> > > encoded.
> > > 
> > > The second is that threads on the same core can't be signalled directly
> > > so we must designate a bounce CPU to reflect the IPI back.  
> > 
> > Here is an updated Linux patch for the latest OPAL patch. This has
> > a few assorted fixes as well to make it work nicely, I roll them into
> > one patch here to make it easy to apply for testing the OPAL patch.  
> 
> Why can't you sreset threads of the same core on P9 ?

It looks like we can. I think I had some other bugs still not ironed
out when I previously tested it.

That simplifies things a lot on the Linux side. It may be that the
bounce is still required if we implement it on POWER8 using ramming,
but I'll get the POWER9 code in first.

Thanks,
Nick

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH 2/2] powerpc/powernv: implement NMI IPIs with OPAL_SIGNAL_SYSTEM_RESET
  2017-09-14  6:32       ` Nicholas Piggin
@ 2017-09-14  6:43         ` Alistair Popple
  0 siblings, 0 replies; 11+ messages in thread
From: Alistair Popple @ 2017-09-14  6:43 UTC (permalink / raw)
  To: Nicholas Piggin; +Cc: Benjamin Herrenschmidt, linuxppc-dev, skiboot

On Thu, 14 Sep 2017 04:32:28 PM Nicholas Piggin wrote:
> On Thu, 14 Sep 2017 12:24:49 +1000
> Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> 
> > On Wed, 2017-09-13 at 23:13 +1000, Nicholas Piggin wrote:
> > > On Wed, 13 Sep 2017 02:05:53 +1000
> > > Nicholas Piggin <npiggin@gmail.com> wrote:
> > >   
> > > > There are two complications. The first is that sreset from stop states
> > > > come in with SRR1 set to do a powersave wakeup, with an sreset reason
> > > > encoded.
> > > > 
> > > > The second is that threads on the same core can't be signalled directly
> > > > so we must designate a bounce CPU to reflect the IPI back.  
> > > 
> > > Here is an updated Linux patch for the latest OPAL patch. This has
> > > a few assorted fixes as well to make it work nicely, I roll them into
> > > one patch here to make it easy to apply for testing the OPAL patch.  
> > 
> > Why can't you sreset threads of the same core on P9 ?
> 
> It looks like we can, I think I had some other bugs still not ironed
> out when I previously tested it.
> 
> That simplifies things a lot on the Linux side. It may be that the
> bounce is still required if we implement it on POWER8 using ramming,
> but I'll get the POWER9 code in first.

Right, the bouncing is still required on P8 because we need to ram instructions
and you can only ram instructions if all threads on a core are quiesced.

- Alistair

>
> Thanks,
> Nick

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH 2/2] powerpc/powernv: implement NMI IPIs with OPAL_SIGNAL_SYSTEM_RESET
  2017-09-12 16:05 ` [RFC PATCH 2/2] powerpc/powernv: implement NMI IPIs with OPAL_SIGNAL_SYSTEM_RESET Nicholas Piggin
  2017-09-13 13:13   ` Nicholas Piggin
@ 2017-09-14 11:26   ` Nicholas Piggin
  1 sibling, 0 replies; 11+ messages in thread
From: Nicholas Piggin @ 2017-09-14 11:26 UTC (permalink / raw)
  To: linuxppc-dev, skiboot; +Cc: Benjamin Herrenschmidt, Alistair Popple

On Wed, 13 Sep 2017 02:05:53 +1000
Nicholas Piggin <npiggin@gmail.com> wrote:

> There are two complications. The first is that sreset from stop states
> come in with SRR1 set to do a powersave wakeup, with an sreset reason
> encoded.
> 
> The second is that threads on the same core can't be signalled directly
> so we must designate a bounce CPU to reflect the IPI back.

This is a revised patch with only DD2 enablement. DD2 allows threads on
the same core to be IPIed. It's much simpler, and most of the code is
fixing the watchdog and preventing it from triggering from xmon (which
will be split into other patches of course).

It's probably a better starting point to get this working and merged
first, and then revisit bouncing.

---
 arch/powerpc/include/asm/opal-api.h            |  1 +
 arch/powerpc/include/asm/opal.h                |  2 ++
 arch/powerpc/kernel/irq.c                      | 20 ++++++++++++++++++
 arch/powerpc/kernel/watchdog.c                 | 29 +++++++++++++++-----------
 arch/powerpc/platforms/powernv/opal-wrappers.S |  1 +
 arch/powerpc/platforms/powernv/powernv.h       |  1 +
 arch/powerpc/platforms/powernv/setup.c         |  3 +++
 arch/powerpc/platforms/powernv/smp.c           | 24 +++++++++++++++++++++
 arch/powerpc/xmon/xmon.c                       | 17 +++++++++++----
 9 files changed, 82 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h
index 450a60b81d2a..9d191ebea706 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -188,6 +188,7 @@
 #define OPAL_XIVE_DUMP				142
 #define OPAL_XIVE_RESERVED3			143
 #define OPAL_XIVE_RESERVED4			144
+#define OPAL_SIGNAL_SYSTEM_RESET		145
 #define OPAL_NPU_INIT_CONTEXT			146
 #define OPAL_NPU_DESTROY_CONTEXT		147
 #define OPAL_NPU_MAP_LPAR			148
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 726c23304a57..7d7613c49f2b 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -281,6 +281,8 @@ int opal_get_power_shift_ratio(u32 handle, int token, u32 *psr);
 int opal_set_power_shift_ratio(u32 handle, int token, u32 psr);
 int opal_sensor_group_clear(u32 group_hndl, int token);
 
+int64_t opal_signal_system_reset(int32_t cpu);
+
 /* Internal functions */
 extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
 				   int depth, void *data);
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 4e65bf82f5e0..8ffebb9437e5 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -407,11 +407,31 @@ static const u8 srr1_to_lazyirq[0x10] = {
 	PACA_IRQ_HMI,
 	0, 0, 0, 0, 0 };
 
+static noinline void replay_system_reset(void)
+{
+	struct pt_regs regs;
+
+	ppc_save_regs(&regs);
+
+	get_paca()->in_nmi = 1;
+	system_reset_exception(&regs);
+	get_paca()->in_nmi = 0;
+}
+
 void irq_set_pending_from_srr1(unsigned long srr1)
 {
 	unsigned int idx = (srr1 & SRR1_WAKEMASK_P8) >> 18;
 
 	/*
+	 * 0100b SRR1 reason is system reset. Take it now,
+	 * which is immediately after registers are restored
+	 * from idle. It's an NMI, so interrupts needn't be
+	 * re-enabled.
+	 */
+	if (unlikely(idx == 4))
+		replay_system_reset();
+
+	/*
 	 * The 0 index (SRR1[42:45]=b0000) must always evaluate to 0,
 	 * so this can be called unconditionally with srr1 wake reason.
 	 */
diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index 2f6eadd9408d..1fb9379dc683 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -97,8 +97,7 @@ static void wd_lockup_ipi(struct pt_regs *regs)
 	else
 		dump_stack();
 
-	if (hardlockup_panic)
-		nmi_panic(regs, "Hard LOCKUP");
+	/* Do not panic from here because that can recurse into NMI IPI layer */
 }
 
 static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
@@ -134,15 +133,18 @@ static void watchdog_smp_panic(int cpu, u64 tb)
 	pr_emerg("Watchdog CPU:%d detected Hard LOCKUP other CPUS:%*pbl\n",
 			cpu, cpumask_pr_args(&wd_smp_cpus_pending));
 
-	/*
-	 * Try to trigger the stuck CPUs.
-	 */
-	for_each_cpu(c, &wd_smp_cpus_pending) {
-		if (c == cpu)
-			continue;
-		smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
+	if (!sysctl_hardlockup_all_cpu_backtrace) {
+		/*
+		 * Try to trigger the stuck CPUs, unless we are going to
+		 * get a backtrace on all of them anyway.
+		 */
+		for_each_cpu(c, &wd_smp_cpus_pending) {
+			if (c == cpu)
+				continue;
+			smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
+		}
+		smp_flush_nmi_ipi(1000000);
 	}
-	smp_flush_nmi_ipi(1000000);
 
 	/* Take the stuck CPUs out of the watch group */
 	set_cpumask_stuck(&wd_smp_cpus_pending, tb);
@@ -275,9 +277,12 @@ void arch_touch_nmi_watchdog(void)
 {
 	unsigned long ticks = tb_ticks_per_usec * wd_timer_period_ms * 1000;
 	int cpu = smp_processor_id();
+	u64 tb = get_tb();
 
-	if (get_tb() - per_cpu(wd_timer_tb, cpu) >= ticks)
-		watchdog_timer_interrupt(cpu);
+	if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) {
+		per_cpu(wd_timer_tb, cpu) = tb;
+		wd_smp_clear_cpu_pending(cpu, tb);
+	}
 }
 EXPORT_SYMBOL(arch_touch_nmi_watchdog);
 
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index 8c1ede2d3f7e..37cd170201a2 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -307,6 +307,7 @@ OPAL_CALL(opal_xive_get_vp_info,		OPAL_XIVE_GET_VP_INFO);
 OPAL_CALL(opal_xive_set_vp_info,		OPAL_XIVE_SET_VP_INFO);
 OPAL_CALL(opal_xive_sync,			OPAL_XIVE_SYNC);
 OPAL_CALL(opal_xive_dump,			OPAL_XIVE_DUMP);
+OPAL_CALL(opal_signal_system_reset,		OPAL_SIGNAL_SYSTEM_RESET);
 OPAL_CALL(opal_npu_init_context,		OPAL_NPU_INIT_CONTEXT);
 OPAL_CALL(opal_npu_destroy_context,		OPAL_NPU_DESTROY_CONTEXT);
 OPAL_CALL(opal_npu_map_lpar,			OPAL_NPU_MAP_LPAR);
diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h
index a159d48573d7..49add2037e0d 100644
--- a/arch/powerpc/platforms/powernv/powernv.h
+++ b/arch/powerpc/platforms/powernv/powernv.h
@@ -3,6 +3,7 @@
 
 #ifdef CONFIG_SMP
 extern void pnv_smp_init(void);
+extern int pnv_system_reset_exception(struct pt_regs *regs);
 #else
 static inline void pnv_smp_init(void) { }
 #endif
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index 897aa1400eb8..4fdaa1d7c4cd 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -282,6 +282,9 @@ static void __init pnv_setup_machdep_opal(void)
 	ppc_md.restart = pnv_restart;
 	pm_power_off = pnv_power_off;
 	ppc_md.halt = pnv_halt;
+#ifdef CONFIG_SMP
+	ppc_md.system_reset_exception = pnv_system_reset_exception;
+#endif
 	ppc_md.machine_check_exception = opal_machine_check;
 	ppc_md.mce_check_early_recovery = opal_mce_check_early_recovery;
 	ppc_md.hmi_exception_early = opal_hmi_exception_early;
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index c17f81e433f7..83343832e07e 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -290,6 +290,28 @@ static void __init pnv_smp_probe(void)
 	}
 }
 
+int pnv_system_reset_exception(struct pt_regs *regs)
+{
+	if (smp_handle_nmi_ipi(regs))
+		return 1;
+	return 0;
+}
+
+static int pnv_cause_nmi_ipi(int cpu)
+{
+	int64_t rc;
+
+	rc = opal_signal_system_reset(cpu);
+	if (rc == OPAL_SUCCESS)
+		return 1;
+
+	/*
+	 * Don't cope with OPAL_PARTIAL yet (just punt to regular IPI)
+	 */
+
+	return 0;
+}
+
 static struct smp_ops_t pnv_smp_ops = {
 	.message_pass	= NULL, /* Use smp_muxed_ipi_message_pass */
 	.cause_ipi	= NULL,	/* Filled at runtime by pnv_smp_probe() */
@@ -308,6 +330,8 @@ static struct smp_ops_t pnv_smp_ops = {
 /* This is called very early during platform setup_arch */
 void __init pnv_smp_init(void)
 {
+	if (opal_check_token(OPAL_SIGNAL_SYSTEM_RESET))
+		pnv_smp_ops.cause_nmi_ipi = pnv_cause_nmi_ipi;
 	smp_ops = &pnv_smp_ops;
 
 #ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 33351c6704b1..d9a12102b111 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -530,14 +530,19 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
 
  waiting:
 	secondary = 1;
+	spin_begin();
 	while (secondary && !xmon_gate) {
 		if (in_xmon == 0) {
-			if (fromipi)
+			if (fromipi) {
+				spin_end();
 				goto leave;
+			}
 			secondary = test_and_set_bit(0, &in_xmon);
 		}
-		barrier();
+		spin_cpu_relax();
+		touch_nmi_watchdog();
 	}
+	spin_end();
 
 	if (!secondary && !xmon_gate) {
 		/* we are the first cpu to come in */
@@ -568,21 +573,25 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
 		mb();
 		xmon_gate = 1;
 		barrier();
+		touch_nmi_watchdog();
 	}
 
  cmdloop:
 	while (in_xmon) {
 		if (secondary) {
+			spin_begin();
 			if (cpu == xmon_owner) {
 				if (!test_and_set_bit(0, &xmon_taken)) {
 					secondary = 0;
+					spin_end();
 					continue;
 				}
 				/* missed it */
 				while (cpu == xmon_owner)
-					barrier();
+					spin_cpu_relax();
 			}
-			barrier();
+			spin_cpu_relax();
+			touch_nmi_watchdog();
 		} else {
 			cmd = cmds(regs);
 			if (cmd != 0) {
-- 
2.13.3

^ permalink raw reply related	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2017-09-14 11:26 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-09-12 16:05 [RFC PATCH 0/2] NMI IPI work in progress for Linux and OPAL Nicholas Piggin
2017-09-12 16:05 ` [RFC PATCH 1/2] core: implement OPAL_SIGNAL_SYSTEM_RESET with POWER9 scoms Nicholas Piggin
2017-09-12 23:18   ` Benjamin Herrenschmidt
2017-09-13 13:27     ` Nicholas Piggin
2017-09-14  2:27       ` Benjamin Herrenschmidt
2017-09-12 16:05 ` [RFC PATCH 2/2] powerpc/powernv: implement NMI IPIs with OPAL_SIGNAL_SYSTEM_RESET Nicholas Piggin
2017-09-13 13:13   ` Nicholas Piggin
2017-09-14  2:24     ` Benjamin Herrenschmidt
2017-09-14  6:32       ` Nicholas Piggin
2017-09-14  6:43         ` Alistair Popple
2017-09-14 11:26   ` Nicholas Piggin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).