linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] x86/perf: Add hardware performance events support for Zhaoxin CPU.
@ 2020-04-13  3:14 CodyYao-oc
  2020-04-15  9:36 ` Borislav Petkov
                   ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: CodyYao-oc @ 2020-04-13  3:14 UTC (permalink / raw)
  To: peterz, mingo, acme, mark.rutland, alexander.shishkin, jolsa,
	namhyung, tglx, bp, x86, hpa, linux-kernel
  Cc: cooperyan, codyyao, CodyYao-oc

Zhaoxin CPU has provided facilities for monitoring performance
via PMU(Performance Monitor Unit), but the functionality is unused so far.
Therefore, add support for zhaoxin pmu to make performance related
hardware events available.

Signed-off-by: CodyYao-oc <CodyYao-oc@zhaoxin.com>
Reported-by: kbuild test robot <lkp@intel.com>
---
 arch/x86/events/Makefile               |   2 +
 arch/x86/events/core.c                 |   4 +
 arch/x86/events/perf_event.h           |   9 +
 arch/x86/events/zhaoxin/Makefile       |   2 +
 arch/x86/events/zhaoxin/core.c         | 625 +++++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/perfctr-watchdog.c |   6 +
 6 files changed, 648 insertions(+)
 create mode 100644 arch/x86/events/zhaoxin/Makefile
 create mode 100644 arch/x86/events/zhaoxin/core.c

diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile
index 9e07f55..6f1d1fd 100644
--- a/arch/x86/events/Makefile
+++ b/arch/x86/events/Makefile
@@ -3,3 +3,5 @@ obj-y					+= core.o probe.o
 obj-y					+= amd/
 obj-$(CONFIG_X86_LOCAL_APIC)            += msr.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= intel/
+obj-$(CONFIG_CPU_SUP_CENTAUR)		+= zhaoxin/
+obj-$(CONFIG_CPU_SUP_ZHAOXIN)		+= zhaoxin/
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index a619763..9e63ee5 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1839,6 +1839,10 @@ static int __init init_hw_perf_events(void)
 		err = amd_pmu_init();
 		x86_pmu.name = "HYGON";
 		break;
+	case X86_VENDOR_ZHAOXIN:
+	case X86_VENDOR_CENTAUR:
+		err = zhaoxin_pmu_init();
+		break;
 	default:
 		err = -ENOTSUPP;
 	}
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index f1cd1ca..f6bbdca 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -1133,3 +1133,12 @@ static inline int is_ht_workaround_enabled(void)
 	return 0;
 }
 #endif /* CONFIG_CPU_SUP_INTEL */
+
+#if ((defined CONFIG_CPU_SUP_CENTAUR) || (defined CONFIG_CPU_SUP_ZHAOXIN))
+int zhaoxin_pmu_init(void);
+#else
+static inline int zhaoxin_pmu_init(void)
+{
+	return 0;
+}
+#endif /*CONFIG_CPU_SUP_CENTAUR or CONFIG_CPU_SUP_ZHAOXIN*/
diff --git a/arch/x86/events/zhaoxin/Makefile b/arch/x86/events/zhaoxin/Makefile
new file mode 100644
index 0000000..642c1174
--- /dev/null
+++ b/arch/x86/events/zhaoxin/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-y	+= core.o
diff --git a/arch/x86/events/zhaoxin/core.c b/arch/x86/events/zhaoxin/core.c
new file mode 100644
index 0000000..77ed8a6
--- /dev/null
+++ b/arch/x86/events/zhaoxin/core.c
@@ -0,0 +1,625 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Per cpu state
+ *
+ * Used to coordinate shared registers among events on a single PMU.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/stddef.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/nmi.h>
+
+#include <asm/cpufeature.h>
+#include <asm/hardirq.h>
+#include <asm/apic.h>
+
+#include "../perf_event.h"
+
+/*
+ * Zhaoxin PerfMon, used on zxc and later.
+ */
+static u64 zx_pmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = {
+
+	[PERF_COUNT_HW_CPU_CYCLES]        = 0x0082,
+	[PERF_COUNT_HW_INSTRUCTIONS]      = 0x00c0,
+	[PERF_COUNT_HW_CACHE_REFERENCES]  = 0x0515,
+	[PERF_COUNT_HW_CACHE_MISSES]      = 0x051a,
+	[PERF_COUNT_HW_BUS_CYCLES]        = 0x0083,
+};
+
+static struct event_constraint zxc_event_constraints[] __read_mostly = {
+
+	FIXED_EVENT_CONSTRAINT(0x0082, 1), /* unhalted core clock cycles */
+	EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint zxd_event_constraints[] __read_mostly = {
+
+	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* retired instructions */
+	FIXED_EVENT_CONSTRAINT(0x0082, 1), /* unhalted core clock cycles */
+	FIXED_EVENT_CONSTRAINT(0x0083, 2), /* unhalted bus clock cycles */
+	EVENT_CONSTRAINT_END
+};
+
+static __initconst const u64 zxd_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+[C(L1D)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0042,
+		[C(RESULT_MISS)] = 0x0538,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = 0x0043,
+		[C(RESULT_MISS)] = 0x0562,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(L1I)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0300,
+		[C(RESULT_MISS)] = 0x0301,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = 0x030a,
+		[C(RESULT_MISS)] = 0x030b,
+	},
+},
+[C(LL)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(DTLB)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0042,
+		[C(RESULT_MISS)] = 0x052c,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = 0x0043,
+		[C(RESULT_MISS)] = 0x0530,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = 0x0564,
+		[C(RESULT_MISS)] = 0x0565,
+	},
+},
+[C(ITLB)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x00c0,
+		[C(RESULT_MISS)] = 0x0534,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(BPU)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0700,
+		[C(RESULT_MISS)] = 0x0709,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(NODE)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+};
+
+static __initconst const u64 zxe_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+[C(L1D)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0568,
+		[C(RESULT_MISS)] = 0x054b,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = 0x0669,
+		[C(RESULT_MISS)] = 0x0562,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(L1I)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0300,
+		[C(RESULT_MISS)] = 0x0301,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = 0x030a,
+		[C(RESULT_MISS)] = 0x030b,
+	},
+},
+[C(LL)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0,
+		[C(RESULT_MISS)] = 0x0,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = 0x0,
+		[C(RESULT_MISS)] = 0x0,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = 0x0,
+		[C(RESULT_MISS)] = 0x0,
+	},
+},
+[C(DTLB)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0568,
+		[C(RESULT_MISS)] = 0x052c,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = 0x0669,
+		[C(RESULT_MISS)] = 0x0530,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = 0x0564,
+		[C(RESULT_MISS)] = 0x0565,
+	},
+},
+[C(ITLB)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x00c0,
+		[C(RESULT_MISS)] = 0x0534,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(BPU)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0028,
+		[C(RESULT_MISS)] = 0x0029,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(NODE)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+};
+
+static void zhaoxin_pmu_disable_all(void)
+{
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+}
+
+static void zhaoxin_pmu_enable_all(int added)
+{
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+}
+
+static inline u64 zhaoxin_pmu_get_status(void)
+{
+	u64 status;
+
+	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+
+	return status;
+}
+
+static inline void zhaoxin_pmu_ack_status(u64 ack)
+{
+	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
+}
+
+static inline void zxc_pmu_ack_status(u64 ack)
+{
+	/*
+	 * ZXC needs global control enabled in order to clear status bits.
+	 */
+	zhaoxin_pmu_enable_all(0);
+	zhaoxin_pmu_ack_status(ack);
+	zhaoxin_pmu_disable_all();
+}
+
+static void zhaoxin_pmu_disable_fixed(struct hw_perf_event *hwc)
+{
+	int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
+	u64 ctrl_val, mask;
+
+	mask = 0xfULL << (idx * 4);
+
+	rdmsrl(hwc->config_base, ctrl_val);
+	ctrl_val &= ~mask;
+	wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static void zhaoxin_pmu_disable_event(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+		zhaoxin_pmu_disable_fixed(hwc);
+		return;
+	}
+
+	x86_pmu_disable_event(event);
+}
+
+static void zhaoxin_pmu_enable_fixed(struct hw_perf_event *hwc)
+{
+	int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
+	u64 ctrl_val, bits, mask;
+
+	/*
+	 * Enable IRQ generation (0x8),
+	 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
+	 * if requested:
+	 */
+	bits = 0x8ULL;
+	if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
+		bits |= 0x2;
+	if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+		bits |= 0x1;
+
+	bits <<= (idx * 4);
+	mask = 0xfULL << (idx * 4);
+
+	rdmsrl(hwc->config_base, ctrl_val);
+	ctrl_val &= ~mask;
+	ctrl_val |= bits;
+	wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static void zhaoxin_pmu_enable_event(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+		zhaoxin_pmu_enable_fixed(hwc);
+		return;
+	}
+
+	__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
+}
+
+/*
+ * This handler is triggered by the local APIC, so the APIC IRQ handling
+ * rules apply:
+ */
+static int zhaoxin_pmu_handle_irq(struct pt_regs *regs)
+{
+	struct perf_sample_data data;
+	struct cpu_hw_events *cpuc;
+	int bit;
+	u64 status;
+	bool is_zxc = false;
+	int handled = 0;
+
+	cpuc = this_cpu_ptr(&cpu_hw_events);
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+	zhaoxin_pmu_disable_all();
+	status = zhaoxin_pmu_get_status();
+	if (!status)
+		goto done;
+
+	if (boot_cpu_data.x86 == 0x06 &&
+		(boot_cpu_data.x86_model == 0x0f ||
+			boot_cpu_data.x86_model == 0x19))
+		is_zxc = true;
+again:
+
+	/*Clearing status works only if the global control is enable on zxc.*/
+	if (is_zxc)
+		zxc_pmu_ack_status(status);
+	else
+		zhaoxin_pmu_ack_status(status);
+
+	inc_irq_stat(apic_perf_irqs);
+
+	/*
+	 * CondChgd bit 63 doesn't mean any overflow status. Ignore
+	 * and clear the bit.
+	 */
+	if (__test_and_clear_bit(63, (unsigned long *)&status)) {
+		if (!status)
+			goto done;
+	}
+
+	for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
+		struct perf_event *event = cpuc->events[bit];
+
+		handled++;
+
+		if (!test_bit(bit, cpuc->active_mask))
+			continue;
+
+		x86_perf_event_update(event);
+		perf_sample_data_init(&data, 0, event->hw.last_period);
+
+		if (!x86_perf_event_set_period(event))
+			continue;
+
+		if (perf_event_overflow(event, &data, regs))
+			x86_pmu_stop(event, 0);
+	}
+
+	/*
+	 * Repeat if there is more work to be done:
+	 */
+	status = zhaoxin_pmu_get_status();
+	if (status)
+		goto again;
+
+done:
+	zhaoxin_pmu_enable_all(0);
+	return handled;
+}
+
+static u64 zhaoxin_pmu_event_map(int hw_event)
+{
+	return zx_pmon_event_map[hw_event];
+}
+
+static struct event_constraint *
+zhaoxin_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+			struct perf_event *event)
+{
+	struct event_constraint *c;
+
+	if (x86_pmu.event_constraints) {
+		for_each_event_constraint(c, x86_pmu.event_constraints) {
+			if ((event->hw.config & c->cmask) == c->code)
+				return c;
+		}
+	}
+
+	return &unconstrained;
+}
+
+PMU_FORMAT_ATTR(event,	"config:0-7");
+PMU_FORMAT_ATTR(umask,	"config:8-15");
+PMU_FORMAT_ATTR(edge,	"config:18");
+PMU_FORMAT_ATTR(inv,	"config:23");
+PMU_FORMAT_ATTR(cmask,	"config:24-31");
+
+static struct attribute *zx_arch_formats_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_umask.attr,
+	&format_attr_edge.attr,
+	&format_attr_inv.attr,
+	&format_attr_cmask.attr,
+	NULL,
+};
+
+static ssize_t zhaoxin_event_sysfs_show(char *page, u64 config)
+{
+	u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT);
+
+	return x86_event_sysfs_show(page, config, event);
+}
+
+static const struct x86_pmu zhaoxin_pmu __initconst = {
+	.name			= "zhaoxin",
+	.handle_irq		= zhaoxin_pmu_handle_irq,
+	.disable_all		= zhaoxin_pmu_disable_all,
+	.enable_all		= zhaoxin_pmu_enable_all,
+	.enable			= zhaoxin_pmu_enable_event,
+	.disable		= zhaoxin_pmu_disable_event,
+	.hw_config		= x86_pmu_hw_config,
+	.schedule_events	= x86_schedule_events,
+	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
+	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
+	.event_map		= zhaoxin_pmu_event_map,
+	.max_events		= ARRAY_SIZE(zx_pmon_event_map),
+	.apic			= 1,
+	/*
+	 * For zxd/zxe, read/write operation for PMCx MSR is 48 bits.
+	 */
+	.max_period		= (1ULL << 47) - 1,
+	.get_event_constraints	= zhaoxin_get_event_constraints,
+
+	.format_attrs		= zx_arch_formats_attr,
+	.events_sysfs_show	= zhaoxin_event_sysfs_show,
+};
+
+static const struct { int id; char *name; } zx_arch_events_map[] __initconst = {
+	{ PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" },
+	{ PERF_COUNT_HW_INSTRUCTIONS, "instructions" },
+	{ PERF_COUNT_HW_BUS_CYCLES, "bus cycles" },
+	{ PERF_COUNT_HW_CACHE_REFERENCES, "cache references" },
+	{ PERF_COUNT_HW_CACHE_MISSES, "cache misses" },
+	{ PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" },
+	{ PERF_COUNT_HW_BRANCH_MISSES, "branch misses" },
+};
+
+static __init void zhaoxin_arch_events_quirk(void)
+{
+	int bit;
+
+	/* disable event that reported as not presend by cpuid */
+	for_each_set_bit(bit, x86_pmu.events_mask,
+			ARRAY_SIZE(zx_arch_events_map)) {
+
+		zx_pmon_event_map[zx_arch_events_map[bit].id] = 0;
+		pr_warn("CPUID marked event: \'%s\' unavailable\n",
+				zx_arch_events_map[bit].name);
+	}
+}
+
+__init int zhaoxin_pmu_init(void)
+{
+	union cpuid10_edx edx;
+	union cpuid10_eax eax;
+	union cpuid10_ebx ebx;
+	struct event_constraint *c;
+	unsigned int unused;
+	int version;
+
+	pr_info("Welcome to zhaoxin pmu!\n");
+
+	/*
+	 * Check whether the Architectural PerfMon supports
+	 * hw_event or not.
+	 */
+	cpuid(10, &eax.full, &ebx.full, &unused, &edx.full);
+
+	if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT - 1)
+		return -ENODEV;
+
+	version = eax.split.version_id;
+	if (version == 2) {
+		x86_pmu = zhaoxin_pmu;
+		pr_info("Version check pass!\n");
+	} else
+		return -ENODEV;
+
+	x86_pmu.version			= version;
+	x86_pmu.num_counters		= eax.split.num_counters;
+	x86_pmu.cntval_bits		= eax.split.bit_width;
+	x86_pmu.cntval_mask		= (1ULL << eax.split.bit_width) - 1;
+	x86_pmu.events_maskl		= ebx.full;
+	x86_pmu.events_mask_len		= eax.split.mask_length;
+
+	x86_pmu.num_counters_fixed = edx.split.num_counters_fixed;
+	x86_add_quirk(zhaoxin_arch_events_quirk);
+
+	switch (boot_cpu_data.x86) {
+	case 0x06:
+		if (boot_cpu_data.x86_model == 0x0f ||
+			boot_cpu_data.x86_model == 0x19) {
+
+			x86_pmu.max_period = x86_pmu.cntval_mask >> 1;
+
+			x86_pmu.event_constraints = zxc_event_constraints;
+			zx_pmon_event_map[PERF_COUNT_HW_INSTRUCTIONS] = 0;
+			zx_pmon_event_map[PERF_COUNT_HW_CACHE_REFERENCES] = 0;
+			zx_pmon_event_map[PERF_COUNT_HW_CACHE_MISSES] = 0;
+			zx_pmon_event_map[PERF_COUNT_HW_BUS_CYCLES] = 0;
+
+			pr_cont("ZXC events, ");
+		} else
+			return -ENODEV;
+		break;
+	case 0x07:
+		zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+		X86_CONFIG(.event = 0x01, .umask = 0x01, .inv = 0x01, .cmask = 0x01);
+
+		zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+		X86_CONFIG(.event = 0x0f, .umask = 0x04, .inv = 0, .cmask = 0);
+
+		switch (boot_cpu_data.x86_model) {
+		case 0x1b:
+			memcpy(hw_cache_event_ids, zxd_hw_cache_event_ids,
+				sizeof(hw_cache_event_ids));
+
+			x86_pmu.event_constraints = zxd_event_constraints;
+
+			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]
+				= 0x0700;
+			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES]
+				= 0x0709;
+
+			pr_cont("ZXD events, ");
+			break;
+		case 0x3b:
+			memcpy(hw_cache_event_ids, zxe_hw_cache_event_ids,
+				sizeof(hw_cache_event_ids));
+
+			x86_pmu.event_constraints = zxd_event_constraints;
+
+			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]
+				= 0x0028;
+			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES]
+				= 0x0029;
+
+			pr_cont("ZXE events, ");
+			break;
+		default:
+			return -ENODEV;
+		}
+		break;
+	default:
+		return -ENODEV;
+	}
+
+	x86_pmu.intel_ctrl = (1 << (x86_pmu.num_counters)) - 1;
+	x86_pmu.intel_ctrl |=
+		((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
+
+	if (x86_pmu.event_constraints) {
+		for_each_event_constraint(c, x86_pmu.event_constraints) {
+			c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
+			c->weight += x86_pmu.num_counters;
+		}
+	}
+
+	return 0;
+}
+
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 9556930..63a5828 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -63,6 +63,9 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
 		case 15:
 			return msr - MSR_P4_BPU_PERFCTR0;
 		}
+	case X86_VENDOR_ZHAOXIN:
+	case X86_VENDOR_CENTAUR:
+		return msr - MSR_ARCH_PERFMON_PERFCTR0;
 	}
 	return 0;
 }
@@ -92,6 +95,9 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
 		case 15:
 			return msr - MSR_P4_BSU_ESCR0;
 		}
+	case X86_VENDOR_ZHAOXIN:
+	case X86_VENDOR_CENTAUR:
+		return msr - MSR_ARCH_PERFMON_EVENTSEL0;
 	}
 	return 0;
 
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/perf: Add hardware performance events support for Zhaoxin CPU.
  2020-04-13  3:14 [PATCH] x86/perf: Add hardware performance events support for Zhaoxin CPU CodyYao-oc
@ 2020-04-15  9:36 ` Borislav Petkov
  2020-04-16  6:16   ` CodyYao-oc
  2020-04-15 10:23 ` Peter Zijlstra
  2020-05-01 18:22 ` [tip: perf/core] " tip-bot2 for CodyYao-oc
  2 siblings, 1 reply; 15+ messages in thread
From: Borislav Petkov @ 2020-04-15  9:36 UTC (permalink / raw)
  To: CodyYao-oc
  Cc: peterz, mingo, acme, mark.rutland, alexander.shishkin, jolsa,
	namhyung, tglx, x86, hpa, linux-kernel, cooperyan, codyyao

On Mon, Apr 13, 2020 at 11:14:29AM +0800, CodyYao-oc wrote:
> Zhaoxin CPU has provided facilities for monitoring performance
> via PMU(Performance Monitor Unit), but the functionality is unused so far.
> Therefore, add support for zhaoxin pmu to make performance related
> hardware events available.
> 
> Signed-off-by: CodyYao-oc <CodyYao-oc@zhaoxin.com>
> Reported-by: kbuild test robot <lkp@intel.com>

What exactly did the 0day bot report?

Put that in []

above the Reported-by line pls.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/perf: Add hardware performance events support for Zhaoxin CPU.
  2020-04-13  3:14 [PATCH] x86/perf: Add hardware performance events support for Zhaoxin CPU CodyYao-oc
  2020-04-15  9:36 ` Borislav Petkov
@ 2020-04-15 10:23 ` Peter Zijlstra
  2020-04-15 10:31   ` Peter Zijlstra
  2020-04-16  6:26   ` CodyYao-oc
  2020-05-01 18:22 ` [tip: perf/core] " tip-bot2 for CodyYao-oc
  2 siblings, 2 replies; 15+ messages in thread
From: Peter Zijlstra @ 2020-04-15 10:23 UTC (permalink / raw)
  To: CodyYao-oc
  Cc: mingo, acme, mark.rutland, alexander.shishkin, jolsa, namhyung,
	tglx, bp, x86, hpa, linux-kernel, cooperyan, codyyao

On Mon, Apr 13, 2020 at 11:14:29AM +0800, CodyYao-oc wrote:
> Zhaoxin CPU has provided facilities for monitoring performance
> via PMU(Performance Monitor Unit), but the functionality is unused so far.
> Therefore, add support for zhaoxin pmu to make performance related
> hardware events available.
> 
> Signed-off-by: CodyYao-oc <CodyYao-oc@zhaoxin.com>
> Reported-by: kbuild test robot <lkp@intel.com>

What's that reported-by thing? Did the robot complain you didn't have a
PMU implementation?

Anyway, I've made the below changes to the patch.

---
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -618,6 +618,7 @@ struct x86_pmu {
 
 	/* PMI handler bits */
 	unsigned int	late_ack		:1,
+			enabled_ack		:1,
 			counter_freezing	:1;
 	/*
 	 * sysfs attrs
--- a/arch/x86/events/zhaoxin/core.c
+++ b/arch/x86/events/zhaoxin/core.c
@@ -357,10 +357,9 @@ static int zhaoxin_pmu_handle_irq(struct
 {
 	struct perf_sample_data data;
 	struct cpu_hw_events *cpuc;
-	int bit;
-	u64 status;
-	bool is_zxc = false;
 	int handled = 0;
+	u64 status;
+	int bit;
 
 	cpuc = this_cpu_ptr(&cpu_hw_events);
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
@@ -369,14 +368,8 @@ static int zhaoxin_pmu_handle_irq(struct
 	if (!status)
 		goto done;
 
-	if (boot_cpu_data.x86 == 0x06 &&
-		(boot_cpu_data.x86_model == 0x0f ||
-			boot_cpu_data.x86_model == 0x19))
-		is_zxc = true;
 again:
-
-	/*Clearing status works only if the global control is enable on zxc.*/
-	if (is_zxc)
+	if (x86_pmu.enabled_ack)
 		zxc_pmu_ack_status(status);
 	else
 		zhaoxin_pmu_ack_status(status);
@@ -504,12 +497,10 @@ static __init void zhaoxin_arch_events_q
 	int bit;
 
 	/* disable event that reported as not presend by cpuid */
-	for_each_set_bit(bit, x86_pmu.events_mask,
-			ARRAY_SIZE(zx_arch_events_map)) {
-
+	for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(zx_arch_events_map)) {
 		zx_pmon_event_map[zx_arch_events_map[bit].id] = 0;
 		pr_warn("CPUID marked event: \'%s\' unavailable\n",
-				zx_arch_events_map[bit].name);
+			zx_arch_events_map[bit].name);
 	}
 }
 
@@ -534,12 +525,12 @@ __init int zhaoxin_pmu_init(void)
 		return -ENODEV;
 
 	version = eax.split.version_id;
-	if (version == 2) {
-		x86_pmu = zhaoxin_pmu;
-		pr_info("Version check pass!\n");
-	} else
+	if (version != 2)
 		return -ENODEV;
 
+	x86_pmu = zhaoxin_pmu;
+	pr_info("Version check pass!\n");
+
 	x86_pmu.version			= version;
 	x86_pmu.num_counters		= eax.split.num_counters;
 	x86_pmu.cntval_bits		= eax.split.bit_width;
@@ -552,11 +543,13 @@ __init int zhaoxin_pmu_init(void)
 
 	switch (boot_cpu_data.x86) {
 	case 0x06:
-		if (boot_cpu_data.x86_model == 0x0f ||
-			boot_cpu_data.x86_model == 0x19) {
+		if (boot_cpu_data.x86_model == 0x0f || boot_cpu_data.x86_model == 0x19) {
 
 			x86_pmu.max_period = x86_pmu.cntval_mask >> 1;
 
+			/* Clearing status works only if the global control is enable on zxc. */
+			x86_pmu.enabled_ack = 1;
+
 			x86_pmu.event_constraints = zxc_event_constraints;
 			zx_pmon_event_map[PERF_COUNT_HW_INSTRUCTIONS] = 0;
 			zx_pmon_event_map[PERF_COUNT_HW_CACHE_REFERENCES] = 0;
@@ -564,40 +557,37 @@ __init int zhaoxin_pmu_init(void)
 			zx_pmon_event_map[PERF_COUNT_HW_BUS_CYCLES] = 0;
 
 			pr_cont("ZXC events, ");
-		} else
-			return -ENODEV;
-		break;
+			break;
+		}
+		return -ENODEV;
+
 	case 0x07:
 		zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
-		X86_CONFIG(.event = 0x01, .umask = 0x01, .inv = 0x01, .cmask = 0x01);
+			X86_CONFIG(.event = 0x01, .umask = 0x01, .inv = 0x01, .cmask = 0x01);
 
 		zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
-		X86_CONFIG(.event = 0x0f, .umask = 0x04, .inv = 0, .cmask = 0);
+			X86_CONFIG(.event = 0x0f, .umask = 0x04, .inv = 0, .cmask = 0);
 
 		switch (boot_cpu_data.x86_model) {
 		case 0x1b:
 			memcpy(hw_cache_event_ids, zxd_hw_cache_event_ids,
-				sizeof(hw_cache_event_ids));
+			       sizeof(hw_cache_event_ids));
 
 			x86_pmu.event_constraints = zxd_event_constraints;
 
-			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]
-				= 0x0700;
-			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES]
-				= 0x0709;
+			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0700;
+			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x0709;
 
 			pr_cont("ZXD events, ");
 			break;
 		case 0x3b:
 			memcpy(hw_cache_event_ids, zxe_hw_cache_event_ids,
-				sizeof(hw_cache_event_ids));
+			       sizeof(hw_cache_event_ids));
 
 			x86_pmu.event_constraints = zxd_event_constraints;
 
-			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]
-				= 0x0028;
-			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES]
-				= 0x0029;
+			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0028;
+			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x0029;
 
 			pr_cont("ZXE events, ");
 			break;
@@ -605,13 +595,13 @@ __init int zhaoxin_pmu_init(void)
 			return -ENODEV;
 		}
 		break;
+
 	default:
 		return -ENODEV;
 	}
 
 	x86_pmu.intel_ctrl = (1 << (x86_pmu.num_counters)) - 1;
-	x86_pmu.intel_ctrl |=
-		((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
+	x86_pmu.intel_ctrl |= ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
 
 	if (x86_pmu.event_constraints) {
 		for_each_event_constraint(c, x86_pmu.event_constraints) {
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -63,6 +63,7 @@ static inline unsigned int nmi_perfctr_m
 		case 15:
 			return msr - MSR_P4_BPU_PERFCTR0;
 		}
+		fallthrough;
 	case X86_VENDOR_ZHAOXIN:
 	case X86_VENDOR_CENTAUR:
 		return msr - MSR_ARCH_PERFMON_PERFCTR0;
@@ -95,6 +96,7 @@ static inline unsigned int nmi_evntsel_m
 		case 15:
 			return msr - MSR_P4_BSU_ESCR0;
 		}
+		fallthrough;
 	case X86_VENDOR_ZHAOXIN:
 	case X86_VENDOR_CENTAUR:
 		return msr - MSR_ARCH_PERFMON_EVENTSEL0;

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/perf: Add hardware performance events support for Zhaoxin CPU.
  2020-04-15 10:23 ` Peter Zijlstra
@ 2020-04-15 10:31   ` Peter Zijlstra
  2020-04-16  7:36     ` CodyYao-oc
  2020-05-06 12:55     ` CodyYao-oc
  2020-04-16  6:26   ` CodyYao-oc
  1 sibling, 2 replies; 15+ messages in thread
From: Peter Zijlstra @ 2020-04-15 10:31 UTC (permalink / raw)
  To: CodyYao-oc
  Cc: mingo, acme, mark.rutland, alexander.shishkin, jolsa, namhyung,
	tglx, bp, x86, hpa, linux-kernel, cooperyan, codyyao

On Wed, Apr 15, 2020 at 12:23:40PM +0200, Peter Zijlstra wrote:
> On Mon, Apr 13, 2020 at 11:14:29AM +0800, CodyYao-oc wrote:
> > Zhaoxin CPU has provided facilities for monitoring performance
> > via PMU(Performance Monitor Unit), but the functionality is unused so far.
> > Therefore, add support for zhaoxin pmu to make performance related
> > hardware events available.
> > 
> > Signed-off-by: CodyYao-oc <CodyYao-oc@zhaoxin.com>
> > Reported-by: kbuild test robot <lkp@intel.com>
> 
> What's that reported-by thing? Did the robot complain you didn't have a
> PMU implementation?
> 
> Anyway, I've made the below changes to the patch.

In whole, the patch now looks like this. I've also added that event
table you provided last time to the Changelog.

---
Subject: x86/perf: Add hardware performance events support for Zhaoxin CPU.
From: CodyYao-oc <CodyYao-oc@zhaoxin.com>
Date: Mon, 13 Apr 2020 11:14:29 +0800

From: CodyYao-oc <CodyYao-oc@zhaoxin.com>

Zhaoxin CPU has provided facilities for monitoring performance
via PMU (Performance Monitor Unit), but the functionality is unused so far.
Therefore, add support for zhaoxin pmu to make performance related
hardware events available.

The PMU is mostly an Intel Architectural PerfMon-v2 with a novel
errata for the ZXC line. It supports the following events:

  -----------------------------------------------------------------------------------------------------------------------------------
  Event                      | Event  | Umask |          Description
			     | Select |       |
  -----------------------------------------------------------------------------------------------------------------------------------
  cpu-cycles                 |  82h   |  00h  | unhalt core clock
  instructions               |  00h   |  00h  | number of instructions at retirement.
  cache-references           |  15h   |  05h  | number of fillq pushs at the current cycle.
  cache-misses               |  1ah   |  05h  | number of l2 miss pushed by fillq.
  branch-instructions        |  28h   |  00h  | counts the number of branch instructions retired.
  branch-misses              |  29h   |  00h  | mispredicted branch instructions at retirement.
  bus-cycles                 |  83h   |  00h  | unhalt bus clock
  stalled-cycles-frontend    |  01h   |  01h  | Increments each cycle the # of Uops issued by the RAT to RS.
  stalled-cycles-backend     |  0fh   |  04h  | RS0/1/2/3/45 empty
  L1-dcache-loads            |  68h   |  05h  | number of retire/commit load.
  L1-dcache-load-misses      |  4bh   |  05h  | retired load uops whose data source followed an L1 miss.
  L1-dcache-stores           |  69h   |  06h  | number of retire/commit Store,no LEA
  L1-dcache-store-misses     |  62h   |  05h  | cache lines in M state evicted out of L1D due to Snoop HitM or dirty line replacement.
  L1-icache-loads            |  00h   |  03h  | number of l1i cache access for valid normal fetch,including un-cacheable access.
  L1-icache-load-misses      |  01h   |  03h  | number of l1i cache miss for valid normal fetch,including un-cacheable miss.
  L1-icache-prefetches       |  0ah   |  03h  | number of prefetch.
  L1-icache-prefetch-misses  |  0bh   |  03h  | number of prefetch miss.
  dTLB-loads                 |  68h   |  05h  | number of retire/commit load
  dTLB-load-misses           |  2ch   |  05h  | number of load operations miss all level tlbs and cause a tablewalk.
  dTLB-stores                |  69h   |  06h  | number of retire/commit Store,no LEA
  dTLB-store-misses          |  30h   |  05h  | number of store operations miss all level tlbs and cause a tablewalk.
  dTLB-prefetches            |  64h   |  05h  | number of hardware pte prefetch requests dispatched out of the prefetch FIFO.
  dTLB-prefetch-misses       |  65h   |  05h  | number of hardware pte prefetch requests miss the l1d data cache.
  iTLB-load                  |  00h   |  00h  | actually counter instructions.
  iTLB-load-misses           |  34h   |  05h  | number of code operations miss all level tlbs and cause a tablewalk.
  -----------------------------------------------------------------------------------------------------------------------------------

Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: CodyYao-oc <CodyYao-oc@zhaoxin.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1586747669-4827-1-git-send-email-CodyYao-oc@zhaoxin.com
---
 arch/x86/events/Makefile               |    2 
 arch/x86/events/core.c                 |    4 
 arch/x86/events/perf_event.h           |   10 
 arch/x86/events/zhaoxin/Makefile       |    2 
 arch/x86/events/zhaoxin/core.c         |  613 +++++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/perfctr-watchdog.c |    8 
 6 files changed, 639 insertions(+)
 create mode 100644 arch/x86/events/zhaoxin/Makefile
 create mode 100644 arch/x86/events/zhaoxin/core.c

--- a/arch/x86/events/Makefile
+++ b/arch/x86/events/Makefile
@@ -3,3 +3,5 @@ obj-y					+= core.o probe.o
 obj-y					+= amd/
 obj-$(CONFIG_X86_LOCAL_APIC)            += msr.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= intel/
+obj-$(CONFIG_CPU_SUP_CENTAUR)		+= zhaoxin/
+obj-$(CONFIG_CPU_SUP_ZHAOXIN)		+= zhaoxin/
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1839,6 +1839,10 @@ static int __init init_hw_perf_events(vo
 		err = amd_pmu_init();
 		x86_pmu.name = "HYGON";
 		break;
+	case X86_VENDOR_ZHAOXIN:
+	case X86_VENDOR_CENTAUR:
+		err = zhaoxin_pmu_init();
+		break;
 	default:
 		err = -ENOTSUPP;
 	}
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -618,6 +618,7 @@ struct x86_pmu {
 
 	/* PMI handler bits */
 	unsigned int	late_ack		:1,
+			enabled_ack		:1,
 			counter_freezing	:1;
 	/*
 	 * sysfs attrs
@@ -1133,3 +1134,12 @@ static inline int is_ht_workaround_enabl
 	return 0;
 }
 #endif /* CONFIG_CPU_SUP_INTEL */
+
+#if ((defined CONFIG_CPU_SUP_CENTAUR) || (defined CONFIG_CPU_SUP_ZHAOXIN))
+int zhaoxin_pmu_init(void);
+#else
+static inline int zhaoxin_pmu_init(void)
+{
+	return 0;
+}
+#endif /*CONFIG_CPU_SUP_CENTAUR or CONFIG_CPU_SUP_ZHAOXIN*/
--- /dev/null
+++ b/arch/x86/events/zhaoxin/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-y	+= core.o
--- /dev/null
+++ b/arch/x86/events/zhaoxin/core.c
@@ -0,0 +1,613 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Zhoaxin PMU; like Intel Architectural PerfMon-v2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/stddef.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/nmi.h>
+
+#include <asm/cpufeature.h>
+#include <asm/hardirq.h>
+#include <asm/apic.h>
+
+#include "../perf_event.h"
+
+/*
+ * Zhaoxin PerfMon, used on zxc and later.
+ */
+static u64 zx_pmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = {
+
+	[PERF_COUNT_HW_CPU_CYCLES]        = 0x0082,
+	[PERF_COUNT_HW_INSTRUCTIONS]      = 0x00c0,
+	[PERF_COUNT_HW_CACHE_REFERENCES]  = 0x0515,
+	[PERF_COUNT_HW_CACHE_MISSES]      = 0x051a,
+	[PERF_COUNT_HW_BUS_CYCLES]        = 0x0083,
+};
+
+static struct event_constraint zxc_event_constraints[] __read_mostly = {
+
+	FIXED_EVENT_CONSTRAINT(0x0082, 1), /* unhalted core clock cycles */
+	EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint zxd_event_constraints[] __read_mostly = {
+
+	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* retired instructions */
+	FIXED_EVENT_CONSTRAINT(0x0082, 1), /* unhalted core clock cycles */
+	FIXED_EVENT_CONSTRAINT(0x0083, 2), /* unhalted bus clock cycles */
+	EVENT_CONSTRAINT_END
+};
+
+static __initconst const u64 zxd_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+[C(L1D)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0042,
+		[C(RESULT_MISS)] = 0x0538,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = 0x0043,
+		[C(RESULT_MISS)] = 0x0562,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(L1I)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0300,
+		[C(RESULT_MISS)] = 0x0301,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = 0x030a,
+		[C(RESULT_MISS)] = 0x030b,
+	},
+},
+[C(LL)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(DTLB)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0042,
+		[C(RESULT_MISS)] = 0x052c,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = 0x0043,
+		[C(RESULT_MISS)] = 0x0530,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = 0x0564,
+		[C(RESULT_MISS)] = 0x0565,
+	},
+},
+[C(ITLB)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x00c0,
+		[C(RESULT_MISS)] = 0x0534,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(BPU)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0700,
+		[C(RESULT_MISS)] = 0x0709,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(NODE)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+};
+
+static __initconst const u64 zxe_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+[C(L1D)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0568,
+		[C(RESULT_MISS)] = 0x054b,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = 0x0669,
+		[C(RESULT_MISS)] = 0x0562,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(L1I)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0300,
+		[C(RESULT_MISS)] = 0x0301,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = 0x030a,
+		[C(RESULT_MISS)] = 0x030b,
+	},
+},
+[C(LL)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0,
+		[C(RESULT_MISS)] = 0x0,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = 0x0,
+		[C(RESULT_MISS)] = 0x0,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = 0x0,
+		[C(RESULT_MISS)] = 0x0,
+	},
+},
+[C(DTLB)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0568,
+		[C(RESULT_MISS)] = 0x052c,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = 0x0669,
+		[C(RESULT_MISS)] = 0x0530,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = 0x0564,
+		[C(RESULT_MISS)] = 0x0565,
+	},
+},
+[C(ITLB)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x00c0,
+		[C(RESULT_MISS)] = 0x0534,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(BPU)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0028,
+		[C(RESULT_MISS)] = 0x0029,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(NODE)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+};
+
+static void zhaoxin_pmu_disable_all(void)
+{
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+}
+
+static void zhaoxin_pmu_enable_all(int added)
+{
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+}
+
+static inline u64 zhaoxin_pmu_get_status(void)
+{
+	u64 status;
+
+	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+
+	return status;
+}
+
+static inline void zhaoxin_pmu_ack_status(u64 ack)
+{
+	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
+}
+
+static inline void zxc_pmu_ack_status(u64 ack)
+{
+	/*
+	 * ZXC needs global control enabled in order to clear status bits.
+	 */
+	zhaoxin_pmu_enable_all(0);
+	zhaoxin_pmu_ack_status(ack);
+	zhaoxin_pmu_disable_all();
+}
+
+static void zhaoxin_pmu_disable_fixed(struct hw_perf_event *hwc)
+{
+	int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
+	u64 ctrl_val, mask;
+
+	mask = 0xfULL << (idx * 4);
+
+	rdmsrl(hwc->config_base, ctrl_val);
+	ctrl_val &= ~mask;
+	wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static void zhaoxin_pmu_disable_event(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+		zhaoxin_pmu_disable_fixed(hwc);
+		return;
+	}
+
+	x86_pmu_disable_event(event);
+}
+
+static void zhaoxin_pmu_enable_fixed(struct hw_perf_event *hwc)
+{
+	int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
+	u64 ctrl_val, bits, mask;
+
+	/*
+	 * Enable IRQ generation (0x8),
+	 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
+	 * if requested:
+	 */
+	bits = 0x8ULL;
+	if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
+		bits |= 0x2;
+	if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+		bits |= 0x1;
+
+	bits <<= (idx * 4);
+	mask = 0xfULL << (idx * 4);
+
+	rdmsrl(hwc->config_base, ctrl_val);
+	ctrl_val &= ~mask;
+	ctrl_val |= bits;
+	wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static void zhaoxin_pmu_enable_event(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+		zhaoxin_pmu_enable_fixed(hwc);
+		return;
+	}
+
+	__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
+}
+
+/*
+ * This handler is triggered by the local APIC, so the APIC IRQ handling
+ * rules apply:
+ */
+static int zhaoxin_pmu_handle_irq(struct pt_regs *regs)
+{
+	struct perf_sample_data data;
+	struct cpu_hw_events *cpuc;
+	int handled = 0;
+	u64 status;
+	int bit;
+
+	cpuc = this_cpu_ptr(&cpu_hw_events);
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+	zhaoxin_pmu_disable_all();
+	status = zhaoxin_pmu_get_status();
+	if (!status)
+		goto done;
+
+again:
+	if (x86_pmu.enabled_ack)
+		zxc_pmu_ack_status(status);
+	else
+		zhaoxin_pmu_ack_status(status);
+
+	inc_irq_stat(apic_perf_irqs);
+
+	/*
+	 * CondChgd bit 63 doesn't mean any overflow status. Ignore
+	 * and clear the bit.
+	 */
+	if (__test_and_clear_bit(63, (unsigned long *)&status)) {
+		if (!status)
+			goto done;
+	}
+
+	for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
+		struct perf_event *event = cpuc->events[bit];
+
+		handled++;
+
+		if (!test_bit(bit, cpuc->active_mask))
+			continue;
+
+		x86_perf_event_update(event);
+		perf_sample_data_init(&data, 0, event->hw.last_period);
+
+		if (!x86_perf_event_set_period(event))
+			continue;
+
+		if (perf_event_overflow(event, &data, regs))
+			x86_pmu_stop(event, 0);
+	}
+
+	/*
+	 * Repeat if there is more work to be done:
+	 */
+	status = zhaoxin_pmu_get_status();
+	if (status)
+		goto again;
+
+done:
+	zhaoxin_pmu_enable_all(0);
+	return handled;
+}
+
+static u64 zhaoxin_pmu_event_map(int hw_event)
+{
+	return zx_pmon_event_map[hw_event];
+}
+
+static struct event_constraint *
+zhaoxin_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+			struct perf_event *event)
+{
+	struct event_constraint *c;
+
+	if (x86_pmu.event_constraints) {
+		for_each_event_constraint(c, x86_pmu.event_constraints) {
+			if ((event->hw.config & c->cmask) == c->code)
+				return c;
+		}
+	}
+
+	return &unconstrained;
+}
+
+PMU_FORMAT_ATTR(event,	"config:0-7");
+PMU_FORMAT_ATTR(umask,	"config:8-15");
+PMU_FORMAT_ATTR(edge,	"config:18");
+PMU_FORMAT_ATTR(inv,	"config:23");
+PMU_FORMAT_ATTR(cmask,	"config:24-31");
+
+static struct attribute *zx_arch_formats_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_umask.attr,
+	&format_attr_edge.attr,
+	&format_attr_inv.attr,
+	&format_attr_cmask.attr,
+	NULL,
+};
+
+static ssize_t zhaoxin_event_sysfs_show(char *page, u64 config)
+{
+	u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT);
+
+	return x86_event_sysfs_show(page, config, event);
+}
+
+static const struct x86_pmu zhaoxin_pmu __initconst = {
+	.name			= "zhaoxin",
+	.handle_irq		= zhaoxin_pmu_handle_irq,
+	.disable_all		= zhaoxin_pmu_disable_all,
+	.enable_all		= zhaoxin_pmu_enable_all,
+	.enable			= zhaoxin_pmu_enable_event,
+	.disable		= zhaoxin_pmu_disable_event,
+	.hw_config		= x86_pmu_hw_config,
+	.schedule_events	= x86_schedule_events,
+	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
+	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
+	.event_map		= zhaoxin_pmu_event_map,
+	.max_events		= ARRAY_SIZE(zx_pmon_event_map),
+	.apic			= 1,
+	/*
+	 * For zxd/zxe, read/write operation for PMCx MSR is 48 bits.
+	 */
+	.max_period		= (1ULL << 47) - 1,
+	.get_event_constraints	= zhaoxin_get_event_constraints,
+
+	.format_attrs		= zx_arch_formats_attr,
+	.events_sysfs_show	= zhaoxin_event_sysfs_show,
+};
+
+static const struct { int id; char *name; } zx_arch_events_map[] __initconst = {
+	{ PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" },
+	{ PERF_COUNT_HW_INSTRUCTIONS, "instructions" },
+	{ PERF_COUNT_HW_BUS_CYCLES, "bus cycles" },
+	{ PERF_COUNT_HW_CACHE_REFERENCES, "cache references" },
+	{ PERF_COUNT_HW_CACHE_MISSES, "cache misses" },
+	{ PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" },
+	{ PERF_COUNT_HW_BRANCH_MISSES, "branch misses" },
+};
+
+static __init void zhaoxin_arch_events_quirk(void)
+{
+	int bit;
+
+	/* disable event that reported as not presend by cpuid */
+	for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(zx_arch_events_map)) {
+		zx_pmon_event_map[zx_arch_events_map[bit].id] = 0;
+		pr_warn("CPUID marked event: \'%s\' unavailable\n",
+			zx_arch_events_map[bit].name);
+	}
+}
+
+__init int zhaoxin_pmu_init(void)
+{
+	union cpuid10_edx edx;
+	union cpuid10_eax eax;
+	union cpuid10_ebx ebx;
+	struct event_constraint *c;
+	unsigned int unused;
+	int version;
+
+	pr_info("Welcome to zhaoxin pmu!\n");
+
+	/*
+	 * Check whether the Architectural PerfMon supports
+	 * hw_event or not.
+	 */
+	cpuid(10, &eax.full, &ebx.full, &unused, &edx.full);
+
+	if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT - 1)
+		return -ENODEV;
+
+	version = eax.split.version_id;
+	if (version != 2)
+		return -ENODEV;
+
+	x86_pmu = zhaoxin_pmu;
+	pr_info("Version check pass!\n");
+
+	x86_pmu.version			= version;
+	x86_pmu.num_counters		= eax.split.num_counters;
+	x86_pmu.cntval_bits		= eax.split.bit_width;
+	x86_pmu.cntval_mask		= (1ULL << eax.split.bit_width) - 1;
+	x86_pmu.events_maskl		= ebx.full;
+	x86_pmu.events_mask_len		= eax.split.mask_length;
+
+	x86_pmu.num_counters_fixed = edx.split.num_counters_fixed;
+	x86_add_quirk(zhaoxin_arch_events_quirk);
+
+	switch (boot_cpu_data.x86) {
+	case 0x06:
+		if (boot_cpu_data.x86_model == 0x0f || boot_cpu_data.x86_model == 0x19) {
+
+			x86_pmu.max_period = x86_pmu.cntval_mask >> 1;
+
+			/* Clearing status works only if the global control is enable on zxc. */
+			x86_pmu.enabled_ack = 1;
+
+			x86_pmu.event_constraints = zxc_event_constraints;
+			zx_pmon_event_map[PERF_COUNT_HW_INSTRUCTIONS] = 0;
+			zx_pmon_event_map[PERF_COUNT_HW_CACHE_REFERENCES] = 0;
+			zx_pmon_event_map[PERF_COUNT_HW_CACHE_MISSES] = 0;
+			zx_pmon_event_map[PERF_COUNT_HW_BUS_CYCLES] = 0;
+
+			pr_cont("ZXC events, ");
+			break;
+		}
+		return -ENODEV;
+
+	case 0x07:
+		zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+			X86_CONFIG(.event = 0x01, .umask = 0x01, .inv = 0x01, .cmask = 0x01);
+
+		zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+			X86_CONFIG(.event = 0x0f, .umask = 0x04, .inv = 0, .cmask = 0);
+
+		switch (boot_cpu_data.x86_model) {
+		case 0x1b:
+			memcpy(hw_cache_event_ids, zxd_hw_cache_event_ids,
+			       sizeof(hw_cache_event_ids));
+
+			x86_pmu.event_constraints = zxd_event_constraints;
+
+			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0700;
+			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x0709;
+
+			pr_cont("ZXD events, ");
+			break;
+		case 0x3b:
+			memcpy(hw_cache_event_ids, zxe_hw_cache_event_ids,
+			       sizeof(hw_cache_event_ids));
+
+			x86_pmu.event_constraints = zxd_event_constraints;
+
+			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0028;
+			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x0029;
+
+			pr_cont("ZXE events, ");
+			break;
+		default:
+			return -ENODEV;
+		}
+		break;
+
+	default:
+		return -ENODEV;
+	}
+
+	x86_pmu.intel_ctrl = (1 << (x86_pmu.num_counters)) - 1;
+	x86_pmu.intel_ctrl |= ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
+
+	if (x86_pmu.event_constraints) {
+		for_each_event_constraint(c, x86_pmu.event_constraints) {
+			c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
+			c->weight += x86_pmu.num_counters;
+		}
+	}
+
+	return 0;
+}
+
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -63,6 +63,10 @@ static inline unsigned int nmi_perfctr_m
 		case 15:
 			return msr - MSR_P4_BPU_PERFCTR0;
 		}
+		fallthrough;
+	case X86_VENDOR_ZHAOXIN:
+	case X86_VENDOR_CENTAUR:
+		return msr - MSR_ARCH_PERFMON_PERFCTR0;
 	}
 	return 0;
 }
@@ -92,6 +96,10 @@ static inline unsigned int nmi_evntsel_m
 		case 15:
 			return msr - MSR_P4_BSU_ESCR0;
 		}
+		fallthrough;
+	case X86_VENDOR_ZHAOXIN:
+	case X86_VENDOR_CENTAUR:
+		return msr - MSR_ARCH_PERFMON_EVENTSEL0;
 	}
 	return 0;
 

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/perf: Add hardware performance events support for Zhaoxin CPU.
  2020-04-15  9:36 ` Borislav Petkov
@ 2020-04-16  6:16   ` CodyYao-oc
  0 siblings, 0 replies; 15+ messages in thread
From: CodyYao-oc @ 2020-04-16  6:16 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: peterz, mingo, acme, mark.rutland, alexander.shishkin, jolsa,
	namhyung, tglx, x86, hpa, linux-kernel, cooperyan, codyyao

On 2020/4/15 下午5:36, Borislav Petkov wrote:
> On Mon, Apr 13, 2020 at 11:14:29AM +0800, CodyYao-oc wrote:
>> Zhaoxin CPU has provided facilities for monitoring performance
>> via PMU(Performance Monitor Unit), but the functionality is unused so far.
>> Therefore, add support for zhaoxin pmu to make performance related
>> hardware events available.
>>
>> Signed-off-by: CodyYao-oc <CodyYao-oc@zhaoxin.com>
>> Reported-by: kbuild test robot <lkp@intel.com>
> 
> What exactly did the 0day bot report?
> 
> Put that in []
> 
> above the Reported-by line pls.
> 

Dear Boris,

It's a warning message about uninitialized variable, paste the log 
below, sorry for missing it.

Furthermore, it will disappear on next version patch because of the code 
changes, anyway I'll paste it when resend next version patch.

[All warnings (new ones prefixed by >>):

 >> arch/x86/events/zhaoxin/core.c:362:6: warning: variable 'is_zxc' is 
used uninitialized whenever 'if' condition is false 
[-Wsometimes-uninitialized]
            if (boot_cpu_data.x86 == 0x06 &&
                ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
    arch/x86/events/zhaoxin/core.c:369:6: note: uninitialized use occurs 
here
            if (is_zxc)
                ^~~~~~
    arch/x86/events/zhaoxin/core.c:362:2: note: remove the 'if' if its 
condition is always true
            if (boot_cpu_data.x86 == 0x06 &&
            ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 >> arch/x86/events/zhaoxin/core.c:362:6: warning: variable 'is_zxc' is 
used uninitialized whenever '&&' condition is false 
[-Wsometimes-uninitialized]
            if (boot_cpu_data.x86 == 0x06 &&
                ^~~~~~~~~~~~~~~~~~~~~~~~~
    arch/x86/events/zhaoxin/core.c:369:6: note: uninitialized use occurs 
here
            if (is_zxc)
                ^~~~~~
    arch/x86/events/zhaoxin/core.c:362:6: note: remove the '&&' if its 
condition is always true
            if (boot_cpu_data.x86 == 0x06 &&
                ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
    arch/x86/events/zhaoxin/core.c:352:13: note: initialize the variable 
'is_zxc' to silence this warning
            bool is_zxc;
                       ^
                        = 0
    2 warnings generated.]

Reported-by:kbuild test robot<lkp@intel.com>

Thanks
Cody

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/perf: Add hardware performance events support for Zhaoxin CPU.
  2020-04-15 10:23 ` Peter Zijlstra
  2020-04-15 10:31   ` Peter Zijlstra
@ 2020-04-16  6:26   ` CodyYao-oc
  1 sibling, 0 replies; 15+ messages in thread
From: CodyYao-oc @ 2020-04-16  6:26 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, acme, mark.rutland, alexander.shishkin, jolsa, namhyung,
	tglx, bp, x86, hpa, linux-kernel, cooperyan, codyyao

On 2020/4/15 下午6:23, Peter Zijlstra wrote:
> On Mon, Apr 13, 2020 at 11:14:29AM +0800, CodyYao-oc wrote:
>> Zhaoxin CPU has provided facilities for monitoring performance
>> via PMU(Performance Monitor Unit), but the functionality is unused so far.
>> Therefore, add support for zhaoxin pmu to make performance related
>> hardware events available.
>>
>> Signed-off-by: CodyYao-oc <CodyYao-oc@zhaoxin.com>
>> Reported-by: kbuild test robot <lkp@intel.com>
> 
> What's that reported-by thing? Did the robot complain you didn't have a
> PMU implementation?
>
Hi Peter, it's a warning message about uninitialized variable, paste the 
log below, sorry for miss it. Futhermore, it will disappear base on your 
newly modified patch. Thanks!

[All warnings (new ones prefixed by >>):

 >> arch/x86/events/zhaoxin/core.c:362:6: warning: variable 'is_zxc' is 
used uninitialized whenever 'if' condition is false 
[-Wsometimes-uninitialized]
            if (boot_cpu_data.x86 == 0x06 &&
                ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
    arch/x86/events/zhaoxin/core.c:369:6: note: uninitialized use occurs 
here
            if (is_zxc)
                ^~~~~~
    arch/x86/events/zhaoxin/core.c:362:2: note: remove the 'if' if its 
condition is always true
            if (boot_cpu_data.x86 == 0x06 &&
            ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 >> arch/x86/events/zhaoxin/core.c:362:6: warning: variable 'is_zxc' is 
used uninitialized whenever '&&' condition is false 
[-Wsometimes-uninitialized]
            if (boot_cpu_data.x86 == 0x06 &&
                ^~~~~~~~~~~~~~~~~~~~~~~~~
    arch/x86/events/zhaoxin/core.c:369:6: note: uninitialized use occurs 
here
            if (is_zxc)
                ^~~~~~
    arch/x86/events/zhaoxin/core.c:362:6: note: remove the '&&' if its 
condition is always true
            if (boot_cpu_data.x86 == 0x06 &&
                ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
    arch/x86/events/zhaoxin/core.c:352:13: note: initialize the variable 
'is_zxc' to silence this warning
            bool is_zxc;
                       ^
                        = 0
    2 warnings generated.]

Reported-by: kbuild test rebot<lkp@intel.com>

> Anyway, I've made the below changes to the patch.
> 
> ---
> --- a/arch/x86/events/perf_event.h
> +++ b/arch/x86/events/perf_event.h
> @@ -618,6 +618,7 @@ struct x86_pmu {
>   
>   	/* PMI handler bits */
>   	unsigned int	late_ack		:1,
> +			enabled_ack		:1,
>   			counter_freezing	:1;
>   	/*
>   	 * sysfs attrs
> --- a/arch/x86/events/zhaoxin/core.c
> +++ b/arch/x86/events/zhaoxin/core.c
> @@ -357,10 +357,9 @@ static int zhaoxin_pmu_handle_irq(struct
>   {
>   	struct perf_sample_data data;
>   	struct cpu_hw_events *cpuc;
> -	int bit;
> -	u64 status;
> -	bool is_zxc = false;
>   	int handled = 0;
> +	u64 status;
> +	int bit;
>   
>   	cpuc = this_cpu_ptr(&cpu_hw_events);
>   	apic_write(APIC_LVTPC, APIC_DM_NMI);
> @@ -369,14 +368,8 @@ static int zhaoxin_pmu_handle_irq(struct
>   	if (!status)
>   		goto done;
>   
> -	if (boot_cpu_data.x86 == 0x06 &&
> -		(boot_cpu_data.x86_model == 0x0f ||
> -			boot_cpu_data.x86_model == 0x19))
> -		is_zxc = true;
>   again:
> -
> -	/*Clearing status works only if the global control is enable on zxc.*/
> -	if (is_zxc)
> +	if (x86_pmu.enabled_ack)
>   		zxc_pmu_ack_status(status);
>   	else
>   		zhaoxin_pmu_ack_status(status);
> @@ -504,12 +497,10 @@ static __init void zhaoxin_arch_events_q
>   	int bit;
>   
>   	/* disable event that reported as not presend by cpuid */
> -	for_each_set_bit(bit, x86_pmu.events_mask,
> -			ARRAY_SIZE(zx_arch_events_map)) {
> -
> +	for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(zx_arch_events_map)) {
>   		zx_pmon_event_map[zx_arch_events_map[bit].id] = 0;
>   		pr_warn("CPUID marked event: \'%s\' unavailable\n",
> -				zx_arch_events_map[bit].name);
> +			zx_arch_events_map[bit].name);
>   	}
>   }
>   
> @@ -534,12 +525,12 @@ __init int zhaoxin_pmu_init(void)
>   		return -ENODEV;
>   
>   	version = eax.split.version_id;
> -	if (version == 2) {
> -		x86_pmu = zhaoxin_pmu;
> -		pr_info("Version check pass!\n");
> -	} else
> +	if (version != 2)
>   		return -ENODEV;
>   
> +	x86_pmu = zhaoxin_pmu;
> +	pr_info("Version check pass!\n");
> +
>   	x86_pmu.version			= version;
>   	x86_pmu.num_counters		= eax.split.num_counters;
>   	x86_pmu.cntval_bits		= eax.split.bit_width;
> @@ -552,11 +543,13 @@ __init int zhaoxin_pmu_init(void)
>   
>   	switch (boot_cpu_data.x86) {
>   	case 0x06:
> -		if (boot_cpu_data.x86_model == 0x0f ||
> -			boot_cpu_data.x86_model == 0x19) {
> +		if (boot_cpu_data.x86_model == 0x0f || boot_cpu_data.x86_model == 0x19) {
>   
>   			x86_pmu.max_period = x86_pmu.cntval_mask >> 1;
>   
> +			/* Clearing status works only if the global control is enable on zxc. */
> +			x86_pmu.enabled_ack = 1;
> +
>   			x86_pmu.event_constraints = zxc_event_constraints;
>   			zx_pmon_event_map[PERF_COUNT_HW_INSTRUCTIONS] = 0;
>   			zx_pmon_event_map[PERF_COUNT_HW_CACHE_REFERENCES] = 0;
> @@ -564,40 +557,37 @@ __init int zhaoxin_pmu_init(void)
>   			zx_pmon_event_map[PERF_COUNT_HW_BUS_CYCLES] = 0;
>   
>   			pr_cont("ZXC events, ");
> -		} else
> -			return -ENODEV;
> -		break;
> +			break;
> +		}
> +		return -ENODEV;
> +
>   	case 0x07:
>   		zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
> -		X86_CONFIG(.event = 0x01, .umask = 0x01, .inv = 0x01, .cmask = 0x01);
> +			X86_CONFIG(.event = 0x01, .umask = 0x01, .inv = 0x01, .cmask = 0x01);
>   
>   		zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
> -		X86_CONFIG(.event = 0x0f, .umask = 0x04, .inv = 0, .cmask = 0);
> +			X86_CONFIG(.event = 0x0f, .umask = 0x04, .inv = 0, .cmask = 0);
>   
>   		switch (boot_cpu_data.x86_model) {
>   		case 0x1b:
>   			memcpy(hw_cache_event_ids, zxd_hw_cache_event_ids,
> -				sizeof(hw_cache_event_ids));
> +			       sizeof(hw_cache_event_ids));
>   
>   			x86_pmu.event_constraints = zxd_event_constraints;
>   
> -			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]
> -				= 0x0700;
> -			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES]
> -				= 0x0709;
> +			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0700;
> +			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x0709;
>   
>   			pr_cont("ZXD events, ");
>   			break;
>   		case 0x3b:
>   			memcpy(hw_cache_event_ids, zxe_hw_cache_event_ids,
> -				sizeof(hw_cache_event_ids));
> +			       sizeof(hw_cache_event_ids));
>   
>   			x86_pmu.event_constraints = zxd_event_constraints;
>   
> -			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]
> -				= 0x0028;
> -			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES]
> -				= 0x0029;
> +			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0028;
> +			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x0029;
>   
>   			pr_cont("ZXE events, ");
>   			break;
> @@ -605,13 +595,13 @@ __init int zhaoxin_pmu_init(void)
>   			return -ENODEV;
>   		}
>   		break;
> +
>   	default:
>   		return -ENODEV;
>   	}
>   
>   	x86_pmu.intel_ctrl = (1 << (x86_pmu.num_counters)) - 1;
> -	x86_pmu.intel_ctrl |=
> -		((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
> +	x86_pmu.intel_ctrl |= ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
>   
>   	if (x86_pmu.event_constraints) {
>   		for_each_event_constraint(c, x86_pmu.event_constraints) {
> --- a/arch/x86/kernel/cpu/perfctr-watchdog.c
> +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
> @@ -63,6 +63,7 @@ static inline unsigned int nmi_perfctr_m
>   		case 15:
>   			return msr - MSR_P4_BPU_PERFCTR0;
>   		}
> +		fallthrough;
>   	case X86_VENDOR_ZHAOXIN:
>   	case X86_VENDOR_CENTAUR:
>   		return msr - MSR_ARCH_PERFMON_PERFCTR0;
> @@ -95,6 +96,7 @@ static inline unsigned int nmi_evntsel_m
>   		case 15:
>   			return msr - MSR_P4_BSU_ESCR0;
>   		}
> +		fallthrough;
>   	case X86_VENDOR_ZHAOXIN:
>   	case X86_VENDOR_CENTAUR:
>   		return msr - MSR_ARCH_PERFMON_EVENTSEL0;
> .
> 


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/perf: Add hardware performance events support for Zhaoxin CPU.
  2020-04-15 10:31   ` Peter Zijlstra
@ 2020-04-16  7:36     ` CodyYao-oc
  2020-05-06 12:55     ` CodyYao-oc
  1 sibling, 0 replies; 15+ messages in thread
From: CodyYao-oc @ 2020-04-16  7:36 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, acme, mark.rutland, alexander.shishkin, jolsa, namhyung,
	tglx, bp, x86, hpa, linux-kernel, cooperyan, codyyao

On 2020/4/15 下午6:31, Peter Zijlstra wrote:
> On Wed, Apr 15, 2020 at 12:23:40PM +0200, Peter Zijlstra wrote:
>> On Mon, Apr 13, 2020 at 11:14:29AM +0800, CodyYao-oc wrote:
>>> Zhaoxin CPU has provided facilities for monitoring performance
>>> via PMU(Performance Monitor Unit), but the functionality is unused so far.
>>> Therefore, add support for zhaoxin pmu to make performance related
>>> hardware events available.
>>>
>>> Signed-off-by: CodyYao-oc <CodyYao-oc@zhaoxin.com>
>>> Reported-by: kbuild test robot <lkp@intel.com>
>>
>> What's that reported-by thing? Did the robot complain you didn't have a
>> PMU implementation?
>>
>> Anyway, I've made the below changes to the patch.
> 
> In whole, the patch now looks like this. I've also added that event
> table you provided last time to the Changelog.
> 

The code is really cleaner and the warning for uninitalized variable 
also disappear, many thanks. I will resend it as updated patch version, 
and paste the warning log about the last patch from kbuild test robot.

Thanks
Cody

> ---
> Subject: x86/perf: Add hardware performance events support for Zhaoxin CPU.
> From: CodyYao-oc <CodyYao-oc@zhaoxin.com>
> Date: Mon, 13 Apr 2020 11:14:29 +0800
> 
> From: CodyYao-oc <CodyYao-oc@zhaoxin.com>
> 
> Zhaoxin CPU has provided facilities for monitoring performance
> via PMU (Performance Monitor Unit), but the functionality is unused so far.
> Therefore, add support for zhaoxin pmu to make performance related
> hardware events available.
> 
> The PMU is mostly an Intel Architectural PerfMon-v2 with a novel
> errata for the ZXC line. It supports the following events:
> 
>    -----------------------------------------------------------------------------------------------------------------------------------
>    Event                      | Event  | Umask |          Description
> 			     | Select |       |
>    -----------------------------------------------------------------------------------------------------------------------------------
>    cpu-cycles                 |  82h   |  00h  | unhalt core clock
>    instructions               |  00h   |  00h  | number of instructions at retirement.
>    cache-references           |  15h   |  05h  | number of fillq pushs at the current cycle.
>    cache-misses               |  1ah   |  05h  | number of l2 miss pushed by fillq.
>    branch-instructions        |  28h   |  00h  | counts the number of branch instructions retired.
>    branch-misses              |  29h   |  00h  | mispredicted branch instructions at retirement.
>    bus-cycles                 |  83h   |  00h  | unhalt bus clock
>    stalled-cycles-frontend    |  01h   |  01h  | Increments each cycle the # of Uops issued by the RAT to RS.
>    stalled-cycles-backend     |  0fh   |  04h  | RS0/1/2/3/45 empty
>    L1-dcache-loads            |  68h   |  05h  | number of retire/commit load.
>    L1-dcache-load-misses      |  4bh   |  05h  | retired load uops whose data source followed an L1 miss.
>    L1-dcache-stores           |  69h   |  06h  | number of retire/commit Store,no LEA
>    L1-dcache-store-misses     |  62h   |  05h  | cache lines in M state evicted out of L1D due to Snoop HitM or dirty line replacement.
>    L1-icache-loads            |  00h   |  03h  | number of l1i cache access for valid normal fetch,including un-cacheable access.
>    L1-icache-load-misses      |  01h   |  03h  | number of l1i cache miss for valid normal fetch,including un-cacheable miss.
>    L1-icache-prefetches       |  0ah   |  03h  | number of prefetch.
>    L1-icache-prefetch-misses  |  0bh   |  03h  | number of prefetch miss.
>    dTLB-loads                 |  68h   |  05h  | number of retire/commit load
>    dTLB-load-misses           |  2ch   |  05h  | number of load operations miss all level tlbs and cause a tablewalk.
>    dTLB-stores                |  69h   |  06h  | number of retire/commit Store,no LEA
>    dTLB-store-misses          |  30h   |  05h  | number of store operations miss all level tlbs and cause a tablewalk.
>    dTLB-prefetches            |  64h   |  05h  | number of hardware pte prefetch requests dispatched out of the prefetch FIFO.
>    dTLB-prefetch-misses       |  65h   |  05h  | number of hardware pte prefetch requests miss the l1d data cache.
>    iTLB-load                  |  00h   |  00h  | actually counter instructions.
>    iTLB-load-misses           |  34h   |  05h  | number of code operations miss all level tlbs and cause a tablewalk.
>    -----------------------------------------------------------------------------------------------------------------------------------
> 
> Reported-by: kbuild test robot <lkp@intel.com>
> Signed-off-by: CodyYao-oc <CodyYao-oc@zhaoxin.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Link: https://lkml.kernel.org/r/1586747669-4827-1-git-send-email-CodyYao-oc@zhaoxin.com
> ---
>   arch/x86/events/Makefile               |    2
>   arch/x86/events/core.c                 |    4
>   arch/x86/events/perf_event.h           |   10
>   arch/x86/events/zhaoxin/Makefile       |    2
>   arch/x86/events/zhaoxin/core.c         |  613 +++++++++++++++++++++++++++++++++
>   arch/x86/kernel/cpu/perfctr-watchdog.c |    8
>   6 files changed, 639 insertions(+)
>   create mode 100644 arch/x86/events/zhaoxin/Makefile
>   create mode 100644 arch/x86/events/zhaoxin/core.c
> 
> --- a/arch/x86/events/Makefile
> +++ b/arch/x86/events/Makefile
> @@ -3,3 +3,5 @@ obj-y					+= core.o probe.o
>   obj-y					+= amd/
>   obj-$(CONFIG_X86_LOCAL_APIC)            += msr.o
>   obj-$(CONFIG_CPU_SUP_INTEL)		+= intel/
> +obj-$(CONFIG_CPU_SUP_CENTAUR)		+= zhaoxin/
> +obj-$(CONFIG_CPU_SUP_ZHAOXIN)		+= zhaoxin/
> --- a/arch/x86/events/core.c
> +++ b/arch/x86/events/core.c
> @@ -1839,6 +1839,10 @@ static int __init init_hw_perf_events(vo
>   		err = amd_pmu_init();
>   		x86_pmu.name = "HYGON";
>   		break;
> +	case X86_VENDOR_ZHAOXIN:
> +	case X86_VENDOR_CENTAUR:
> +		err = zhaoxin_pmu_init();
> +		break;
>   	default:
>   		err = -ENOTSUPP;
>   	}
> --- a/arch/x86/events/perf_event.h
> +++ b/arch/x86/events/perf_event.h
> @@ -618,6 +618,7 @@ struct x86_pmu {
>   
>   	/* PMI handler bits */
>   	unsigned int	late_ack		:1,
> +			enabled_ack		:1,
>   			counter_freezing	:1;
>   	/*
>   	 * sysfs attrs
> @@ -1133,3 +1134,12 @@ static inline int is_ht_workaround_enabl
>   	return 0;
>   }
>   #endif /* CONFIG_CPU_SUP_INTEL */
> +
> +#if ((defined CONFIG_CPU_SUP_CENTAUR) || (defined CONFIG_CPU_SUP_ZHAOXIN))
> +int zhaoxin_pmu_init(void);
> +#else
> +static inline int zhaoxin_pmu_init(void)
> +{
> +	return 0;
> +}
> +#endif /*CONFIG_CPU_SUP_CENTAUR or CONFIG_CPU_SUP_ZHAOXIN*/
> --- /dev/null
> +++ b/arch/x86/events/zhaoxin/Makefile
> @@ -0,0 +1,2 @@
> +# SPDX-License-Identifier: GPL-2.0
> +obj-y	+= core.o
> --- /dev/null
> +++ b/arch/x86/events/zhaoxin/core.c
> @@ -0,0 +1,613 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Zhoaxin PMU; like Intel Architectural PerfMon-v2
> + */
> +
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +
> +#include <linux/stddef.h>
> +#include <linux/types.h>
> +#include <linux/init.h>
> +#include <linux/slab.h>
> +#include <linux/export.h>
> +#include <linux/nmi.h>
> +
> +#include <asm/cpufeature.h>
> +#include <asm/hardirq.h>
> +#include <asm/apic.h>
> +
> +#include "../perf_event.h"
> +
> +/*
> + * Zhaoxin PerfMon, used on zxc and later.
> + */
> +static u64 zx_pmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = {
> +
> +	[PERF_COUNT_HW_CPU_CYCLES]        = 0x0082,
> +	[PERF_COUNT_HW_INSTRUCTIONS]      = 0x00c0,
> +	[PERF_COUNT_HW_CACHE_REFERENCES]  = 0x0515,
> +	[PERF_COUNT_HW_CACHE_MISSES]      = 0x051a,
> +	[PERF_COUNT_HW_BUS_CYCLES]        = 0x0083,
> +};
> +
> +static struct event_constraint zxc_event_constraints[] __read_mostly = {
> +
> +	FIXED_EVENT_CONSTRAINT(0x0082, 1), /* unhalted core clock cycles */
> +	EVENT_CONSTRAINT_END
> +};
> +
> +static struct event_constraint zxd_event_constraints[] __read_mostly = {
> +
> +	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* retired instructions */
> +	FIXED_EVENT_CONSTRAINT(0x0082, 1), /* unhalted core clock cycles */
> +	FIXED_EVENT_CONSTRAINT(0x0083, 2), /* unhalted bus clock cycles */
> +	EVENT_CONSTRAINT_END
> +};
> +
> +static __initconst const u64 zxd_hw_cache_event_ids
> +				[PERF_COUNT_HW_CACHE_MAX]
> +				[PERF_COUNT_HW_CACHE_OP_MAX]
> +				[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
> +[C(L1D)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = 0x0042,
> +		[C(RESULT_MISS)] = 0x0538,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = 0x0043,
> +		[C(RESULT_MISS)] = 0x0562,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +},
> +[C(L1I)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = 0x0300,
> +		[C(RESULT_MISS)] = 0x0301,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = 0x030a,
> +		[C(RESULT_MISS)] = 0x030b,
> +	},
> +},
> +[C(LL)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +},
> +[C(DTLB)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = 0x0042,
> +		[C(RESULT_MISS)] = 0x052c,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = 0x0043,
> +		[C(RESULT_MISS)] = 0x0530,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = 0x0564,
> +		[C(RESULT_MISS)] = 0x0565,
> +	},
> +},
> +[C(ITLB)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = 0x00c0,
> +		[C(RESULT_MISS)] = 0x0534,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +},
> +[C(BPU)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = 0x0700,
> +		[C(RESULT_MISS)] = 0x0709,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +},
> +[C(NODE)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +},
> +};
> +
> +static __initconst const u64 zxe_hw_cache_event_ids
> +				[PERF_COUNT_HW_CACHE_MAX]
> +				[PERF_COUNT_HW_CACHE_OP_MAX]
> +				[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
> +[C(L1D)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = 0x0568,
> +		[C(RESULT_MISS)] = 0x054b,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = 0x0669,
> +		[C(RESULT_MISS)] = 0x0562,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +},
> +[C(L1I)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = 0x0300,
> +		[C(RESULT_MISS)] = 0x0301,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = 0x030a,
> +		[C(RESULT_MISS)] = 0x030b,
> +	},
> +},
> +[C(LL)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = 0x0,
> +		[C(RESULT_MISS)] = 0x0,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = 0x0,
> +		[C(RESULT_MISS)] = 0x0,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = 0x0,
> +		[C(RESULT_MISS)] = 0x0,
> +	},
> +},
> +[C(DTLB)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = 0x0568,
> +		[C(RESULT_MISS)] = 0x052c,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = 0x0669,
> +		[C(RESULT_MISS)] = 0x0530,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = 0x0564,
> +		[C(RESULT_MISS)] = 0x0565,
> +	},
> +},
> +[C(ITLB)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = 0x00c0,
> +		[C(RESULT_MISS)] = 0x0534,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +},
> +[C(BPU)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = 0x0028,
> +		[C(RESULT_MISS)] = 0x0029,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +},
> +[C(NODE)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +},
> +};
> +
> +static void zhaoxin_pmu_disable_all(void)
> +{
> +	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
> +}
> +
> +static void zhaoxin_pmu_enable_all(int added)
> +{
> +	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
> +}
> +
> +static inline u64 zhaoxin_pmu_get_status(void)
> +{
> +	u64 status;
> +
> +	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
> +
> +	return status;
> +}
> +
> +static inline void zhaoxin_pmu_ack_status(u64 ack)
> +{
> +	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
> +}
> +
> +static inline void zxc_pmu_ack_status(u64 ack)
> +{
> +	/*
> +	 * ZXC needs global control enabled in order to clear status bits.
> +	 */
> +	zhaoxin_pmu_enable_all(0);
> +	zhaoxin_pmu_ack_status(ack);
> +	zhaoxin_pmu_disable_all();
> +}
> +
> +static void zhaoxin_pmu_disable_fixed(struct hw_perf_event *hwc)
> +{
> +	int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
> +	u64 ctrl_val, mask;
> +
> +	mask = 0xfULL << (idx * 4);
> +
> +	rdmsrl(hwc->config_base, ctrl_val);
> +	ctrl_val &= ~mask;
> +	wrmsrl(hwc->config_base, ctrl_val);
> +}
> +
> +static void zhaoxin_pmu_disable_event(struct perf_event *event)
> +{
> +	struct hw_perf_event *hwc = &event->hw;
> +
> +	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
> +		zhaoxin_pmu_disable_fixed(hwc);
> +		return;
> +	}
> +
> +	x86_pmu_disable_event(event);
> +}
> +
> +static void zhaoxin_pmu_enable_fixed(struct hw_perf_event *hwc)
> +{
> +	int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
> +	u64 ctrl_val, bits, mask;
> +
> +	/*
> +	 * Enable IRQ generation (0x8),
> +	 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
> +	 * if requested:
> +	 */
> +	bits = 0x8ULL;
> +	if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
> +		bits |= 0x2;
> +	if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
> +		bits |= 0x1;
> +
> +	bits <<= (idx * 4);
> +	mask = 0xfULL << (idx * 4);
> +
> +	rdmsrl(hwc->config_base, ctrl_val);
> +	ctrl_val &= ~mask;
> +	ctrl_val |= bits;
> +	wrmsrl(hwc->config_base, ctrl_val);
> +}
> +
> +static void zhaoxin_pmu_enable_event(struct perf_event *event)
> +{
> +	struct hw_perf_event *hwc = &event->hw;
> +
> +	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
> +		zhaoxin_pmu_enable_fixed(hwc);
> +		return;
> +	}
> +
> +	__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
> +}
> +
> +/*
> + * This handler is triggered by the local APIC, so the APIC IRQ handling
> + * rules apply:
> + */
> +static int zhaoxin_pmu_handle_irq(struct pt_regs *regs)
> +{
> +	struct perf_sample_data data;
> +	struct cpu_hw_events *cpuc;
> +	int handled = 0;
> +	u64 status;
> +	int bit;
> +
> +	cpuc = this_cpu_ptr(&cpu_hw_events);
> +	apic_write(APIC_LVTPC, APIC_DM_NMI);
> +	zhaoxin_pmu_disable_all();
> +	status = zhaoxin_pmu_get_status();
> +	if (!status)
> +		goto done;
> +
> +again:
> +	if (x86_pmu.enabled_ack)
> +		zxc_pmu_ack_status(status);
> +	else
> +		zhaoxin_pmu_ack_status(status);
> +
> +	inc_irq_stat(apic_perf_irqs);
> +
> +	/*
> +	 * CondChgd bit 63 doesn't mean any overflow status. Ignore
> +	 * and clear the bit.
> +	 */
> +	if (__test_and_clear_bit(63, (unsigned long *)&status)) {
> +		if (!status)
> +			goto done;
> +	}
> +
> +	for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
> +		struct perf_event *event = cpuc->events[bit];
> +
> +		handled++;
> +
> +		if (!test_bit(bit, cpuc->active_mask))
> +			continue;
> +
> +		x86_perf_event_update(event);
> +		perf_sample_data_init(&data, 0, event->hw.last_period);
> +
> +		if (!x86_perf_event_set_period(event))
> +			continue;
> +
> +		if (perf_event_overflow(event, &data, regs))
> +			x86_pmu_stop(event, 0);
> +	}
> +
> +	/*
> +	 * Repeat if there is more work to be done:
> +	 */
> +	status = zhaoxin_pmu_get_status();
> +	if (status)
> +		goto again;
> +
> +done:
> +	zhaoxin_pmu_enable_all(0);
> +	return handled;
> +}
> +
> +static u64 zhaoxin_pmu_event_map(int hw_event)
> +{
> +	return zx_pmon_event_map[hw_event];
> +}
> +
> +static struct event_constraint *
> +zhaoxin_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
> +			struct perf_event *event)
> +{
> +	struct event_constraint *c;
> +
> +	if (x86_pmu.event_constraints) {
> +		for_each_event_constraint(c, x86_pmu.event_constraints) {
> +			if ((event->hw.config & c->cmask) == c->code)
> +				return c;
> +		}
> +	}
> +
> +	return &unconstrained;
> +}
> +
> +PMU_FORMAT_ATTR(event,	"config:0-7");
> +PMU_FORMAT_ATTR(umask,	"config:8-15");
> +PMU_FORMAT_ATTR(edge,	"config:18");
> +PMU_FORMAT_ATTR(inv,	"config:23");
> +PMU_FORMAT_ATTR(cmask,	"config:24-31");
> +
> +static struct attribute *zx_arch_formats_attr[] = {
> +	&format_attr_event.attr,
> +	&format_attr_umask.attr,
> +	&format_attr_edge.attr,
> +	&format_attr_inv.attr,
> +	&format_attr_cmask.attr,
> +	NULL,
> +};
> +
> +static ssize_t zhaoxin_event_sysfs_show(char *page, u64 config)
> +{
> +	u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT);
> +
> +	return x86_event_sysfs_show(page, config, event);
> +}
> +
> +static const struct x86_pmu zhaoxin_pmu __initconst = {
> +	.name			= "zhaoxin",
> +	.handle_irq		= zhaoxin_pmu_handle_irq,
> +	.disable_all		= zhaoxin_pmu_disable_all,
> +	.enable_all		= zhaoxin_pmu_enable_all,
> +	.enable			= zhaoxin_pmu_enable_event,
> +	.disable		= zhaoxin_pmu_disable_event,
> +	.hw_config		= x86_pmu_hw_config,
> +	.schedule_events	= x86_schedule_events,
> +	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
> +	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
> +	.event_map		= zhaoxin_pmu_event_map,
> +	.max_events		= ARRAY_SIZE(zx_pmon_event_map),
> +	.apic			= 1,
> +	/*
> +	 * For zxd/zxe, read/write operation for PMCx MSR is 48 bits.
> +	 */
> +	.max_period		= (1ULL << 47) - 1,
> +	.get_event_constraints	= zhaoxin_get_event_constraints,
> +
> +	.format_attrs		= zx_arch_formats_attr,
> +	.events_sysfs_show	= zhaoxin_event_sysfs_show,
> +};
> +
> +static const struct { int id; char *name; } zx_arch_events_map[] __initconst = {
> +	{ PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" },
> +	{ PERF_COUNT_HW_INSTRUCTIONS, "instructions" },
> +	{ PERF_COUNT_HW_BUS_CYCLES, "bus cycles" },
> +	{ PERF_COUNT_HW_CACHE_REFERENCES, "cache references" },
> +	{ PERF_COUNT_HW_CACHE_MISSES, "cache misses" },
> +	{ PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" },
> +	{ PERF_COUNT_HW_BRANCH_MISSES, "branch misses" },
> +};
> +
> +static __init void zhaoxin_arch_events_quirk(void)
> +{
> +	int bit;
> +
> +	/* disable event that reported as not presend by cpuid */
> +	for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(zx_arch_events_map)) {
> +		zx_pmon_event_map[zx_arch_events_map[bit].id] = 0;
> +		pr_warn("CPUID marked event: \'%s\' unavailable\n",
> +			zx_arch_events_map[bit].name);
> +	}
> +}
> +
> +__init int zhaoxin_pmu_init(void)
> +{
> +	union cpuid10_edx edx;
> +	union cpuid10_eax eax;
> +	union cpuid10_ebx ebx;
> +	struct event_constraint *c;
> +	unsigned int unused;
> +	int version;
> +
> +	pr_info("Welcome to zhaoxin pmu!\n");
> +
> +	/*
> +	 * Check whether the Architectural PerfMon supports
> +	 * hw_event or not.
> +	 */
> +	cpuid(10, &eax.full, &ebx.full, &unused, &edx.full);
> +
> +	if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT - 1)
> +		return -ENODEV;
> +
> +	version = eax.split.version_id;
> +	if (version != 2)
> +		return -ENODEV;
> +
> +	x86_pmu = zhaoxin_pmu;
> +	pr_info("Version check pass!\n");
> +
> +	x86_pmu.version			= version;
> +	x86_pmu.num_counters		= eax.split.num_counters;
> +	x86_pmu.cntval_bits		= eax.split.bit_width;
> +	x86_pmu.cntval_mask		= (1ULL << eax.split.bit_width) - 1;
> +	x86_pmu.events_maskl		= ebx.full;
> +	x86_pmu.events_mask_len		= eax.split.mask_length;
> +
> +	x86_pmu.num_counters_fixed = edx.split.num_counters_fixed;
> +	x86_add_quirk(zhaoxin_arch_events_quirk);
> +
> +	switch (boot_cpu_data.x86) {
> +	case 0x06:
> +		if (boot_cpu_data.x86_model == 0x0f || boot_cpu_data.x86_model == 0x19) {
> +
> +			x86_pmu.max_period = x86_pmu.cntval_mask >> 1;
> +
> +			/* Clearing status works only if the global control is enable on zxc. */
> +			x86_pmu.enabled_ack = 1;
> +
> +			x86_pmu.event_constraints = zxc_event_constraints;
> +			zx_pmon_event_map[PERF_COUNT_HW_INSTRUCTIONS] = 0;
> +			zx_pmon_event_map[PERF_COUNT_HW_CACHE_REFERENCES] = 0;
> +			zx_pmon_event_map[PERF_COUNT_HW_CACHE_MISSES] = 0;
> +			zx_pmon_event_map[PERF_COUNT_HW_BUS_CYCLES] = 0;
> +
> +			pr_cont("ZXC events, ");
> +			break;
> +		}
> +		return -ENODEV;
> +
> +	case 0x07:
> +		zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
> +			X86_CONFIG(.event = 0x01, .umask = 0x01, .inv = 0x01, .cmask = 0x01);
> +
> +		zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
> +			X86_CONFIG(.event = 0x0f, .umask = 0x04, .inv = 0, .cmask = 0);
> +
> +		switch (boot_cpu_data.x86_model) {
> +		case 0x1b:
> +			memcpy(hw_cache_event_ids, zxd_hw_cache_event_ids,
> +			       sizeof(hw_cache_event_ids));
> +
> +			x86_pmu.event_constraints = zxd_event_constraints;
> +
> +			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0700;
> +			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x0709;
> +
> +			pr_cont("ZXD events, ");
> +			break;
> +		case 0x3b:
> +			memcpy(hw_cache_event_ids, zxe_hw_cache_event_ids,
> +			       sizeof(hw_cache_event_ids));
> +
> +			x86_pmu.event_constraints = zxd_event_constraints;
> +
> +			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0028;
> +			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x0029;
> +
> +			pr_cont("ZXE events, ");
> +			break;
> +		default:
> +			return -ENODEV;
> +		}
> +		break;
> +
> +	default:
> +		return -ENODEV;
> +	}
> +
> +	x86_pmu.intel_ctrl = (1 << (x86_pmu.num_counters)) - 1;
> +	x86_pmu.intel_ctrl |= ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
> +
> +	if (x86_pmu.event_constraints) {
> +		for_each_event_constraint(c, x86_pmu.event_constraints) {
> +			c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
> +			c->weight += x86_pmu.num_counters;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> --- a/arch/x86/kernel/cpu/perfctr-watchdog.c
> +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
> @@ -63,6 +63,10 @@ static inline unsigned int nmi_perfctr_m
>   		case 15:
>   			return msr - MSR_P4_BPU_PERFCTR0;
>   		}
> +		fallthrough;
> +	case X86_VENDOR_ZHAOXIN:
> +	case X86_VENDOR_CENTAUR:
> +		return msr - MSR_ARCH_PERFMON_PERFCTR0;
>   	}
>   	return 0;
>   }
> @@ -92,6 +96,10 @@ static inline unsigned int nmi_evntsel_m
>   		case 15:
>   			return msr - MSR_P4_BSU_ESCR0;
>   		}
> +		fallthrough;
> +	case X86_VENDOR_ZHAOXIN:
> +	case X86_VENDOR_CENTAUR:
> +		return msr - MSR_ARCH_PERFMON_EVENTSEL0;
>   	}
>   	return 0;
>   
> .
> 


^ permalink raw reply	[flat|nested] 15+ messages in thread

* [tip: perf/core] x86/perf: Add hardware performance events support for Zhaoxin CPU.
  2020-04-13  3:14 [PATCH] x86/perf: Add hardware performance events support for Zhaoxin CPU CodyYao-oc
  2020-04-15  9:36 ` Borislav Petkov
  2020-04-15 10:23 ` Peter Zijlstra
@ 2020-05-01 18:22 ` tip-bot2 for CodyYao-oc
  2 siblings, 0 replies; 15+ messages in thread
From: tip-bot2 for CodyYao-oc @ 2020-05-01 18:22 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: kbuild test robot, CodyYao-oc, Peter Zijlstra (Intel), x86, LKML

The following commit has been merged into the perf/core branch of tip:

Commit-ID:     3a4ac121c2cacbf97d493fa3bc42ead88657abe4
Gitweb:        https://git.kernel.org/tip/3a4ac121c2cacbf97d493fa3bc42ead88657abe4
Author:        CodyYao-oc <CodyYao-oc@zhaoxin.com>
AuthorDate:    Mon, 13 Apr 2020 11:14:29 +08:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 30 Apr 2020 20:14:35 +02:00

x86/perf: Add hardware performance events support for Zhaoxin CPU.

Zhaoxin CPU has provided facilities for monitoring performance
via PMU (Performance Monitor Unit), but the functionality is unused so far.
Therefore, add support for zhaoxin pmu to make performance related
hardware events available.

The PMU is mostly an Intel Architectural PerfMon-v2 with a novel
errata for the ZXC line. It supports the following events:

  -----------------------------------------------------------------------------------------------------------------------------------
  Event                      | Event  | Umask |          Description
			     | Select |       |
  -----------------------------------------------------------------------------------------------------------------------------------
  cpu-cycles                 |  82h   |  00h  | unhalt core clock
  instructions               |  00h   |  00h  | number of instructions at retirement.
  cache-references           |  15h   |  05h  | number of fillq pushs at the current cycle.
  cache-misses               |  1ah   |  05h  | number of l2 miss pushed by fillq.
  branch-instructions        |  28h   |  00h  | counts the number of branch instructions retired.
  branch-misses              |  29h   |  00h  | mispredicted branch instructions at retirement.
  bus-cycles                 |  83h   |  00h  | unhalt bus clock
  stalled-cycles-frontend    |  01h   |  01h  | Increments each cycle the # of Uops issued by the RAT to RS.
  stalled-cycles-backend     |  0fh   |  04h  | RS0/1/2/3/45 empty
  L1-dcache-loads            |  68h   |  05h  | number of retire/commit load.
  L1-dcache-load-misses      |  4bh   |  05h  | retired load uops whose data source followed an L1 miss.
  L1-dcache-stores           |  69h   |  06h  | number of retire/commit Store,no LEA
  L1-dcache-store-misses     |  62h   |  05h  | cache lines in M state evicted out of L1D due to Snoop HitM or dirty line replacement.
  L1-icache-loads            |  00h   |  03h  | number of l1i cache access for valid normal fetch,including un-cacheable access.
  L1-icache-load-misses      |  01h   |  03h  | number of l1i cache miss for valid normal fetch,including un-cacheable miss.
  L1-icache-prefetches       |  0ah   |  03h  | number of prefetch.
  L1-icache-prefetch-misses  |  0bh   |  03h  | number of prefetch miss.
  dTLB-loads                 |  68h   |  05h  | number of retire/commit load
  dTLB-load-misses           |  2ch   |  05h  | number of load operations miss all level tlbs and cause a tablewalk.
  dTLB-stores                |  69h   |  06h  | number of retire/commit Store,no LEA
  dTLB-store-misses          |  30h   |  05h  | number of store operations miss all level tlbs and cause a tablewalk.
  dTLB-prefetches            |  64h   |  05h  | number of hardware pte prefetch requests dispatched out of the prefetch FIFO.
  dTLB-prefetch-misses       |  65h   |  05h  | number of hardware pte prefetch requests miss the l1d data cache.
  iTLB-load                  |  00h   |  00h  | actually counter instructions.
  iTLB-load-misses           |  34h   |  05h  | number of code operations miss all level tlbs and cause a tablewalk.
  -----------------------------------------------------------------------------------------------------------------------------------

Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: CodyYao-oc <CodyYao-oc@zhaoxin.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1586747669-4827-1-git-send-email-CodyYao-oc@zhaoxin.com
---
 arch/x86/events/Makefile               |   2 +-
 arch/x86/events/core.c                 |   4 +-
 arch/x86/events/perf_event.h           |  10 +-
 arch/x86/events/zhaoxin/Makefile       |   2 +-
 arch/x86/events/zhaoxin/core.c         | 613 ++++++++++++++++++++++++-
 arch/x86/kernel/cpu/perfctr-watchdog.c |   8 +-
 6 files changed, 639 insertions(+)
 create mode 100644 arch/x86/events/zhaoxin/Makefile
 create mode 100644 arch/x86/events/zhaoxin/core.c

diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile
index 9e07f55..6f1d1fd 100644
--- a/arch/x86/events/Makefile
+++ b/arch/x86/events/Makefile
@@ -3,3 +3,5 @@ obj-y					+= core.o probe.o
 obj-y					+= amd/
 obj-$(CONFIG_X86_LOCAL_APIC)            += msr.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= intel/
+obj-$(CONFIG_CPU_SUP_CENTAUR)		+= zhaoxin/
+obj-$(CONFIG_CPU_SUP_ZHAOXIN)		+= zhaoxin/
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index a619763..9e63ee5 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1839,6 +1839,10 @@ static int __init init_hw_perf_events(void)
 		err = amd_pmu_init();
 		x86_pmu.name = "HYGON";
 		break;
+	case X86_VENDOR_ZHAOXIN:
+	case X86_VENDOR_CENTAUR:
+		err = zhaoxin_pmu_init();
+		break;
 	default:
 		err = -ENOTSUPP;
 	}
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index f1cd1ca..e17a3d8 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -618,6 +618,7 @@ struct x86_pmu {
 
 	/* PMI handler bits */
 	unsigned int	late_ack		:1,
+			enabled_ack		:1,
 			counter_freezing	:1;
 	/*
 	 * sysfs attrs
@@ -1133,3 +1134,12 @@ static inline int is_ht_workaround_enabled(void)
 	return 0;
 }
 #endif /* CONFIG_CPU_SUP_INTEL */
+
+#if ((defined CONFIG_CPU_SUP_CENTAUR) || (defined CONFIG_CPU_SUP_ZHAOXIN))
+int zhaoxin_pmu_init(void);
+#else
+static inline int zhaoxin_pmu_init(void)
+{
+	return 0;
+}
+#endif /*CONFIG_CPU_SUP_CENTAUR or CONFIG_CPU_SUP_ZHAOXIN*/
diff --git a/arch/x86/events/zhaoxin/Makefile b/arch/x86/events/zhaoxin/Makefile
new file mode 100644
index 0000000..642c117
--- /dev/null
+++ b/arch/x86/events/zhaoxin/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-y	+= core.o
diff --git a/arch/x86/events/zhaoxin/core.c b/arch/x86/events/zhaoxin/core.c
new file mode 100644
index 0000000..898fa1a
--- /dev/null
+++ b/arch/x86/events/zhaoxin/core.c
@@ -0,0 +1,613 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Zhoaxin PMU; like Intel Architectural PerfMon-v2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/stddef.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/nmi.h>
+
+#include <asm/cpufeature.h>
+#include <asm/hardirq.h>
+#include <asm/apic.h>
+
+#include "../perf_event.h"
+
+/*
+ * Zhaoxin PerfMon, used on zxc and later.
+ */
+static u64 zx_pmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = {
+
+	[PERF_COUNT_HW_CPU_CYCLES]        = 0x0082,
+	[PERF_COUNT_HW_INSTRUCTIONS]      = 0x00c0,
+	[PERF_COUNT_HW_CACHE_REFERENCES]  = 0x0515,
+	[PERF_COUNT_HW_CACHE_MISSES]      = 0x051a,
+	[PERF_COUNT_HW_BUS_CYCLES]        = 0x0083,
+};
+
+static struct event_constraint zxc_event_constraints[] __read_mostly = {
+
+	FIXED_EVENT_CONSTRAINT(0x0082, 1), /* unhalted core clock cycles */
+	EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint zxd_event_constraints[] __read_mostly = {
+
+	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* retired instructions */
+	FIXED_EVENT_CONSTRAINT(0x0082, 1), /* unhalted core clock cycles */
+	FIXED_EVENT_CONSTRAINT(0x0083, 2), /* unhalted bus clock cycles */
+	EVENT_CONSTRAINT_END
+};
+
+static __initconst const u64 zxd_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+[C(L1D)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0042,
+		[C(RESULT_MISS)] = 0x0538,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = 0x0043,
+		[C(RESULT_MISS)] = 0x0562,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(L1I)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0300,
+		[C(RESULT_MISS)] = 0x0301,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = 0x030a,
+		[C(RESULT_MISS)] = 0x030b,
+	},
+},
+[C(LL)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(DTLB)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0042,
+		[C(RESULT_MISS)] = 0x052c,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = 0x0043,
+		[C(RESULT_MISS)] = 0x0530,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = 0x0564,
+		[C(RESULT_MISS)] = 0x0565,
+	},
+},
+[C(ITLB)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x00c0,
+		[C(RESULT_MISS)] = 0x0534,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(BPU)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0700,
+		[C(RESULT_MISS)] = 0x0709,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(NODE)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+};
+
+static __initconst const u64 zxe_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+[C(L1D)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0568,
+		[C(RESULT_MISS)] = 0x054b,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = 0x0669,
+		[C(RESULT_MISS)] = 0x0562,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(L1I)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0300,
+		[C(RESULT_MISS)] = 0x0301,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = 0x030a,
+		[C(RESULT_MISS)] = 0x030b,
+	},
+},
+[C(LL)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0,
+		[C(RESULT_MISS)] = 0x0,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = 0x0,
+		[C(RESULT_MISS)] = 0x0,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = 0x0,
+		[C(RESULT_MISS)] = 0x0,
+	},
+},
+[C(DTLB)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0568,
+		[C(RESULT_MISS)] = 0x052c,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = 0x0669,
+		[C(RESULT_MISS)] = 0x0530,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = 0x0564,
+		[C(RESULT_MISS)] = 0x0565,
+	},
+},
+[C(ITLB)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x00c0,
+		[C(RESULT_MISS)] = 0x0534,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(BPU)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0028,
+		[C(RESULT_MISS)] = 0x0029,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(NODE)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+};
+
+static void zhaoxin_pmu_disable_all(void)
+{
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+}
+
+static void zhaoxin_pmu_enable_all(int added)
+{
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+}
+
+static inline u64 zhaoxin_pmu_get_status(void)
+{
+	u64 status;
+
+	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+
+	return status;
+}
+
+static inline void zhaoxin_pmu_ack_status(u64 ack)
+{
+	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
+}
+
+static inline void zxc_pmu_ack_status(u64 ack)
+{
+	/*
+	 * ZXC needs global control enabled in order to clear status bits.
+	 */
+	zhaoxin_pmu_enable_all(0);
+	zhaoxin_pmu_ack_status(ack);
+	zhaoxin_pmu_disable_all();
+}
+
+static void zhaoxin_pmu_disable_fixed(struct hw_perf_event *hwc)
+{
+	int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
+	u64 ctrl_val, mask;
+
+	mask = 0xfULL << (idx * 4);
+
+	rdmsrl(hwc->config_base, ctrl_val);
+	ctrl_val &= ~mask;
+	wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static void zhaoxin_pmu_disable_event(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+		zhaoxin_pmu_disable_fixed(hwc);
+		return;
+	}
+
+	x86_pmu_disable_event(event);
+}
+
+static void zhaoxin_pmu_enable_fixed(struct hw_perf_event *hwc)
+{
+	int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
+	u64 ctrl_val, bits, mask;
+
+	/*
+	 * Enable IRQ generation (0x8),
+	 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
+	 * if requested:
+	 */
+	bits = 0x8ULL;
+	if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
+		bits |= 0x2;
+	if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+		bits |= 0x1;
+
+	bits <<= (idx * 4);
+	mask = 0xfULL << (idx * 4);
+
+	rdmsrl(hwc->config_base, ctrl_val);
+	ctrl_val &= ~mask;
+	ctrl_val |= bits;
+	wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static void zhaoxin_pmu_enable_event(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+		zhaoxin_pmu_enable_fixed(hwc);
+		return;
+	}
+
+	__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
+}
+
+/*
+ * This handler is triggered by the local APIC, so the APIC IRQ handling
+ * rules apply:
+ */
+static int zhaoxin_pmu_handle_irq(struct pt_regs *regs)
+{
+	struct perf_sample_data data;
+	struct cpu_hw_events *cpuc;
+	int handled = 0;
+	u64 status;
+	int bit;
+
+	cpuc = this_cpu_ptr(&cpu_hw_events);
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+	zhaoxin_pmu_disable_all();
+	status = zhaoxin_pmu_get_status();
+	if (!status)
+		goto done;
+
+again:
+	if (x86_pmu.enabled_ack)
+		zxc_pmu_ack_status(status);
+	else
+		zhaoxin_pmu_ack_status(status);
+
+	inc_irq_stat(apic_perf_irqs);
+
+	/*
+	 * CondChgd bit 63 doesn't mean any overflow status. Ignore
+	 * and clear the bit.
+	 */
+	if (__test_and_clear_bit(63, (unsigned long *)&status)) {
+		if (!status)
+			goto done;
+	}
+
+	for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
+		struct perf_event *event = cpuc->events[bit];
+
+		handled++;
+
+		if (!test_bit(bit, cpuc->active_mask))
+			continue;
+
+		x86_perf_event_update(event);
+		perf_sample_data_init(&data, 0, event->hw.last_period);
+
+		if (!x86_perf_event_set_period(event))
+			continue;
+
+		if (perf_event_overflow(event, &data, regs))
+			x86_pmu_stop(event, 0);
+	}
+
+	/*
+	 * Repeat if there is more work to be done:
+	 */
+	status = zhaoxin_pmu_get_status();
+	if (status)
+		goto again;
+
+done:
+	zhaoxin_pmu_enable_all(0);
+	return handled;
+}
+
+static u64 zhaoxin_pmu_event_map(int hw_event)
+{
+	return zx_pmon_event_map[hw_event];
+}
+
+static struct event_constraint *
+zhaoxin_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+			struct perf_event *event)
+{
+	struct event_constraint *c;
+
+	if (x86_pmu.event_constraints) {
+		for_each_event_constraint(c, x86_pmu.event_constraints) {
+			if ((event->hw.config & c->cmask) == c->code)
+				return c;
+		}
+	}
+
+	return &unconstrained;
+}
+
+PMU_FORMAT_ATTR(event,	"config:0-7");
+PMU_FORMAT_ATTR(umask,	"config:8-15");
+PMU_FORMAT_ATTR(edge,	"config:18");
+PMU_FORMAT_ATTR(inv,	"config:23");
+PMU_FORMAT_ATTR(cmask,	"config:24-31");
+
+static struct attribute *zx_arch_formats_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_umask.attr,
+	&format_attr_edge.attr,
+	&format_attr_inv.attr,
+	&format_attr_cmask.attr,
+	NULL,
+};
+
+static ssize_t zhaoxin_event_sysfs_show(char *page, u64 config)
+{
+	u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT);
+
+	return x86_event_sysfs_show(page, config, event);
+}
+
+static const struct x86_pmu zhaoxin_pmu __initconst = {
+	.name			= "zhaoxin",
+	.handle_irq		= zhaoxin_pmu_handle_irq,
+	.disable_all		= zhaoxin_pmu_disable_all,
+	.enable_all		= zhaoxin_pmu_enable_all,
+	.enable			= zhaoxin_pmu_enable_event,
+	.disable		= zhaoxin_pmu_disable_event,
+	.hw_config		= x86_pmu_hw_config,
+	.schedule_events	= x86_schedule_events,
+	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
+	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
+	.event_map		= zhaoxin_pmu_event_map,
+	.max_events		= ARRAY_SIZE(zx_pmon_event_map),
+	.apic			= 1,
+	/*
+	 * For zxd/zxe, read/write operation for PMCx MSR is 48 bits.
+	 */
+	.max_period		= (1ULL << 47) - 1,
+	.get_event_constraints	= zhaoxin_get_event_constraints,
+
+	.format_attrs		= zx_arch_formats_attr,
+	.events_sysfs_show	= zhaoxin_event_sysfs_show,
+};
+
+static const struct { int id; char *name; } zx_arch_events_map[] __initconst = {
+	{ PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" },
+	{ PERF_COUNT_HW_INSTRUCTIONS, "instructions" },
+	{ PERF_COUNT_HW_BUS_CYCLES, "bus cycles" },
+	{ PERF_COUNT_HW_CACHE_REFERENCES, "cache references" },
+	{ PERF_COUNT_HW_CACHE_MISSES, "cache misses" },
+	{ PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" },
+	{ PERF_COUNT_HW_BRANCH_MISSES, "branch misses" },
+};
+
+static __init void zhaoxin_arch_events_quirk(void)
+{
+	int bit;
+
+	/* disable event that reported as not presend by cpuid */
+	for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(zx_arch_events_map)) {
+		zx_pmon_event_map[zx_arch_events_map[bit].id] = 0;
+		pr_warn("CPUID marked event: \'%s\' unavailable\n",
+			zx_arch_events_map[bit].name);
+	}
+}
+
+__init int zhaoxin_pmu_init(void)
+{
+	union cpuid10_edx edx;
+	union cpuid10_eax eax;
+	union cpuid10_ebx ebx;
+	struct event_constraint *c;
+	unsigned int unused;
+	int version;
+
+	pr_info("Welcome to zhaoxin pmu!\n");
+
+	/*
+	 * Check whether the Architectural PerfMon supports
+	 * hw_event or not.
+	 */
+	cpuid(10, &eax.full, &ebx.full, &unused, &edx.full);
+
+	if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT - 1)
+		return -ENODEV;
+
+	version = eax.split.version_id;
+	if (version != 2)
+		return -ENODEV;
+
+	x86_pmu = zhaoxin_pmu;
+	pr_info("Version check pass!\n");
+
+	x86_pmu.version			= version;
+	x86_pmu.num_counters		= eax.split.num_counters;
+	x86_pmu.cntval_bits		= eax.split.bit_width;
+	x86_pmu.cntval_mask		= (1ULL << eax.split.bit_width) - 1;
+	x86_pmu.events_maskl		= ebx.full;
+	x86_pmu.events_mask_len		= eax.split.mask_length;
+
+	x86_pmu.num_counters_fixed = edx.split.num_counters_fixed;
+	x86_add_quirk(zhaoxin_arch_events_quirk);
+
+	switch (boot_cpu_data.x86) {
+	case 0x06:
+		if (boot_cpu_data.x86_model == 0x0f || boot_cpu_data.x86_model == 0x19) {
+
+			x86_pmu.max_period = x86_pmu.cntval_mask >> 1;
+
+			/* Clearing status works only if the global control is enable on zxc. */
+			x86_pmu.enabled_ack = 1;
+
+			x86_pmu.event_constraints = zxc_event_constraints;
+			zx_pmon_event_map[PERF_COUNT_HW_INSTRUCTIONS] = 0;
+			zx_pmon_event_map[PERF_COUNT_HW_CACHE_REFERENCES] = 0;
+			zx_pmon_event_map[PERF_COUNT_HW_CACHE_MISSES] = 0;
+			zx_pmon_event_map[PERF_COUNT_HW_BUS_CYCLES] = 0;
+
+			pr_cont("ZXC events, ");
+			break;
+		}
+		return -ENODEV;
+
+	case 0x07:
+		zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+			X86_CONFIG(.event = 0x01, .umask = 0x01, .inv = 0x01, .cmask = 0x01);
+
+		zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+			X86_CONFIG(.event = 0x0f, .umask = 0x04, .inv = 0, .cmask = 0);
+
+		switch (boot_cpu_data.x86_model) {
+		case 0x1b:
+			memcpy(hw_cache_event_ids, zxd_hw_cache_event_ids,
+			       sizeof(hw_cache_event_ids));
+
+			x86_pmu.event_constraints = zxd_event_constraints;
+
+			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0700;
+			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x0709;
+
+			pr_cont("ZXD events, ");
+			break;
+		case 0x3b:
+			memcpy(hw_cache_event_ids, zxe_hw_cache_event_ids,
+			       sizeof(hw_cache_event_ids));
+
+			x86_pmu.event_constraints = zxd_event_constraints;
+
+			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0028;
+			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x0029;
+
+			pr_cont("ZXE events, ");
+			break;
+		default:
+			return -ENODEV;
+		}
+		break;
+
+	default:
+		return -ENODEV;
+	}
+
+	x86_pmu.intel_ctrl = (1 << (x86_pmu.num_counters)) - 1;
+	x86_pmu.intel_ctrl |= ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
+
+	if (x86_pmu.event_constraints) {
+		for_each_event_constraint(c, x86_pmu.event_constraints) {
+			c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
+			c->weight += x86_pmu.num_counters;
+		}
+	}
+
+	return 0;
+}
+
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 9556930..a5ee607 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -63,6 +63,10 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
 		case 15:
 			return msr - MSR_P4_BPU_PERFCTR0;
 		}
+		fallthrough;
+	case X86_VENDOR_ZHAOXIN:
+	case X86_VENDOR_CENTAUR:
+		return msr - MSR_ARCH_PERFMON_PERFCTR0;
 	}
 	return 0;
 }
@@ -92,6 +96,10 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
 		case 15:
 			return msr - MSR_P4_BSU_ESCR0;
 		}
+		fallthrough;
+	case X86_VENDOR_ZHAOXIN:
+	case X86_VENDOR_CENTAUR:
+		return msr - MSR_ARCH_PERFMON_EVENTSEL0;
 	}
 	return 0;
 

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/perf: Add hardware performance events support for Zhaoxin CPU.
  2020-04-15 10:31   ` Peter Zijlstra
  2020-04-16  7:36     ` CodyYao-oc
@ 2020-05-06 12:55     ` CodyYao-oc
  1 sibling, 0 replies; 15+ messages in thread
From: CodyYao-oc @ 2020-05-06 12:55 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, acme, mark.rutland, alexander.shishkin, jolsa, namhyung,
	tglx, bp, x86, hpa, linux-kernel, cooperyan, codyyao

On 2020/4/15 下午6:31, Peter Zijlstra wrote:
> On Wed, Apr 15, 2020 at 12:23:40PM +0200, Peter Zijlstra wrote:
>> On Mon, Apr 13, 2020 at 11:14:29AM +0800, CodyYao-oc wrote:
>>> Zhaoxin CPU has provided facilities for monitoring performance
>>> via PMU(Performance Monitor Unit), but the functionality is unused so far.
>>> Therefore, add support for zhaoxin pmu to make performance related
>>> hardware events available.
>>>
>>> Signed-off-by: CodyYao-oc <CodyYao-oc@zhaoxin.com>
>>> Reported-by: kbuild test robot <lkp@intel.com>
>>
>> What's that reported-by thing? Did the robot complain you didn't have a
>> PMU implementation?
>>
>> Anyway, I've made the below changes to the patch.
> 
> In whole, the patch now looks like this. I've also added that event
> table you provided last time to the Changelog.
> 
> ---
> Subject: x86/perf: Add hardware performance events support for Zhaoxin CPU.
> From: CodyYao-oc <CodyYao-oc@zhaoxin.com>
> Date: Mon, 13 Apr 2020 11:14:29 +0800
> 
> From: CodyYao-oc <CodyYao-oc@zhaoxin.com>
> 
> Zhaoxin CPU has provided facilities for monitoring performance
> via PMU (Performance Monitor Unit), but the functionality is unused so far.
> Therefore, add support for zhaoxin pmu to make performance related
> hardware events available.
> 
> The PMU is mostly an Intel Architectural PerfMon-v2 with a novel
> errata for the ZXC line. It supports the following events:
> 
>    -----------------------------------------------------------------------------------------------------------------------------------
>    Event                      | Event  | Umask |          Description
> 			     | Select |       |
>    -----------------------------------------------------------------------------------------------------------------------------------
>    cpu-cycles                 |  82h   |  00h  | unhalt core clock
>    instructions               |  00h   |  00h  | number of instructions at retirement.
>    cache-references           |  15h   |  05h  | number of fillq pushs at the current cycle.
>    cache-misses               |  1ah   |  05h  | number of l2 miss pushed by fillq.
>    branch-instructions        |  28h   |  00h  | counts the number of branch instructions retired.
>    branch-misses              |  29h   |  00h  | mispredicted branch instructions at retirement.
>    bus-cycles                 |  83h   |  00h  | unhalt bus clock
>    stalled-cycles-frontend    |  01h   |  01h  | Increments each cycle the # of Uops issued by the RAT to RS.
>    stalled-cycles-backend     |  0fh   |  04h  | RS0/1/2/3/45 empty
>    L1-dcache-loads            |  68h   |  05h  | number of retire/commit load.
>    L1-dcache-load-misses      |  4bh   |  05h  | retired load uops whose data source followed an L1 miss.
>    L1-dcache-stores           |  69h   |  06h  | number of retire/commit Store,no LEA
>    L1-dcache-store-misses     |  62h   |  05h  | cache lines in M state evicted out of L1D due to Snoop HitM or dirty line replacement.
>    L1-icache-loads            |  00h   |  03h  | number of l1i cache access for valid normal fetch,including un-cacheable access.
>    L1-icache-load-misses      |  01h   |  03h  | number of l1i cache miss for valid normal fetch,including un-cacheable miss.
>    L1-icache-prefetches       |  0ah   |  03h  | number of prefetch.
>    L1-icache-prefetch-misses  |  0bh   |  03h  | number of prefetch miss.
>    dTLB-loads                 |  68h   |  05h  | number of retire/commit load
>    dTLB-load-misses           |  2ch   |  05h  | number of load operations miss all level tlbs and cause a tablewalk.
>    dTLB-stores                |  69h   |  06h  | number of retire/commit Store,no LEA
>    dTLB-store-misses          |  30h   |  05h  | number of store operations miss all level tlbs and cause a tablewalk.
>    dTLB-prefetches            |  64h   |  05h  | number of hardware pte prefetch requests dispatched out of the prefetch FIFO.
>    dTLB-prefetch-misses       |  65h   |  05h  | number of hardware pte prefetch requests miss the l1d data cache.
>    iTLB-load                  |  00h   |  00h  | actually counter instructions.
>    iTLB-load-misses           |  34h   |  05h  | number of code operations miss all level tlbs and cause a tablewalk.
>    -----------------------------------------------------------------------------------------------------------------------------------
> 
> Reported-by: kbuild test robot <lkp@intel.com>
> Signed-off-by: CodyYao-oc <CodyYao-oc@zhaoxin.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Link: https://lkml.kernel.org/r/1586747669-4827-1-git-send-email-CodyYao-oc@zhaoxin.com
> ---
>   arch/x86/events/Makefile               |    2
>   arch/x86/events/core.c                 |    4
>   arch/x86/events/perf_event.h           |   10
>   arch/x86/events/zhaoxin/Makefile       |    2
>   arch/x86/events/zhaoxin/core.c         |  613 +++++++++++++++++++++++++++++++++
>   arch/x86/kernel/cpu/perfctr-watchdog.c |    8
>   6 files changed, 639 insertions(+)
>   create mode 100644 arch/x86/events/zhaoxin/Makefile
>   create mode 100644 arch/x86/events/zhaoxin/core.c
> 
> --- a/arch/x86/events/Makefile
> +++ b/arch/x86/events/Makefile
> @@ -3,3 +3,5 @@ obj-y					+= core.o probe.o
>   obj-y					+= amd/
>   obj-$(CONFIG_X86_LOCAL_APIC)            += msr.o
>   obj-$(CONFIG_CPU_SUP_INTEL)		+= intel/
> +obj-$(CONFIG_CPU_SUP_CENTAUR)		+= zhaoxin/
> +obj-$(CONFIG_CPU_SUP_ZHAOXIN)		+= zhaoxin/
> --- a/arch/x86/events/core.c
> +++ b/arch/x86/events/core.c
> @@ -1839,6 +1839,10 @@ static int __init init_hw_perf_events(vo
>   		err = amd_pmu_init();
>   		x86_pmu.name = "HYGON";
>   		break;
> +	case X86_VENDOR_ZHAOXIN:
> +	case X86_VENDOR_CENTAUR:
> +		err = zhaoxin_pmu_init();
> +		break;
>   	default:
>   		err = -ENOTSUPP;
>   	}
> --- a/arch/x86/events/perf_event.h
> +++ b/arch/x86/events/perf_event.h
> @@ -618,6 +618,7 @@ struct x86_pmu {
>   
>   	/* PMI handler bits */
>   	unsigned int	late_ack		:1,
> +			enabled_ack		:1,
>   			counter_freezing	:1;
>   	/*
>   	 * sysfs attrs
> @@ -1133,3 +1134,12 @@ static inline int is_ht_workaround_enabl
>   	return 0;
>   }
>   #endif /* CONFIG_CPU_SUP_INTEL */
> +
> +#if ((defined CONFIG_CPU_SUP_CENTAUR) || (defined CONFIG_CPU_SUP_ZHAOXIN))
> +int zhaoxin_pmu_init(void);
> +#else
> +static inline int zhaoxin_pmu_init(void)
> +{
> +	return 0;
> +}
> +#endif /*CONFIG_CPU_SUP_CENTAUR or CONFIG_CPU_SUP_ZHAOXIN*/
> --- /dev/null
> +++ b/arch/x86/events/zhaoxin/Makefile
> @@ -0,0 +1,2 @@
> +# SPDX-License-Identifier: GPL-2.0
> +obj-y	+= core.o
> --- /dev/null
> +++ b/arch/x86/events/zhaoxin/core.c
> @@ -0,0 +1,613 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Zhoaxin PMU; like Intel Architectural PerfMon-v2
> + */
> +
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +
> +#include <linux/stddef.h>
> +#include <linux/types.h>
> +#include <linux/init.h>
> +#include <linux/slab.h>
> +#include <linux/export.h>
> +#include <linux/nmi.h>
> +
> +#include <asm/cpufeature.h>
> +#include <asm/hardirq.h>
> +#include <asm/apic.h>
> +
> +#include "../perf_event.h"
> +
> +/*
> + * Zhaoxin PerfMon, used on zxc and later.
> + */
> +static u64 zx_pmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = {
> +
> +	[PERF_COUNT_HW_CPU_CYCLES]        = 0x0082,
> +	[PERF_COUNT_HW_INSTRUCTIONS]      = 0x00c0,
> +	[PERF_COUNT_HW_CACHE_REFERENCES]  = 0x0515,
> +	[PERF_COUNT_HW_CACHE_MISSES]      = 0x051a,
> +	[PERF_COUNT_HW_BUS_CYCLES]        = 0x0083,
> +};
> +
> +static struct event_constraint zxc_event_constraints[] __read_mostly = {
> +
> +	FIXED_EVENT_CONSTRAINT(0x0082, 1), /* unhalted core clock cycles */
> +	EVENT_CONSTRAINT_END
> +};
> +
> +static struct event_constraint zxd_event_constraints[] __read_mostly = {
> +
> +	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* retired instructions */
> +	FIXED_EVENT_CONSTRAINT(0x0082, 1), /* unhalted core clock cycles */
> +	FIXED_EVENT_CONSTRAINT(0x0083, 2), /* unhalted bus clock cycles */
> +	EVENT_CONSTRAINT_END
> +};
> +
> +static __initconst const u64 zxd_hw_cache_event_ids
> +				[PERF_COUNT_HW_CACHE_MAX]
> +				[PERF_COUNT_HW_CACHE_OP_MAX]
> +				[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
> +[C(L1D)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = 0x0042,
> +		[C(RESULT_MISS)] = 0x0538,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = 0x0043,
> +		[C(RESULT_MISS)] = 0x0562,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +},
> +[C(L1I)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = 0x0300,
> +		[C(RESULT_MISS)] = 0x0301,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = 0x030a,
> +		[C(RESULT_MISS)] = 0x030b,
> +	},
> +},
> +[C(LL)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +},
> +[C(DTLB)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = 0x0042,
> +		[C(RESULT_MISS)] = 0x052c,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = 0x0043,
> +		[C(RESULT_MISS)] = 0x0530,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = 0x0564,
> +		[C(RESULT_MISS)] = 0x0565,
> +	},
> +},
> +[C(ITLB)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = 0x00c0,
> +		[C(RESULT_MISS)] = 0x0534,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +},
> +[C(BPU)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = 0x0700,
> +		[C(RESULT_MISS)] = 0x0709,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +},
> +[C(NODE)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +},
> +};
> +
> +static __initconst const u64 zxe_hw_cache_event_ids
> +				[PERF_COUNT_HW_CACHE_MAX]
> +				[PERF_COUNT_HW_CACHE_OP_MAX]
> +				[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
> +[C(L1D)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = 0x0568,
> +		[C(RESULT_MISS)] = 0x054b,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = 0x0669,
> +		[C(RESULT_MISS)] = 0x0562,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +},
> +[C(L1I)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = 0x0300,
> +		[C(RESULT_MISS)] = 0x0301,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = 0x030a,
> +		[C(RESULT_MISS)] = 0x030b,
> +	},
> +},
> +[C(LL)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = 0x0,
> +		[C(RESULT_MISS)] = 0x0,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = 0x0,
> +		[C(RESULT_MISS)] = 0x0,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = 0x0,
> +		[C(RESULT_MISS)] = 0x0,
> +	},
> +},
> +[C(DTLB)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = 0x0568,
> +		[C(RESULT_MISS)] = 0x052c,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = 0x0669,
> +		[C(RESULT_MISS)] = 0x0530,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = 0x0564,
> +		[C(RESULT_MISS)] = 0x0565,
> +	},
> +},
> +[C(ITLB)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = 0x00c0,
> +		[C(RESULT_MISS)] = 0x0534,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +},
> +[C(BPU)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = 0x0028,
> +		[C(RESULT_MISS)] = 0x0029,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +},
> +[C(NODE)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +},
> +};
> +
> +static void zhaoxin_pmu_disable_all(void)
> +{
> +	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
> +}
> +
> +static void zhaoxin_pmu_enable_all(int added)
> +{
> +	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
> +}
> +
> +static inline u64 zhaoxin_pmu_get_status(void)
> +{
> +	u64 status;
> +
> +	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
> +
> +	return status;
> +}
> +
> +static inline void zhaoxin_pmu_ack_status(u64 ack)
> +{
> +	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
> +}
> +
> +static inline void zxc_pmu_ack_status(u64 ack)
> +{
> +	/*
> +	 * ZXC needs global control enabled in order to clear status bits.
> +	 */
> +	zhaoxin_pmu_enable_all(0);
> +	zhaoxin_pmu_ack_status(ack);
> +	zhaoxin_pmu_disable_all();
> +}
> +
> +static void zhaoxin_pmu_disable_fixed(struct hw_perf_event *hwc)
> +{
> +	int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
> +	u64 ctrl_val, mask;
> +
> +	mask = 0xfULL << (idx * 4);
> +
> +	rdmsrl(hwc->config_base, ctrl_val);
> +	ctrl_val &= ~mask;
> +	wrmsrl(hwc->config_base, ctrl_val);
> +}
> +
> +static void zhaoxin_pmu_disable_event(struct perf_event *event)
> +{
> +	struct hw_perf_event *hwc = &event->hw;
> +
> +	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
> +		zhaoxin_pmu_disable_fixed(hwc);
> +		return;
> +	}
> +
> +	x86_pmu_disable_event(event);
> +}
> +
> +static void zhaoxin_pmu_enable_fixed(struct hw_perf_event *hwc)
> +{
> +	int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
> +	u64 ctrl_val, bits, mask;
> +
> +	/*
> +	 * Enable IRQ generation (0x8),
> +	 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
> +	 * if requested:
> +	 */
> +	bits = 0x8ULL;
> +	if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
> +		bits |= 0x2;
> +	if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
> +		bits |= 0x1;
> +
> +	bits <<= (idx * 4);
> +	mask = 0xfULL << (idx * 4);
> +
> +	rdmsrl(hwc->config_base, ctrl_val);
> +	ctrl_val &= ~mask;
> +	ctrl_val |= bits;
> +	wrmsrl(hwc->config_base, ctrl_val);
> +}
> +
> +static void zhaoxin_pmu_enable_event(struct perf_event *event)
> +{
> +	struct hw_perf_event *hwc = &event->hw;
> +
> +	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
> +		zhaoxin_pmu_enable_fixed(hwc);
> +		return;
> +	}
> +
> +	__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
> +}
> +
> +/*
> + * This handler is triggered by the local APIC, so the APIC IRQ handling
> + * rules apply:
> + */
> +static int zhaoxin_pmu_handle_irq(struct pt_regs *regs)
> +{
> +	struct perf_sample_data data;
> +	struct cpu_hw_events *cpuc;
> +	int handled = 0;
> +	u64 status;
> +	int bit;
> +
> +	cpuc = this_cpu_ptr(&cpu_hw_events);
> +	apic_write(APIC_LVTPC, APIC_DM_NMI);
> +	zhaoxin_pmu_disable_all();
> +	status = zhaoxin_pmu_get_status();
> +	if (!status)
> +		goto done;
> +
> +again:
> +	if (x86_pmu.enabled_ack)
> +		zxc_pmu_ack_status(status);
> +	else
> +		zhaoxin_pmu_ack_status(status);
> +
> +	inc_irq_stat(apic_perf_irqs);
> +
> +	/*
> +	 * CondChgd bit 63 doesn't mean any overflow status. Ignore
> +	 * and clear the bit.
> +	 */
> +	if (__test_and_clear_bit(63, (unsigned long *)&status)) {
> +		if (!status)
> +			goto done;
> +	}
> +
> +	for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
> +		struct perf_event *event = cpuc->events[bit];
> +
> +		handled++;
> +
> +		if (!test_bit(bit, cpuc->active_mask))
> +			continue;
> +
> +		x86_perf_event_update(event);
> +		perf_sample_data_init(&data, 0, event->hw.last_period);
> +
> +		if (!x86_perf_event_set_period(event))
> +			continue;
> +
> +		if (perf_event_overflow(event, &data, regs))
> +			x86_pmu_stop(event, 0);
> +	}
> +
> +	/*
> +	 * Repeat if there is more work to be done:
> +	 */
> +	status = zhaoxin_pmu_get_status();
> +	if (status)
> +		goto again;
> +
> +done:
> +	zhaoxin_pmu_enable_all(0);
> +	return handled;
> +}
> +
> +static u64 zhaoxin_pmu_event_map(int hw_event)
> +{
> +	return zx_pmon_event_map[hw_event];
> +}
> +
> +static struct event_constraint *
> +zhaoxin_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
> +			struct perf_event *event)
> +{
> +	struct event_constraint *c;
> +
> +	if (x86_pmu.event_constraints) {
> +		for_each_event_constraint(c, x86_pmu.event_constraints) {
> +			if ((event->hw.config & c->cmask) == c->code)
> +				return c;
> +		}
> +	}
> +
> +	return &unconstrained;
> +}
> +
> +PMU_FORMAT_ATTR(event,	"config:0-7");
> +PMU_FORMAT_ATTR(umask,	"config:8-15");
> +PMU_FORMAT_ATTR(edge,	"config:18");
> +PMU_FORMAT_ATTR(inv,	"config:23");
> +PMU_FORMAT_ATTR(cmask,	"config:24-31");
> +
> +static struct attribute *zx_arch_formats_attr[] = {
> +	&format_attr_event.attr,
> +	&format_attr_umask.attr,
> +	&format_attr_edge.attr,
> +	&format_attr_inv.attr,
> +	&format_attr_cmask.attr,
> +	NULL,
> +};
> +
> +static ssize_t zhaoxin_event_sysfs_show(char *page, u64 config)
> +{
> +	u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT);
> +
> +	return x86_event_sysfs_show(page, config, event);
> +}
> +
> +static const struct x86_pmu zhaoxin_pmu __initconst = {
> +	.name			= "zhaoxin",
> +	.handle_irq		= zhaoxin_pmu_handle_irq,
> +	.disable_all		= zhaoxin_pmu_disable_all,
> +	.enable_all		= zhaoxin_pmu_enable_all,
> +	.enable			= zhaoxin_pmu_enable_event,
> +	.disable		= zhaoxin_pmu_disable_event,
> +	.hw_config		= x86_pmu_hw_config,
> +	.schedule_events	= x86_schedule_events,
> +	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
> +	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
> +	.event_map		= zhaoxin_pmu_event_map,
> +	.max_events		= ARRAY_SIZE(zx_pmon_event_map),
> +	.apic			= 1,
> +	/*
> +	 * For zxd/zxe, read/write operation for PMCx MSR is 48 bits.
> +	 */
> +	.max_period		= (1ULL << 47) - 1,
> +	.get_event_constraints	= zhaoxin_get_event_constraints,
> +
> +	.format_attrs		= zx_arch_formats_attr,
> +	.events_sysfs_show	= zhaoxin_event_sysfs_show,
> +};
> +
> +static const struct { int id; char *name; } zx_arch_events_map[] __initconst = {
> +	{ PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" },
> +	{ PERF_COUNT_HW_INSTRUCTIONS, "instructions" },
> +	{ PERF_COUNT_HW_BUS_CYCLES, "bus cycles" },
> +	{ PERF_COUNT_HW_CACHE_REFERENCES, "cache references" },
> +	{ PERF_COUNT_HW_CACHE_MISSES, "cache misses" },
> +	{ PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" },
> +	{ PERF_COUNT_HW_BRANCH_MISSES, "branch misses" },
> +};
> +
> +static __init void zhaoxin_arch_events_quirk(void)
> +{
> +	int bit;
> +
> +	/* disable event that reported as not presend by cpuid */
> +	for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(zx_arch_events_map)) {
> +		zx_pmon_event_map[zx_arch_events_map[bit].id] = 0;
> +		pr_warn("CPUID marked event: \'%s\' unavailable\n",
> +			zx_arch_events_map[bit].name);
> +	}
> +}
> +
> +__init int zhaoxin_pmu_init(void)
> +{
> +	union cpuid10_edx edx;
> +	union cpuid10_eax eax;
> +	union cpuid10_ebx ebx;
> +	struct event_constraint *c;
> +	unsigned int unused;
> +	int version;
> +
> +	pr_info("Welcome to zhaoxin pmu!\n");
> +
> +	/*
> +	 * Check whether the Architectural PerfMon supports
> +	 * hw_event or not.
> +	 */
> +	cpuid(10, &eax.full, &ebx.full, &unused, &edx.full);
> +
> +	if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT - 1)
> +		return -ENODEV;
> +
> +	version = eax.split.version_id;
> +	if (version != 2)
> +		return -ENODEV;
> +
> +	x86_pmu = zhaoxin_pmu;
> +	pr_info("Version check pass!\n");
> +
> +	x86_pmu.version			= version;
> +	x86_pmu.num_counters		= eax.split.num_counters;
> +	x86_pmu.cntval_bits		= eax.split.bit_width;
> +	x86_pmu.cntval_mask		= (1ULL << eax.split.bit_width) - 1;
> +	x86_pmu.events_maskl		= ebx.full;
> +	x86_pmu.events_mask_len		= eax.split.mask_length;
> +
> +	x86_pmu.num_counters_fixed = edx.split.num_counters_fixed;
> +	x86_add_quirk(zhaoxin_arch_events_quirk);
> +
> +	switch (boot_cpu_data.x86) {
> +	case 0x06:
> +		if (boot_cpu_data.x86_model == 0x0f || boot_cpu_data.x86_model == 0x19) {
> +
> +			x86_pmu.max_period = x86_pmu.cntval_mask >> 1;
> +
> +			/* Clearing status works only if the global control is enable on zxc. */
> +			x86_pmu.enabled_ack = 1;
> +
> +			x86_pmu.event_constraints = zxc_event_constraints;
> +			zx_pmon_event_map[PERF_COUNT_HW_INSTRUCTIONS] = 0;
> +			zx_pmon_event_map[PERF_COUNT_HW_CACHE_REFERENCES] = 0;
> +			zx_pmon_event_map[PERF_COUNT_HW_CACHE_MISSES] = 0;
> +			zx_pmon_event_map[PERF_COUNT_HW_BUS_CYCLES] = 0;
> +
> +			pr_cont("ZXC events, ");
> +			break;
> +		}
> +		return -ENODEV;
> +
> +	case 0x07:
> +		zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
> +			X86_CONFIG(.event = 0x01, .umask = 0x01, .inv = 0x01, .cmask = 0x01);
> +
> +		zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
> +			X86_CONFIG(.event = 0x0f, .umask = 0x04, .inv = 0, .cmask = 0);
> +
> +		switch (boot_cpu_data.x86_model) {
> +		case 0x1b:
> +			memcpy(hw_cache_event_ids, zxd_hw_cache_event_ids,
> +			       sizeof(hw_cache_event_ids));
> +
> +			x86_pmu.event_constraints = zxd_event_constraints;
> +
> +			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0700;
> +			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x0709;
> +
> +			pr_cont("ZXD events, ");
> +			break;
> +		case 0x3b:
> +			memcpy(hw_cache_event_ids, zxe_hw_cache_event_ids,
> +			       sizeof(hw_cache_event_ids));
> +
> +			x86_pmu.event_constraints = zxd_event_constraints;
> +
> +			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0028;
> +			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x0029;
> +
> +			pr_cont("ZXE events, ");
> +			break;
> +		default:
> +			return -ENODEV;
> +		}
> +		break;
> +
> +	default:
> +		return -ENODEV;
> +	}
> +
> +	x86_pmu.intel_ctrl = (1 << (x86_pmu.num_counters)) - 1;
> +	x86_pmu.intel_ctrl |= ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
> +
> +	if (x86_pmu.event_constraints) {
> +		for_each_event_constraint(c, x86_pmu.event_constraints) {
> +			c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
> +			c->weight += x86_pmu.num_counters;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> --- a/arch/x86/kernel/cpu/perfctr-watchdog.c
> +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
> @@ -63,6 +63,10 @@ static inline unsigned int nmi_perfctr_m
>   		case 15:
>   			return msr - MSR_P4_BPU_PERFCTR0;
>   		}
> +		fallthrough;
> +	case X86_VENDOR_ZHAOXIN:
> +	case X86_VENDOR_CENTAUR:
> +		return msr - MSR_ARCH_PERFMON_PERFCTR0;
>   	}
>   	return 0;
>   }
> @@ -92,6 +96,10 @@ static inline unsigned int nmi_evntsel_m
>   		case 15:
>   			return msr - MSR_P4_BSU_ESCR0;
>   		}
> +		fallthrough;
> +	case X86_VENDOR_ZHAOXIN:
> +	case X86_VENDOR_CENTAUR:
> +		return msr - MSR_ARCH_PERFMON_EVENTSEL0;
>   	}
>   	return 0;
>   
> .
> 
Dear Peter,

I'm so sorry that my coding errors have caused a misunderstanding. The 
above 'fallthrough' should be 'break' in perfctr-watchdog.c, I changed 
it when I resend the patch. But it looks like that there is no need to 
resend the patch, becasue I got an email on May 2nd from tip-bot2, which 
tells me this commit has been merged into the perf/core branch of tip.

Is there any way to change 'fallthrough' to 'break'? I guess I have no 
permission to change the perf/core branch of tip, if you can, please help.

Thank you very much for your patience and kind help.

Cody.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH] x86/perf: Add hardware performance events support for Zhaoxin CPU.
@ 2020-04-27 12:30 CodyYao-oc
  0 siblings, 0 replies; 15+ messages in thread
From: CodyYao-oc @ 2020-04-27 12:30 UTC (permalink / raw)
  To: peterz, mingo, acme, mark.rutland, alexander.shishkin, jolsa,
	namhyung, tglx, bp, x86, hpa, linux-kernel
  Cc: cooperyan, codyyao, shyaololo, CodyYao-oc

Zhaoxin CPU has provided facilities for monitoring performance
via PMU (Performance Monitor Unit), but the functionality is unused so far.
Therefore, add support for zhaoxin pmu to make performance related
hardware events available.

The PMU is mostly an Intel Architectural PerfMon-v2 with a novel
errata for the ZXC line. It supports the following events:

  -----------------------------------------------------------------------------------------------------------------------------------
  Event                      | Event  | Umask |          Description
			     | Select |       |
  -----------------------------------------------------------------------------------------------------------------------------------
  cpu-cycles                 |  82h   |  00h  | unhalt core clock
  instructions               |  00h   |  00h  | number of instructions at retirement.
  cache-references           |  15h   |  05h  | number of fillq pushs at the current cycle.
  cache-misses               |  1ah   |  05h  | number of l2 miss pushed by fillq.
  branch-instructions        |  28h   |  00h  | counts the number of branch instructions retired.
  branch-misses              |  29h   |  00h  | mispredicted branch instructions at retirement.
  bus-cycles                 |  83h   |  00h  | unhalt bus clock
  stalled-cycles-frontend    |  01h   |  01h  | Increments each cycle the # of Uops issued by the RAT to RS.
  stalled-cycles-backend     |  0fh   |  04h  | RS0/1/2/3/45 empty
  L1-dcache-loads            |  68h   |  05h  | number of retire/commit load.
  L1-dcache-load-misses      |  4bh   |  05h  | retired load uops whose data source followed an L1 miss.
  L1-dcache-stores           |  69h   |  06h  | number of retire/commit Store,no LEA
  L1-dcache-store-misses     |  62h   |  05h  | cache lines in M state evicted out of L1D due to Snoop HitM or dirty line replacement.
  L1-icache-loads            |  00h   |  03h  | number of l1i cache access for valid normal fetch,including un-cacheable access.
  L1-icache-load-misses      |  01h   |  03h  | number of l1i cache miss for valid normal fetch,including un-cacheable miss.
  L1-icache-prefetches       |  0ah   |  03h  | number of prefetch.
  L1-icache-prefetch-misses  |  0bh   |  03h  | number of prefetch miss.
  dTLB-loads                 |  68h   |  05h  | number of retire/commit load
  dTLB-load-misses           |  2ch   |  05h  | number of load operations miss all level tlbs and cause a tablewalk.
  dTLB-stores                |  69h   |  06h  | number of retire/commit Store,no LEA
  dTLB-store-misses          |  30h   |  05h  | number of store operations miss all level tlbs and cause a tablewalk.
  dTLB-prefetches            |  64h   |  05h  | number of hardware pte prefetch requests dispatched out of the prefetch FIFO.
  dTLB-prefetch-misses       |  65h   |  05h  | number of hardware pte prefetch requests miss the l1d data cache.
  iTLB-load                  |  00h   |  00h  | actually counter instructions.
  iTLB-load-misses           |  34h   |  05h  | number of code operations miss all level tlbs and cause a tablewalk.
  -----------------------------------------------------------------------------------------------------------------------------------

[All warnings (new ones prefixed by >>):

>> arch/x86/events/zhaoxin/core.c:362:6: warning: variable 'is_zxc' is used uninitialized whenever 'if' condition is false [-Wsometimes-uninitialized]
           if (boot_cpu_data.x86 == 0x06 &&
               ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
   arch/x86/events/zhaoxin/core.c:369:6: note: uninitialized use occurs here
           if (is_zxc)
               ^~~~~~
   arch/x86/events/zhaoxin/core.c:362:2: note: remove the 'if' if its condition is always true
           if (boot_cpu_data.x86 == 0x06 &&
           ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>> arch/x86/events/zhaoxin/core.c:362:6: warning: variable 'is_zxc' is used uninitialized whenever '&&' condition is false [-Wsometimes-uninitialized]
           if (boot_cpu_data.x86 == 0x06 &&
               ^~~~~~~~~~~~~~~~~~~~~~~~~
   arch/x86/events/zhaoxin/core.c:369:6: note: uninitialized use occurs here
           if (is_zxc)
               ^~~~~~
   arch/x86/events/zhaoxin/core.c:362:6: note: remove the '&&' if its condition is always true
           if (boot_cpu_data.x86 == 0x06 &&
               ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
   arch/x86/events/zhaoxin/core.c:352:13: note: initialize the variable 'is_zxc' to silence this warning
           bool is_zxc;
                      ^
                       = 0
   2 warnings generated.]

Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: CodyYao-oc <CodyYao-oc@zhaoxin.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/events/Makefile               |   2 +
 arch/x86/events/core.c                 |   4 +
 arch/x86/events/perf_event.h           |  10 +
 arch/x86/events/zhaoxin/Makefile       |   2 +
 arch/x86/events/zhaoxin/core.c         | 613 +++++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/perfctr-watchdog.c |   8 +
 6 files changed, 639 insertions(+)
 create mode 100644 arch/x86/events/zhaoxin/Makefile
 create mode 100644 arch/x86/events/zhaoxin/core.c

diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile
index 9e07f55..6f1d1fd 100644
--- a/arch/x86/events/Makefile
+++ b/arch/x86/events/Makefile
@@ -3,3 +3,5 @@ obj-y					+= core.o probe.o
 obj-y					+= amd/
 obj-$(CONFIG_X86_LOCAL_APIC)            += msr.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= intel/
+obj-$(CONFIG_CPU_SUP_CENTAUR)		+= zhaoxin/
+obj-$(CONFIG_CPU_SUP_ZHAOXIN)		+= zhaoxin/
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index a619763..9e63ee5 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1839,6 +1839,10 @@ static int __init init_hw_perf_events(void)
 		err = amd_pmu_init();
 		x86_pmu.name = "HYGON";
 		break;
+	case X86_VENDOR_ZHAOXIN:
+	case X86_VENDOR_CENTAUR:
+		err = zhaoxin_pmu_init();
+		break;
 	default:
 		err = -ENOTSUPP;
 	}
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index f1cd1ca..e17a3d8 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -618,6 +618,7 @@ struct x86_pmu {
 
 	/* PMI handler bits */
 	unsigned int	late_ack		:1,
+			enabled_ack		:1,
 			counter_freezing	:1;
 	/*
 	 * sysfs attrs
@@ -1133,3 +1134,12 @@ static inline int is_ht_workaround_enabled(void)
 	return 0;
 }
 #endif /* CONFIG_CPU_SUP_INTEL */
+
+#if ((defined CONFIG_CPU_SUP_CENTAUR) || (defined CONFIG_CPU_SUP_ZHAOXIN))
+int zhaoxin_pmu_init(void);
+#else
+static inline int zhaoxin_pmu_init(void)
+{
+	return 0;
+}
+#endif /*CONFIG_CPU_SUP_CENTAUR or CONFIG_CPU_SUP_ZHAOXIN*/
diff --git a/arch/x86/events/zhaoxin/Makefile b/arch/x86/events/zhaoxin/Makefile
new file mode 100644
index 0000000..642c1174
--- /dev/null
+++ b/arch/x86/events/zhaoxin/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-y	+= core.o
diff --git a/arch/x86/events/zhaoxin/core.c b/arch/x86/events/zhaoxin/core.c
new file mode 100644
index 0000000..898fa1a
--- /dev/null
+++ b/arch/x86/events/zhaoxin/core.c
@@ -0,0 +1,613 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Zhoaxin PMU; like Intel Architectural PerfMon-v2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/stddef.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/nmi.h>
+
+#include <asm/cpufeature.h>
+#include <asm/hardirq.h>
+#include <asm/apic.h>
+
+#include "../perf_event.h"
+
+/*
+ * Zhaoxin PerfMon, used on zxc and later.
+ */
+static u64 zx_pmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = {
+
+	[PERF_COUNT_HW_CPU_CYCLES]        = 0x0082,
+	[PERF_COUNT_HW_INSTRUCTIONS]      = 0x00c0,
+	[PERF_COUNT_HW_CACHE_REFERENCES]  = 0x0515,
+	[PERF_COUNT_HW_CACHE_MISSES]      = 0x051a,
+	[PERF_COUNT_HW_BUS_CYCLES]        = 0x0083,
+};
+
+static struct event_constraint zxc_event_constraints[] __read_mostly = {
+
+	FIXED_EVENT_CONSTRAINT(0x0082, 1), /* unhalted core clock cycles */
+	EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint zxd_event_constraints[] __read_mostly = {
+
+	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* retired instructions */
+	FIXED_EVENT_CONSTRAINT(0x0082, 1), /* unhalted core clock cycles */
+	FIXED_EVENT_CONSTRAINT(0x0083, 2), /* unhalted bus clock cycles */
+	EVENT_CONSTRAINT_END
+};
+
+static __initconst const u64 zxd_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+[C(L1D)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0042,
+		[C(RESULT_MISS)] = 0x0538,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = 0x0043,
+		[C(RESULT_MISS)] = 0x0562,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(L1I)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0300,
+		[C(RESULT_MISS)] = 0x0301,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = 0x030a,
+		[C(RESULT_MISS)] = 0x030b,
+	},
+},
+[C(LL)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(DTLB)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0042,
+		[C(RESULT_MISS)] = 0x052c,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = 0x0043,
+		[C(RESULT_MISS)] = 0x0530,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = 0x0564,
+		[C(RESULT_MISS)] = 0x0565,
+	},
+},
+[C(ITLB)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x00c0,
+		[C(RESULT_MISS)] = 0x0534,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(BPU)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0700,
+		[C(RESULT_MISS)] = 0x0709,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(NODE)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+};
+
+static __initconst const u64 zxe_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+[C(L1D)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0568,
+		[C(RESULT_MISS)] = 0x054b,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = 0x0669,
+		[C(RESULT_MISS)] = 0x0562,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(L1I)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0300,
+		[C(RESULT_MISS)] = 0x0301,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = 0x030a,
+		[C(RESULT_MISS)] = 0x030b,
+	},
+},
+[C(LL)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0,
+		[C(RESULT_MISS)] = 0x0,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = 0x0,
+		[C(RESULT_MISS)] = 0x0,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = 0x0,
+		[C(RESULT_MISS)] = 0x0,
+	},
+},
+[C(DTLB)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0568,
+		[C(RESULT_MISS)] = 0x052c,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = 0x0669,
+		[C(RESULT_MISS)] = 0x0530,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = 0x0564,
+		[C(RESULT_MISS)] = 0x0565,
+	},
+},
+[C(ITLB)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x00c0,
+		[C(RESULT_MISS)] = 0x0534,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(BPU)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0028,
+		[C(RESULT_MISS)] = 0x0029,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(NODE)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+};
+
+static void zhaoxin_pmu_disable_all(void)
+{
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+}
+
+static void zhaoxin_pmu_enable_all(int added)
+{
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+}
+
+static inline u64 zhaoxin_pmu_get_status(void)
+{
+	u64 status;
+
+	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+
+	return status;
+}
+
+static inline void zhaoxin_pmu_ack_status(u64 ack)
+{
+	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
+}
+
+static inline void zxc_pmu_ack_status(u64 ack)
+{
+	/*
+	 * ZXC needs global control enabled in order to clear status bits.
+	 */
+	zhaoxin_pmu_enable_all(0);
+	zhaoxin_pmu_ack_status(ack);
+	zhaoxin_pmu_disable_all();
+}
+
+static void zhaoxin_pmu_disable_fixed(struct hw_perf_event *hwc)
+{
+	int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
+	u64 ctrl_val, mask;
+
+	mask = 0xfULL << (idx * 4);
+
+	rdmsrl(hwc->config_base, ctrl_val);
+	ctrl_val &= ~mask;
+	wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static void zhaoxin_pmu_disable_event(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+		zhaoxin_pmu_disable_fixed(hwc);
+		return;
+	}
+
+	x86_pmu_disable_event(event);
+}
+
+static void zhaoxin_pmu_enable_fixed(struct hw_perf_event *hwc)
+{
+	int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
+	u64 ctrl_val, bits, mask;
+
+	/*
+	 * Enable IRQ generation (0x8),
+	 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
+	 * if requested:
+	 */
+	bits = 0x8ULL;
+	if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
+		bits |= 0x2;
+	if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+		bits |= 0x1;
+
+	bits <<= (idx * 4);
+	mask = 0xfULL << (idx * 4);
+
+	rdmsrl(hwc->config_base, ctrl_val);
+	ctrl_val &= ~mask;
+	ctrl_val |= bits;
+	wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static void zhaoxin_pmu_enable_event(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+		zhaoxin_pmu_enable_fixed(hwc);
+		return;
+	}
+
+	__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
+}
+
+/*
+ * This handler is triggered by the local APIC, so the APIC IRQ handling
+ * rules apply:
+ */
+static int zhaoxin_pmu_handle_irq(struct pt_regs *regs)
+{
+	struct perf_sample_data data;
+	struct cpu_hw_events *cpuc;
+	int handled = 0;
+	u64 status;
+	int bit;
+
+	cpuc = this_cpu_ptr(&cpu_hw_events);
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+	zhaoxin_pmu_disable_all();
+	status = zhaoxin_pmu_get_status();
+	if (!status)
+		goto done;
+
+again:
+	if (x86_pmu.enabled_ack)
+		zxc_pmu_ack_status(status);
+	else
+		zhaoxin_pmu_ack_status(status);
+
+	inc_irq_stat(apic_perf_irqs);
+
+	/*
+	 * CondChgd bit 63 doesn't mean any overflow status. Ignore
+	 * and clear the bit.
+	 */
+	if (__test_and_clear_bit(63, (unsigned long *)&status)) {
+		if (!status)
+			goto done;
+	}
+
+	for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
+		struct perf_event *event = cpuc->events[bit];
+
+		handled++;
+
+		if (!test_bit(bit, cpuc->active_mask))
+			continue;
+
+		x86_perf_event_update(event);
+		perf_sample_data_init(&data, 0, event->hw.last_period);
+
+		if (!x86_perf_event_set_period(event))
+			continue;
+
+		if (perf_event_overflow(event, &data, regs))
+			x86_pmu_stop(event, 0);
+	}
+
+	/*
+	 * Repeat if there is more work to be done:
+	 */
+	status = zhaoxin_pmu_get_status();
+	if (status)
+		goto again;
+
+done:
+	zhaoxin_pmu_enable_all(0);
+	return handled;
+}
+
+static u64 zhaoxin_pmu_event_map(int hw_event)
+{
+	return zx_pmon_event_map[hw_event];
+}
+
+static struct event_constraint *
+zhaoxin_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+			struct perf_event *event)
+{
+	struct event_constraint *c;
+
+	if (x86_pmu.event_constraints) {
+		for_each_event_constraint(c, x86_pmu.event_constraints) {
+			if ((event->hw.config & c->cmask) == c->code)
+				return c;
+		}
+	}
+
+	return &unconstrained;
+}
+
+PMU_FORMAT_ATTR(event,	"config:0-7");
+PMU_FORMAT_ATTR(umask,	"config:8-15");
+PMU_FORMAT_ATTR(edge,	"config:18");
+PMU_FORMAT_ATTR(inv,	"config:23");
+PMU_FORMAT_ATTR(cmask,	"config:24-31");
+
+static struct attribute *zx_arch_formats_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_umask.attr,
+	&format_attr_edge.attr,
+	&format_attr_inv.attr,
+	&format_attr_cmask.attr,
+	NULL,
+};
+
+static ssize_t zhaoxin_event_sysfs_show(char *page, u64 config)
+{
+	u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT);
+
+	return x86_event_sysfs_show(page, config, event);
+}
+
+static const struct x86_pmu zhaoxin_pmu __initconst = {
+	.name			= "zhaoxin",
+	.handle_irq		= zhaoxin_pmu_handle_irq,
+	.disable_all		= zhaoxin_pmu_disable_all,
+	.enable_all		= zhaoxin_pmu_enable_all,
+	.enable			= zhaoxin_pmu_enable_event,
+	.disable		= zhaoxin_pmu_disable_event,
+	.hw_config		= x86_pmu_hw_config,
+	.schedule_events	= x86_schedule_events,
+	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
+	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
+	.event_map		= zhaoxin_pmu_event_map,
+	.max_events		= ARRAY_SIZE(zx_pmon_event_map),
+	.apic			= 1,
+	/*
+	 * For zxd/zxe, read/write operation for PMCx MSR is 48 bits.
+	 */
+	.max_period		= (1ULL << 47) - 1,
+	.get_event_constraints	= zhaoxin_get_event_constraints,
+
+	.format_attrs		= zx_arch_formats_attr,
+	.events_sysfs_show	= zhaoxin_event_sysfs_show,
+};
+
+static const struct { int id; char *name; } zx_arch_events_map[] __initconst = {
+	{ PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" },
+	{ PERF_COUNT_HW_INSTRUCTIONS, "instructions" },
+	{ PERF_COUNT_HW_BUS_CYCLES, "bus cycles" },
+	{ PERF_COUNT_HW_CACHE_REFERENCES, "cache references" },
+	{ PERF_COUNT_HW_CACHE_MISSES, "cache misses" },
+	{ PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" },
+	{ PERF_COUNT_HW_BRANCH_MISSES, "branch misses" },
+};
+
+static __init void zhaoxin_arch_events_quirk(void)
+{
+	int bit;
+
+	/* disable event that reported as not presend by cpuid */
+	for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(zx_arch_events_map)) {
+		zx_pmon_event_map[zx_arch_events_map[bit].id] = 0;
+		pr_warn("CPUID marked event: \'%s\' unavailable\n",
+			zx_arch_events_map[bit].name);
+	}
+}
+
+__init int zhaoxin_pmu_init(void)
+{
+	union cpuid10_edx edx;
+	union cpuid10_eax eax;
+	union cpuid10_ebx ebx;
+	struct event_constraint *c;
+	unsigned int unused;
+	int version;
+
+	pr_info("Welcome to zhaoxin pmu!\n");
+
+	/*
+	 * Check whether the Architectural PerfMon supports
+	 * hw_event or not.
+	 */
+	cpuid(10, &eax.full, &ebx.full, &unused, &edx.full);
+
+	if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT - 1)
+		return -ENODEV;
+
+	version = eax.split.version_id;
+	if (version != 2)
+		return -ENODEV;
+
+	x86_pmu = zhaoxin_pmu;
+	pr_info("Version check pass!\n");
+
+	x86_pmu.version			= version;
+	x86_pmu.num_counters		= eax.split.num_counters;
+	x86_pmu.cntval_bits		= eax.split.bit_width;
+	x86_pmu.cntval_mask		= (1ULL << eax.split.bit_width) - 1;
+	x86_pmu.events_maskl		= ebx.full;
+	x86_pmu.events_mask_len		= eax.split.mask_length;
+
+	x86_pmu.num_counters_fixed = edx.split.num_counters_fixed;
+	x86_add_quirk(zhaoxin_arch_events_quirk);
+
+	switch (boot_cpu_data.x86) {
+	case 0x06:
+		if (boot_cpu_data.x86_model == 0x0f || boot_cpu_data.x86_model == 0x19) {
+
+			x86_pmu.max_period = x86_pmu.cntval_mask >> 1;
+
+			/* Clearing status works only if the global control is enable on zxc. */
+			x86_pmu.enabled_ack = 1;
+
+			x86_pmu.event_constraints = zxc_event_constraints;
+			zx_pmon_event_map[PERF_COUNT_HW_INSTRUCTIONS] = 0;
+			zx_pmon_event_map[PERF_COUNT_HW_CACHE_REFERENCES] = 0;
+			zx_pmon_event_map[PERF_COUNT_HW_CACHE_MISSES] = 0;
+			zx_pmon_event_map[PERF_COUNT_HW_BUS_CYCLES] = 0;
+
+			pr_cont("ZXC events, ");
+			break;
+		}
+		return -ENODEV;
+
+	case 0x07:
+		zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+			X86_CONFIG(.event = 0x01, .umask = 0x01, .inv = 0x01, .cmask = 0x01);
+
+		zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+			X86_CONFIG(.event = 0x0f, .umask = 0x04, .inv = 0, .cmask = 0);
+
+		switch (boot_cpu_data.x86_model) {
+		case 0x1b:
+			memcpy(hw_cache_event_ids, zxd_hw_cache_event_ids,
+			       sizeof(hw_cache_event_ids));
+
+			x86_pmu.event_constraints = zxd_event_constraints;
+
+			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0700;
+			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x0709;
+
+			pr_cont("ZXD events, ");
+			break;
+		case 0x3b:
+			memcpy(hw_cache_event_ids, zxe_hw_cache_event_ids,
+			       sizeof(hw_cache_event_ids));
+
+			x86_pmu.event_constraints = zxd_event_constraints;
+
+			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0028;
+			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x0029;
+
+			pr_cont("ZXE events, ");
+			break;
+		default:
+			return -ENODEV;
+		}
+		break;
+
+	default:
+		return -ENODEV;
+	}
+
+	x86_pmu.intel_ctrl = (1 << (x86_pmu.num_counters)) - 1;
+	x86_pmu.intel_ctrl |= ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
+
+	if (x86_pmu.event_constraints) {
+		for_each_event_constraint(c, x86_pmu.event_constraints) {
+			c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
+			c->weight += x86_pmu.num_counters;
+		}
+	}
+
+	return 0;
+}
+
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 9556930..a548d91 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -63,6 +63,10 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
 		case 15:
 			return msr - MSR_P4_BPU_PERFCTR0;
 		}
+		break;
+	case X86_VENDOR_ZHAOXIN:
+	case X86_VENDOR_CENTAUR:
+		return msr - MSR_ARCH_PERFMON_PERFCTR0;
 	}
 	return 0;
 }
@@ -92,6 +96,10 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
 		case 15:
 			return msr - MSR_P4_BSU_ESCR0;
 		}
+		break;
+	case X86_VENDOR_ZHAOXIN:
+	case X86_VENDOR_CENTAUR:
+		return msr - MSR_ARCH_PERFMON_EVENTSEL0;
 	}
 	return 0;
 
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/perf: Add hardware performance events support for Zhaoxin CPU.
  2020-04-19 13:10 CodyYao-oc
@ 2020-04-26  3:04 ` CodyYao-oc
  0 siblings, 0 replies; 15+ messages in thread
From: CodyYao-oc @ 2020-04-26  3:04 UTC (permalink / raw)
  To: peterz, mingo, acme, mark.rutland, alexander.shishkin, jolsa,
	namhyung, tglx, bp, x86, hpa, linux-kernel
  Cc: cooperyan, codyyao

On 2020/4/19 下午9:10, CodyYao-oc wrote:
> Zhaoxin CPU has provided facilities for monitoring performance
> via PMU (Performance Monitor Unit), but the functionality is unused so far.
> Therefore, add support for zhaoxin pmu to make performance related
> hardware events available.
> 
> The PMU is mostly an Intel Architectural PerfMon-v2 with a novel
> errata for the ZXC line. It supports the following events:
> 
>    -----------------------------------------------------------------------------------------------------------------------------------
>    Event                      | Event  | Umask |          Description
> 			     | Select |       |
>    -----------------------------------------------------------------------------------------------------------------------------------
>    cpu-cycles                 |  82h   |  00h  | unhalt core clock
>    instructions               |  00h   |  00h  | number of instructions at retirement.
>    cache-references           |  15h   |  05h  | number of fillq pushs at the current cycle.
>    cache-misses               |  1ah   |  05h  | number of l2 miss pushed by fillq.
>    branch-instructions        |  28h   |  00h  | counts the number of branch instructions retired.
>    branch-misses              |  29h   |  00h  | mispredicted branch instructions at retirement.
>    bus-cycles                 |  83h   |  00h  | unhalt bus clock
>    stalled-cycles-frontend    |  01h   |  01h  | Increments each cycle the # of Uops issued by the RAT to RS.
>    stalled-cycles-backend     |  0fh   |  04h  | RS0/1/2/3/45 empty
>    L1-dcache-loads            |  68h   |  05h  | number of retire/commit load.
>    L1-dcache-load-misses      |  4bh   |  05h  | retired load uops whose data source followed an L1 miss.
>    L1-dcache-stores           |  69h   |  06h  | number of retire/commit Store,no LEA
>    L1-dcache-store-misses     |  62h   |  05h  | cache lines in M state evicted out of L1D due to Snoop HitM or dirty line replacement.
>    L1-icache-loads            |  00h   |  03h  | number of l1i cache access for valid normal fetch,including un-cacheable access.
>    L1-icache-load-misses      |  01h   |  03h  | number of l1i cache miss for valid normal fetch,including un-cacheable miss.
>    L1-icache-prefetches       |  0ah   |  03h  | number of prefetch.
>    L1-icache-prefetch-misses  |  0bh   |  03h  | number of prefetch miss.
>    dTLB-loads                 |  68h   |  05h  | number of retire/commit load
>    dTLB-load-misses           |  2ch   |  05h  | number of load operations miss all level tlbs and cause a tablewalk.
>    dTLB-stores                |  69h   |  06h  | number of retire/commit Store,no LEA
>    dTLB-store-misses          |  30h   |  05h  | number of store operations miss all level tlbs and cause a tablewalk.
>    dTLB-prefetches            |  64h   |  05h  | number of hardware pte prefetch requests dispatched out of the prefetch FIFO.
>    dTLB-prefetch-misses       |  65h   |  05h  | number of hardware pte prefetch requests miss the l1d data cache.
>    iTLB-load                  |  00h   |  00h  | actually counter instructions.
>    iTLB-load-misses           |  34h   |  05h  | number of code operations miss all level tlbs and cause a tablewalk.
>    -----------------------------------------------------------------------------------------------------------------------------------
> 
> [All warnings (new ones prefixed by >>):
> 
>>> arch/x86/events/zhaoxin/core.c:362:6: warning: variable 'is_zxc' is used uninitialized whenever 'if' condition is false [-Wsometimes-uninitialized]
>             if (boot_cpu_data.x86 == 0x06 &&
>                 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
>     arch/x86/events/zhaoxin/core.c:369:6: note: uninitialized use occurs here
>             if (is_zxc)
>                 ^~~~~~
>     arch/x86/events/zhaoxin/core.c:362:2: note: remove the 'if' if its condition is always true
>             if (boot_cpu_data.x86 == 0x06 &&
>             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>>> arch/x86/events/zhaoxin/core.c:362:6: warning: variable 'is_zxc' is used uninitialized whenever '&&' condition is false [-Wsometimes-uninitialized]
>             if (boot_cpu_data.x86 == 0x06 &&
>                 ^~~~~~~~~~~~~~~~~~~~~~~~~
>     arch/x86/events/zhaoxin/core.c:369:6: note: uninitialized use occurs here
>             if (is_zxc)
>                 ^~~~~~
>     arch/x86/events/zhaoxin/core.c:362:6: note: remove the '&&' if its condition is always true
>             if (boot_cpu_data.x86 == 0x06 &&
>                 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
>     arch/x86/events/zhaoxin/core.c:352:13: note: initialize the variable 'is_zxc' to silence this warning
>             bool is_zxc;
>                        ^
>                         = 0
>     2 warnings generated.]
> 
> Reported-by: kbuild test robot <lkp@intel.com>
> Signed-off-by: CodyYao-oc <CodyYao-oc@zhaoxin.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>   arch/x86/events/Makefile               |   2 +
>   arch/x86/events/core.c                 |   4 +
>   arch/x86/events/perf_event.h           |  10 +
>   arch/x86/events/zhaoxin/Makefile       |   2 +
>   arch/x86/events/zhaoxin/core.c         | 613 +++++++++++++++++++++++++++++++++
>   arch/x86/kernel/cpu/perfctr-watchdog.c |   8 +
>   6 files changed, 639 insertions(+)
>   create mode 100644 arch/x86/events/zhaoxin/Makefile
>   create mode 100644 arch/x86/events/zhaoxin/core.c
> 
> diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile
> index 9e07f55..6f1d1fd 100644
> --- a/arch/x86/events/Makefile
> +++ b/arch/x86/events/Makefile
> @@ -3,3 +3,5 @@ obj-y					+= core.o probe.o
>   obj-y					+= amd/
>   obj-$(CONFIG_X86_LOCAL_APIC)            += msr.o
>   obj-$(CONFIG_CPU_SUP_INTEL)		+= intel/
> +obj-$(CONFIG_CPU_SUP_CENTAUR)		+= zhaoxin/
> +obj-$(CONFIG_CPU_SUP_ZHAOXIN)		+= zhaoxin/
> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> index a619763..9e63ee5 100644
> --- a/arch/x86/events/core.c
> +++ b/arch/x86/events/core.c
> @@ -1839,6 +1839,10 @@ static int __init init_hw_perf_events(void)
>   		err = amd_pmu_init();
>   		x86_pmu.name = "HYGON";
>   		break;
> +	case X86_VENDOR_ZHAOXIN:
> +	case X86_VENDOR_CENTAUR:
> +		err = zhaoxin_pmu_init();
> +		break;
>   	default:
>   		err = -ENOTSUPP;
>   	}
> diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
> index f1cd1ca..e17a3d8 100644
> --- a/arch/x86/events/perf_event.h
> +++ b/arch/x86/events/perf_event.h
> @@ -618,6 +618,7 @@ struct x86_pmu {
>   
>   	/* PMI handler bits */
>   	unsigned int	late_ack		:1,
> +			enabled_ack		:1,
>   			counter_freezing	:1;
>   	/*
>   	 * sysfs attrs
> @@ -1133,3 +1134,12 @@ static inline int is_ht_workaround_enabled(void)
>   	return 0;
>   }
>   #endif /* CONFIG_CPU_SUP_INTEL */
> +
> +#if ((defined CONFIG_CPU_SUP_CENTAUR) || (defined CONFIG_CPU_SUP_ZHAOXIN))
> +int zhaoxin_pmu_init(void);
> +#else
> +static inline int zhaoxin_pmu_init(void)
> +{
> +	return 0;
> +}
> +#endif /*CONFIG_CPU_SUP_CENTAUR or CONFIG_CPU_SUP_ZHAOXIN*/
> diff --git a/arch/x86/events/zhaoxin/Makefile b/arch/x86/events/zhaoxin/Makefile
> new file mode 100644
> index 0000000..642c1174
> --- /dev/null
> +++ b/arch/x86/events/zhaoxin/Makefile
> @@ -0,0 +1,2 @@
> +# SPDX-License-Identifier: GPL-2.0
> +obj-y	+= core.o
> diff --git a/arch/x86/events/zhaoxin/core.c b/arch/x86/events/zhaoxin/core.c
> new file mode 100644
> index 0000000..898fa1a
> --- /dev/null
> +++ b/arch/x86/events/zhaoxin/core.c
> @@ -0,0 +1,613 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Zhoaxin PMU; like Intel Architectural PerfMon-v2
> + */
> +
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +
> +#include <linux/stddef.h>
> +#include <linux/types.h>
> +#include <linux/init.h>
> +#include <linux/slab.h>
> +#include <linux/export.h>
> +#include <linux/nmi.h>
> +
> +#include <asm/cpufeature.h>
> +#include <asm/hardirq.h>
> +#include <asm/apic.h>
> +
> +#include "../perf_event.h"
> +
> +/*
> + * Zhaoxin PerfMon, used on zxc and later.
> + */
> +static u64 zx_pmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = {
> +
> +	[PERF_COUNT_HW_CPU_CYCLES]        = 0x0082,
> +	[PERF_COUNT_HW_INSTRUCTIONS]      = 0x00c0,
> +	[PERF_COUNT_HW_CACHE_REFERENCES]  = 0x0515,
> +	[PERF_COUNT_HW_CACHE_MISSES]      = 0x051a,
> +	[PERF_COUNT_HW_BUS_CYCLES]        = 0x0083,
> +};
> +
> +static struct event_constraint zxc_event_constraints[] __read_mostly = {
> +
> +	FIXED_EVENT_CONSTRAINT(0x0082, 1), /* unhalted core clock cycles */
> +	EVENT_CONSTRAINT_END
> +};
> +
> +static struct event_constraint zxd_event_constraints[] __read_mostly = {
> +
> +	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* retired instructions */
> +	FIXED_EVENT_CONSTRAINT(0x0082, 1), /* unhalted core clock cycles */
> +	FIXED_EVENT_CONSTRAINT(0x0083, 2), /* unhalted bus clock cycles */
> +	EVENT_CONSTRAINT_END
> +};
> +
> +static __initconst const u64 zxd_hw_cache_event_ids
> +				[PERF_COUNT_HW_CACHE_MAX]
> +				[PERF_COUNT_HW_CACHE_OP_MAX]
> +				[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
> +[C(L1D)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = 0x0042,
> +		[C(RESULT_MISS)] = 0x0538,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = 0x0043,
> +		[C(RESULT_MISS)] = 0x0562,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +},
> +[C(L1I)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = 0x0300,
> +		[C(RESULT_MISS)] = 0x0301,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = 0x030a,
> +		[C(RESULT_MISS)] = 0x030b,
> +	},
> +},
> +[C(LL)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +},
> +[C(DTLB)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = 0x0042,
> +		[C(RESULT_MISS)] = 0x052c,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = 0x0043,
> +		[C(RESULT_MISS)] = 0x0530,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = 0x0564,
> +		[C(RESULT_MISS)] = 0x0565,
> +	},
> +},
> +[C(ITLB)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = 0x00c0,
> +		[C(RESULT_MISS)] = 0x0534,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +},
> +[C(BPU)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = 0x0700,
> +		[C(RESULT_MISS)] = 0x0709,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +},
> +[C(NODE)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +},
> +};
> +
> +static __initconst const u64 zxe_hw_cache_event_ids
> +				[PERF_COUNT_HW_CACHE_MAX]
> +				[PERF_COUNT_HW_CACHE_OP_MAX]
> +				[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
> +[C(L1D)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = 0x0568,
> +		[C(RESULT_MISS)] = 0x054b,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = 0x0669,
> +		[C(RESULT_MISS)] = 0x0562,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +},
> +[C(L1I)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = 0x0300,
> +		[C(RESULT_MISS)] = 0x0301,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = 0x030a,
> +		[C(RESULT_MISS)] = 0x030b,
> +	},
> +},
> +[C(LL)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = 0x0,
> +		[C(RESULT_MISS)] = 0x0,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = 0x0,
> +		[C(RESULT_MISS)] = 0x0,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = 0x0,
> +		[C(RESULT_MISS)] = 0x0,
> +	},
> +},
> +[C(DTLB)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = 0x0568,
> +		[C(RESULT_MISS)] = 0x052c,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = 0x0669,
> +		[C(RESULT_MISS)] = 0x0530,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = 0x0564,
> +		[C(RESULT_MISS)] = 0x0565,
> +	},
> +},
> +[C(ITLB)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = 0x00c0,
> +		[C(RESULT_MISS)] = 0x0534,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +},
> +[C(BPU)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = 0x0028,
> +		[C(RESULT_MISS)] = 0x0029,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +},
> +[C(NODE)] = {
> +	[C(OP_READ)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_WRITE)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +	[C(OP_PREFETCH)] = {
> +		[C(RESULT_ACCESS)] = -1,
> +		[C(RESULT_MISS)] = -1,
> +	},
> +},
> +};
> +
> +static void zhaoxin_pmu_disable_all(void)
> +{
> +	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
> +}
> +
> +static void zhaoxin_pmu_enable_all(int added)
> +{
> +	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
> +}
> +
> +static inline u64 zhaoxin_pmu_get_status(void)
> +{
> +	u64 status;
> +
> +	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
> +
> +	return status;
> +}
> +
> +static inline void zhaoxin_pmu_ack_status(u64 ack)
> +{
> +	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
> +}
> +
> +static inline void zxc_pmu_ack_status(u64 ack)
> +{
> +	/*
> +	 * ZXC needs global control enabled in order to clear status bits.
> +	 */
> +	zhaoxin_pmu_enable_all(0);
> +	zhaoxin_pmu_ack_status(ack);
> +	zhaoxin_pmu_disable_all();
> +}
> +
> +static void zhaoxin_pmu_disable_fixed(struct hw_perf_event *hwc)
> +{
> +	int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
> +	u64 ctrl_val, mask;
> +
> +	mask = 0xfULL << (idx * 4);
> +
> +	rdmsrl(hwc->config_base, ctrl_val);
> +	ctrl_val &= ~mask;
> +	wrmsrl(hwc->config_base, ctrl_val);
> +}
> +
> +static void zhaoxin_pmu_disable_event(struct perf_event *event)
> +{
> +	struct hw_perf_event *hwc = &event->hw;
> +
> +	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
> +		zhaoxin_pmu_disable_fixed(hwc);
> +		return;
> +	}
> +
> +	x86_pmu_disable_event(event);
> +}
> +
> +static void zhaoxin_pmu_enable_fixed(struct hw_perf_event *hwc)
> +{
> +	int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
> +	u64 ctrl_val, bits, mask;
> +
> +	/*
> +	 * Enable IRQ generation (0x8),
> +	 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
> +	 * if requested:
> +	 */
> +	bits = 0x8ULL;
> +	if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
> +		bits |= 0x2;
> +	if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
> +		bits |= 0x1;
> +
> +	bits <<= (idx * 4);
> +	mask = 0xfULL << (idx * 4);
> +
> +	rdmsrl(hwc->config_base, ctrl_val);
> +	ctrl_val &= ~mask;
> +	ctrl_val |= bits;
> +	wrmsrl(hwc->config_base, ctrl_val);
> +}
> +
> +static void zhaoxin_pmu_enable_event(struct perf_event *event)
> +{
> +	struct hw_perf_event *hwc = &event->hw;
> +
> +	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
> +		zhaoxin_pmu_enable_fixed(hwc);
> +		return;
> +	}
> +
> +	__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
> +}
> +
> +/*
> + * This handler is triggered by the local APIC, so the APIC IRQ handling
> + * rules apply:
> + */
> +static int zhaoxin_pmu_handle_irq(struct pt_regs *regs)
> +{
> +	struct perf_sample_data data;
> +	struct cpu_hw_events *cpuc;
> +	int handled = 0;
> +	u64 status;
> +	int bit;
> +
> +	cpuc = this_cpu_ptr(&cpu_hw_events);
> +	apic_write(APIC_LVTPC, APIC_DM_NMI);
> +	zhaoxin_pmu_disable_all();
> +	status = zhaoxin_pmu_get_status();
> +	if (!status)
> +		goto done;
> +
> +again:
> +	if (x86_pmu.enabled_ack)
> +		zxc_pmu_ack_status(status);
> +	else
> +		zhaoxin_pmu_ack_status(status);
> +
> +	inc_irq_stat(apic_perf_irqs);
> +
> +	/*
> +	 * CondChgd bit 63 doesn't mean any overflow status. Ignore
> +	 * and clear the bit.
> +	 */
> +	if (__test_and_clear_bit(63, (unsigned long *)&status)) {
> +		if (!status)
> +			goto done;
> +	}
> +
> +	for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
> +		struct perf_event *event = cpuc->events[bit];
> +
> +		handled++;
> +
> +		if (!test_bit(bit, cpuc->active_mask))
> +			continue;
> +
> +		x86_perf_event_update(event);
> +		perf_sample_data_init(&data, 0, event->hw.last_period);
> +
> +		if (!x86_perf_event_set_period(event))
> +			continue;
> +
> +		if (perf_event_overflow(event, &data, regs))
> +			x86_pmu_stop(event, 0);
> +	}
> +
> +	/*
> +	 * Repeat if there is more work to be done:
> +	 */
> +	status = zhaoxin_pmu_get_status();
> +	if (status)
> +		goto again;
> +
> +done:
> +	zhaoxin_pmu_enable_all(0);
> +	return handled;
> +}
> +
> +static u64 zhaoxin_pmu_event_map(int hw_event)
> +{
> +	return zx_pmon_event_map[hw_event];
> +}
> +
> +static struct event_constraint *
> +zhaoxin_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
> +			struct perf_event *event)
> +{
> +	struct event_constraint *c;
> +
> +	if (x86_pmu.event_constraints) {
> +		for_each_event_constraint(c, x86_pmu.event_constraints) {
> +			if ((event->hw.config & c->cmask) == c->code)
> +				return c;
> +		}
> +	}
> +
> +	return &unconstrained;
> +}
> +
> +PMU_FORMAT_ATTR(event,	"config:0-7");
> +PMU_FORMAT_ATTR(umask,	"config:8-15");
> +PMU_FORMAT_ATTR(edge,	"config:18");
> +PMU_FORMAT_ATTR(inv,	"config:23");
> +PMU_FORMAT_ATTR(cmask,	"config:24-31");
> +
> +static struct attribute *zx_arch_formats_attr[] = {
> +	&format_attr_event.attr,
> +	&format_attr_umask.attr,
> +	&format_attr_edge.attr,
> +	&format_attr_inv.attr,
> +	&format_attr_cmask.attr,
> +	NULL,
> +};
> +
> +static ssize_t zhaoxin_event_sysfs_show(char *page, u64 config)
> +{
> +	u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT);
> +
> +	return x86_event_sysfs_show(page, config, event);
> +}
> +
> +static const struct x86_pmu zhaoxin_pmu __initconst = {
> +	.name			= "zhaoxin",
> +	.handle_irq		= zhaoxin_pmu_handle_irq,
> +	.disable_all		= zhaoxin_pmu_disable_all,
> +	.enable_all		= zhaoxin_pmu_enable_all,
> +	.enable			= zhaoxin_pmu_enable_event,
> +	.disable		= zhaoxin_pmu_disable_event,
> +	.hw_config		= x86_pmu_hw_config,
> +	.schedule_events	= x86_schedule_events,
> +	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
> +	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
> +	.event_map		= zhaoxin_pmu_event_map,
> +	.max_events		= ARRAY_SIZE(zx_pmon_event_map),
> +	.apic			= 1,
> +	/*
> +	 * For zxd/zxe, read/write operation for PMCx MSR is 48 bits.
> +	 */
> +	.max_period		= (1ULL << 47) - 1,
> +	.get_event_constraints	= zhaoxin_get_event_constraints,
> +
> +	.format_attrs		= zx_arch_formats_attr,
> +	.events_sysfs_show	= zhaoxin_event_sysfs_show,
> +};
> +
> +static const struct { int id; char *name; } zx_arch_events_map[] __initconst = {
> +	{ PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" },
> +	{ PERF_COUNT_HW_INSTRUCTIONS, "instructions" },
> +	{ PERF_COUNT_HW_BUS_CYCLES, "bus cycles" },
> +	{ PERF_COUNT_HW_CACHE_REFERENCES, "cache references" },
> +	{ PERF_COUNT_HW_CACHE_MISSES, "cache misses" },
> +	{ PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" },
> +	{ PERF_COUNT_HW_BRANCH_MISSES, "branch misses" },
> +};
> +
> +static __init void zhaoxin_arch_events_quirk(void)
> +{
> +	int bit;
> +
> +	/* disable event that reported as not presend by cpuid */
> +	for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(zx_arch_events_map)) {
> +		zx_pmon_event_map[zx_arch_events_map[bit].id] = 0;
> +		pr_warn("CPUID marked event: \'%s\' unavailable\n",
> +			zx_arch_events_map[bit].name);
> +	}
> +}
> +
> +__init int zhaoxin_pmu_init(void)
> +{
> +	union cpuid10_edx edx;
> +	union cpuid10_eax eax;
> +	union cpuid10_ebx ebx;
> +	struct event_constraint *c;
> +	unsigned int unused;
> +	int version;
> +
> +	pr_info("Welcome to zhaoxin pmu!\n");
> +
> +	/*
> +	 * Check whether the Architectural PerfMon supports
> +	 * hw_event or not.
> +	 */
> +	cpuid(10, &eax.full, &ebx.full, &unused, &edx.full);
> +
> +	if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT - 1)
> +		return -ENODEV;
> +
> +	version = eax.split.version_id;
> +	if (version != 2)
> +		return -ENODEV;
> +
> +	x86_pmu = zhaoxin_pmu;
> +	pr_info("Version check pass!\n");
> +
> +	x86_pmu.version			= version;
> +	x86_pmu.num_counters		= eax.split.num_counters;
> +	x86_pmu.cntval_bits		= eax.split.bit_width;
> +	x86_pmu.cntval_mask		= (1ULL << eax.split.bit_width) - 1;
> +	x86_pmu.events_maskl		= ebx.full;
> +	x86_pmu.events_mask_len		= eax.split.mask_length;
> +
> +	x86_pmu.num_counters_fixed = edx.split.num_counters_fixed;
> +	x86_add_quirk(zhaoxin_arch_events_quirk);
> +
> +	switch (boot_cpu_data.x86) {
> +	case 0x06:
> +		if (boot_cpu_data.x86_model == 0x0f || boot_cpu_data.x86_model == 0x19) {
> +
> +			x86_pmu.max_period = x86_pmu.cntval_mask >> 1;
> +
> +			/* Clearing status works only if the global control is enable on zxc. */
> +			x86_pmu.enabled_ack = 1;
> +
> +			x86_pmu.event_constraints = zxc_event_constraints;
> +			zx_pmon_event_map[PERF_COUNT_HW_INSTRUCTIONS] = 0;
> +			zx_pmon_event_map[PERF_COUNT_HW_CACHE_REFERENCES] = 0;
> +			zx_pmon_event_map[PERF_COUNT_HW_CACHE_MISSES] = 0;
> +			zx_pmon_event_map[PERF_COUNT_HW_BUS_CYCLES] = 0;
> +
> +			pr_cont("ZXC events, ");
> +			break;
> +		}
> +		return -ENODEV;
> +
> +	case 0x07:
> +		zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
> +			X86_CONFIG(.event = 0x01, .umask = 0x01, .inv = 0x01, .cmask = 0x01);
> +
> +		zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
> +			X86_CONFIG(.event = 0x0f, .umask = 0x04, .inv = 0, .cmask = 0);
> +
> +		switch (boot_cpu_data.x86_model) {
> +		case 0x1b:
> +			memcpy(hw_cache_event_ids, zxd_hw_cache_event_ids,
> +			       sizeof(hw_cache_event_ids));
> +
> +			x86_pmu.event_constraints = zxd_event_constraints;
> +
> +			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0700;
> +			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x0709;
> +
> +			pr_cont("ZXD events, ");
> +			break;
> +		case 0x3b:
> +			memcpy(hw_cache_event_ids, zxe_hw_cache_event_ids,
> +			       sizeof(hw_cache_event_ids));
> +
> +			x86_pmu.event_constraints = zxd_event_constraints;
> +
> +			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0028;
> +			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x0029;
> +
> +			pr_cont("ZXE events, ");
> +			break;
> +		default:
> +			return -ENODEV;
> +		}
> +		break;
> +
> +	default:
> +		return -ENODEV;
> +	}
> +
> +	x86_pmu.intel_ctrl = (1 << (x86_pmu.num_counters)) - 1;
> +	x86_pmu.intel_ctrl |= ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
> +
> +	if (x86_pmu.event_constraints) {
> +		for_each_event_constraint(c, x86_pmu.event_constraints) {
> +			c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
> +			c->weight += x86_pmu.num_counters;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
> index 9556930..a548d91 100644
> --- a/arch/x86/kernel/cpu/perfctr-watchdog.c
> +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
> @@ -63,6 +63,10 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
>   		case 15:
>   			return msr - MSR_P4_BPU_PERFCTR0;
>   		}
> +		break;
> +	case X86_VENDOR_ZHAOXIN:
> +	case X86_VENDOR_CENTAUR:
> +		return msr - MSR_ARCH_PERFMON_PERFCTR0;
>   	}
>   	return 0;
>   }
> @@ -92,6 +96,10 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
>   		case 15:
>   			return msr - MSR_P4_BSU_ESCR0;
>   		}
> +		break;
> +	case X86_VENDOR_ZHAOXIN:
> +	case X86_VENDOR_CENTAUR:
> +		return msr - MSR_ARCH_PERFMON_EVENTSEL0;
>   	}
>   	return 0;
>   
> 

Dear All,

I sent the patch two weeks ago, but there were some problems at that 
time. Peter helped me to optimize the code and modify the patch, then I 
resend it. But I haven't received a reply, so I wonder if this is 
normal, or if I should keep waiting.

Thanks
Cody

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH] x86/perf: Add hardware performance events support for Zhaoxin CPU.
@ 2020-04-19 13:10 CodyYao-oc
  2020-04-26  3:04 ` CodyYao-oc
  0 siblings, 1 reply; 15+ messages in thread
From: CodyYao-oc @ 2020-04-19 13:10 UTC (permalink / raw)
  To: peterz, mingo, acme, mark.rutland, alexander.shishkin, jolsa,
	namehyung, tglx, bp, x86, hpa, linux-kernel
  Cc: cooperyan, codyyao, CodyYao-oc

Zhaoxin CPU has provided facilities for monitoring performance
via PMU (Performance Monitor Unit), but the functionality is unused so far.
Therefore, add support for zhaoxin pmu to make performance related
hardware events available.

The PMU is mostly an Intel Architectural PerfMon-v2 with a novel
errata for the ZXC line. It supports the following events:

  -----------------------------------------------------------------------------------------------------------------------------------
  Event                      | Event  | Umask |          Description
			     | Select |       |
  -----------------------------------------------------------------------------------------------------------------------------------
  cpu-cycles                 |  82h   |  00h  | unhalt core clock
  instructions               |  00h   |  00h  | number of instructions at retirement.
  cache-references           |  15h   |  05h  | number of fillq pushs at the current cycle.
  cache-misses               |  1ah   |  05h  | number of l2 miss pushed by fillq.
  branch-instructions        |  28h   |  00h  | counts the number of branch instructions retired.
  branch-misses              |  29h   |  00h  | mispredicted branch instructions at retirement.
  bus-cycles                 |  83h   |  00h  | unhalt bus clock
  stalled-cycles-frontend    |  01h   |  01h  | Increments each cycle the # of Uops issued by the RAT to RS.
  stalled-cycles-backend     |  0fh   |  04h  | RS0/1/2/3/45 empty
  L1-dcache-loads            |  68h   |  05h  | number of retire/commit load.
  L1-dcache-load-misses      |  4bh   |  05h  | retired load uops whose data source followed an L1 miss.
  L1-dcache-stores           |  69h   |  06h  | number of retire/commit Store,no LEA
  L1-dcache-store-misses     |  62h   |  05h  | cache lines in M state evicted out of L1D due to Snoop HitM or dirty line replacement.
  L1-icache-loads            |  00h   |  03h  | number of l1i cache access for valid normal fetch,including un-cacheable access.
  L1-icache-load-misses      |  01h   |  03h  | number of l1i cache miss for valid normal fetch,including un-cacheable miss.
  L1-icache-prefetches       |  0ah   |  03h  | number of prefetch.
  L1-icache-prefetch-misses  |  0bh   |  03h  | number of prefetch miss.
  dTLB-loads                 |  68h   |  05h  | number of retire/commit load
  dTLB-load-misses           |  2ch   |  05h  | number of load operations miss all level tlbs and cause a tablewalk.
  dTLB-stores                |  69h   |  06h  | number of retire/commit Store,no LEA
  dTLB-store-misses          |  30h   |  05h  | number of store operations miss all level tlbs and cause a tablewalk.
  dTLB-prefetches            |  64h   |  05h  | number of hardware pte prefetch requests dispatched out of the prefetch FIFO.
  dTLB-prefetch-misses       |  65h   |  05h  | number of hardware pte prefetch requests miss the l1d data cache.
  iTLB-load                  |  00h   |  00h  | actually counter instructions.
  iTLB-load-misses           |  34h   |  05h  | number of code operations miss all level tlbs and cause a tablewalk.
  -----------------------------------------------------------------------------------------------------------------------------------

[All warnings (new ones prefixed by >>):

>> arch/x86/events/zhaoxin/core.c:362:6: warning: variable 'is_zxc' is used uninitialized whenever 'if' condition is false [-Wsometimes-uninitialized]
           if (boot_cpu_data.x86 == 0x06 &&
               ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
   arch/x86/events/zhaoxin/core.c:369:6: note: uninitialized use occurs here
           if (is_zxc)
               ^~~~~~
   arch/x86/events/zhaoxin/core.c:362:2: note: remove the 'if' if its condition is always true
           if (boot_cpu_data.x86 == 0x06 &&
           ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>> arch/x86/events/zhaoxin/core.c:362:6: warning: variable 'is_zxc' is used uninitialized whenever '&&' condition is false [-Wsometimes-uninitialized]
           if (boot_cpu_data.x86 == 0x06 &&
               ^~~~~~~~~~~~~~~~~~~~~~~~~
   arch/x86/events/zhaoxin/core.c:369:6: note: uninitialized use occurs here
           if (is_zxc)
               ^~~~~~
   arch/x86/events/zhaoxin/core.c:362:6: note: remove the '&&' if its condition is always true
           if (boot_cpu_data.x86 == 0x06 &&
               ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
   arch/x86/events/zhaoxin/core.c:352:13: note: initialize the variable 'is_zxc' to silence this warning
           bool is_zxc;
                      ^
                       = 0
   2 warnings generated.]

Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: CodyYao-oc <CodyYao-oc@zhaoxin.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/events/Makefile               |   2 +
 arch/x86/events/core.c                 |   4 +
 arch/x86/events/perf_event.h           |  10 +
 arch/x86/events/zhaoxin/Makefile       |   2 +
 arch/x86/events/zhaoxin/core.c         | 613 +++++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/perfctr-watchdog.c |   8 +
 6 files changed, 639 insertions(+)
 create mode 100644 arch/x86/events/zhaoxin/Makefile
 create mode 100644 arch/x86/events/zhaoxin/core.c

diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile
index 9e07f55..6f1d1fd 100644
--- a/arch/x86/events/Makefile
+++ b/arch/x86/events/Makefile
@@ -3,3 +3,5 @@ obj-y					+= core.o probe.o
 obj-y					+= amd/
 obj-$(CONFIG_X86_LOCAL_APIC)            += msr.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= intel/
+obj-$(CONFIG_CPU_SUP_CENTAUR)		+= zhaoxin/
+obj-$(CONFIG_CPU_SUP_ZHAOXIN)		+= zhaoxin/
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index a619763..9e63ee5 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1839,6 +1839,10 @@ static int __init init_hw_perf_events(void)
 		err = amd_pmu_init();
 		x86_pmu.name = "HYGON";
 		break;
+	case X86_VENDOR_ZHAOXIN:
+	case X86_VENDOR_CENTAUR:
+		err = zhaoxin_pmu_init();
+		break;
 	default:
 		err = -ENOTSUPP;
 	}
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index f1cd1ca..e17a3d8 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -618,6 +618,7 @@ struct x86_pmu {
 
 	/* PMI handler bits */
 	unsigned int	late_ack		:1,
+			enabled_ack		:1,
 			counter_freezing	:1;
 	/*
 	 * sysfs attrs
@@ -1133,3 +1134,12 @@ static inline int is_ht_workaround_enabled(void)
 	return 0;
 }
 #endif /* CONFIG_CPU_SUP_INTEL */
+
+#if ((defined CONFIG_CPU_SUP_CENTAUR) || (defined CONFIG_CPU_SUP_ZHAOXIN))
+int zhaoxin_pmu_init(void);
+#else
+static inline int zhaoxin_pmu_init(void)
+{
+	return 0;
+}
+#endif /*CONFIG_CPU_SUP_CENTAUR or CONFIG_CPU_SUP_ZHAOXIN*/
diff --git a/arch/x86/events/zhaoxin/Makefile b/arch/x86/events/zhaoxin/Makefile
new file mode 100644
index 0000000..642c1174
--- /dev/null
+++ b/arch/x86/events/zhaoxin/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-y	+= core.o
diff --git a/arch/x86/events/zhaoxin/core.c b/arch/x86/events/zhaoxin/core.c
new file mode 100644
index 0000000..898fa1a
--- /dev/null
+++ b/arch/x86/events/zhaoxin/core.c
@@ -0,0 +1,613 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Zhoaxin PMU; like Intel Architectural PerfMon-v2
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/stddef.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/nmi.h>
+
+#include <asm/cpufeature.h>
+#include <asm/hardirq.h>
+#include <asm/apic.h>
+
+#include "../perf_event.h"
+
+/*
+ * Zhaoxin PerfMon, used on zxc and later.
+ */
+static u64 zx_pmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = {
+
+	[PERF_COUNT_HW_CPU_CYCLES]        = 0x0082,
+	[PERF_COUNT_HW_INSTRUCTIONS]      = 0x00c0,
+	[PERF_COUNT_HW_CACHE_REFERENCES]  = 0x0515,
+	[PERF_COUNT_HW_CACHE_MISSES]      = 0x051a,
+	[PERF_COUNT_HW_BUS_CYCLES]        = 0x0083,
+};
+
+static struct event_constraint zxc_event_constraints[] __read_mostly = {
+
+	FIXED_EVENT_CONSTRAINT(0x0082, 1), /* unhalted core clock cycles */
+	EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint zxd_event_constraints[] __read_mostly = {
+
+	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* retired instructions */
+	FIXED_EVENT_CONSTRAINT(0x0082, 1), /* unhalted core clock cycles */
+	FIXED_EVENT_CONSTRAINT(0x0083, 2), /* unhalted bus clock cycles */
+	EVENT_CONSTRAINT_END
+};
+
+static __initconst const u64 zxd_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+[C(L1D)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0042,
+		[C(RESULT_MISS)] = 0x0538,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = 0x0043,
+		[C(RESULT_MISS)] = 0x0562,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(L1I)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0300,
+		[C(RESULT_MISS)] = 0x0301,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = 0x030a,
+		[C(RESULT_MISS)] = 0x030b,
+	},
+},
+[C(LL)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(DTLB)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0042,
+		[C(RESULT_MISS)] = 0x052c,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = 0x0043,
+		[C(RESULT_MISS)] = 0x0530,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = 0x0564,
+		[C(RESULT_MISS)] = 0x0565,
+	},
+},
+[C(ITLB)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x00c0,
+		[C(RESULT_MISS)] = 0x0534,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(BPU)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0700,
+		[C(RESULT_MISS)] = 0x0709,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(NODE)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+};
+
+static __initconst const u64 zxe_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+[C(L1D)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0568,
+		[C(RESULT_MISS)] = 0x054b,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = 0x0669,
+		[C(RESULT_MISS)] = 0x0562,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(L1I)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0300,
+		[C(RESULT_MISS)] = 0x0301,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = 0x030a,
+		[C(RESULT_MISS)] = 0x030b,
+	},
+},
+[C(LL)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0,
+		[C(RESULT_MISS)] = 0x0,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = 0x0,
+		[C(RESULT_MISS)] = 0x0,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = 0x0,
+		[C(RESULT_MISS)] = 0x0,
+	},
+},
+[C(DTLB)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0568,
+		[C(RESULT_MISS)] = 0x052c,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = 0x0669,
+		[C(RESULT_MISS)] = 0x0530,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = 0x0564,
+		[C(RESULT_MISS)] = 0x0565,
+	},
+},
+[C(ITLB)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x00c0,
+		[C(RESULT_MISS)] = 0x0534,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(BPU)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0028,
+		[C(RESULT_MISS)] = 0x0029,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(NODE)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+};
+
+static void zhaoxin_pmu_disable_all(void)
+{
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+}
+
+static void zhaoxin_pmu_enable_all(int added)
+{
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+}
+
+static inline u64 zhaoxin_pmu_get_status(void)
+{
+	u64 status;
+
+	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+
+	return status;
+}
+
+static inline void zhaoxin_pmu_ack_status(u64 ack)
+{
+	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
+}
+
+static inline void zxc_pmu_ack_status(u64 ack)
+{
+	/*
+	 * ZXC needs global control enabled in order to clear status bits.
+	 */
+	zhaoxin_pmu_enable_all(0);
+	zhaoxin_pmu_ack_status(ack);
+	zhaoxin_pmu_disable_all();
+}
+
+static void zhaoxin_pmu_disable_fixed(struct hw_perf_event *hwc)
+{
+	int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
+	u64 ctrl_val, mask;
+
+	mask = 0xfULL << (idx * 4);
+
+	rdmsrl(hwc->config_base, ctrl_val);
+	ctrl_val &= ~mask;
+	wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static void zhaoxin_pmu_disable_event(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+		zhaoxin_pmu_disable_fixed(hwc);
+		return;
+	}
+
+	x86_pmu_disable_event(event);
+}
+
+static void zhaoxin_pmu_enable_fixed(struct hw_perf_event *hwc)
+{
+	int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
+	u64 ctrl_val, bits, mask;
+
+	/*
+	 * Enable IRQ generation (0x8),
+	 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
+	 * if requested:
+	 */
+	bits = 0x8ULL;
+	if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
+		bits |= 0x2;
+	if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+		bits |= 0x1;
+
+	bits <<= (idx * 4);
+	mask = 0xfULL << (idx * 4);
+
+	rdmsrl(hwc->config_base, ctrl_val);
+	ctrl_val &= ~mask;
+	ctrl_val |= bits;
+	wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static void zhaoxin_pmu_enable_event(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+		zhaoxin_pmu_enable_fixed(hwc);
+		return;
+	}
+
+	__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
+}
+
+/*
+ * This handler is triggered by the local APIC, so the APIC IRQ handling
+ * rules apply:
+ */
+static int zhaoxin_pmu_handle_irq(struct pt_regs *regs)
+{
+	struct perf_sample_data data;
+	struct cpu_hw_events *cpuc;
+	int handled = 0;
+	u64 status;
+	int bit;
+
+	cpuc = this_cpu_ptr(&cpu_hw_events);
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+	zhaoxin_pmu_disable_all();
+	status = zhaoxin_pmu_get_status();
+	if (!status)
+		goto done;
+
+again:
+	if (x86_pmu.enabled_ack)
+		zxc_pmu_ack_status(status);
+	else
+		zhaoxin_pmu_ack_status(status);
+
+	inc_irq_stat(apic_perf_irqs);
+
+	/*
+	 * CondChgd bit 63 doesn't mean any overflow status. Ignore
+	 * and clear the bit.
+	 */
+	if (__test_and_clear_bit(63, (unsigned long *)&status)) {
+		if (!status)
+			goto done;
+	}
+
+	for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
+		struct perf_event *event = cpuc->events[bit];
+
+		handled++;
+
+		if (!test_bit(bit, cpuc->active_mask))
+			continue;
+
+		x86_perf_event_update(event);
+		perf_sample_data_init(&data, 0, event->hw.last_period);
+
+		if (!x86_perf_event_set_period(event))
+			continue;
+
+		if (perf_event_overflow(event, &data, regs))
+			x86_pmu_stop(event, 0);
+	}
+
+	/*
+	 * Repeat if there is more work to be done:
+	 */
+	status = zhaoxin_pmu_get_status();
+	if (status)
+		goto again;
+
+done:
+	zhaoxin_pmu_enable_all(0);
+	return handled;
+}
+
+static u64 zhaoxin_pmu_event_map(int hw_event)
+{
+	return zx_pmon_event_map[hw_event];
+}
+
+static struct event_constraint *
+zhaoxin_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+			struct perf_event *event)
+{
+	struct event_constraint *c;
+
+	if (x86_pmu.event_constraints) {
+		for_each_event_constraint(c, x86_pmu.event_constraints) {
+			if ((event->hw.config & c->cmask) == c->code)
+				return c;
+		}
+	}
+
+	return &unconstrained;
+}
+
+PMU_FORMAT_ATTR(event,	"config:0-7");
+PMU_FORMAT_ATTR(umask,	"config:8-15");
+PMU_FORMAT_ATTR(edge,	"config:18");
+PMU_FORMAT_ATTR(inv,	"config:23");
+PMU_FORMAT_ATTR(cmask,	"config:24-31");
+
+static struct attribute *zx_arch_formats_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_umask.attr,
+	&format_attr_edge.attr,
+	&format_attr_inv.attr,
+	&format_attr_cmask.attr,
+	NULL,
+};
+
+static ssize_t zhaoxin_event_sysfs_show(char *page, u64 config)
+{
+	u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT);
+
+	return x86_event_sysfs_show(page, config, event);
+}
+
+static const struct x86_pmu zhaoxin_pmu __initconst = {
+	.name			= "zhaoxin",
+	.handle_irq		= zhaoxin_pmu_handle_irq,
+	.disable_all		= zhaoxin_pmu_disable_all,
+	.enable_all		= zhaoxin_pmu_enable_all,
+	.enable			= zhaoxin_pmu_enable_event,
+	.disable		= zhaoxin_pmu_disable_event,
+	.hw_config		= x86_pmu_hw_config,
+	.schedule_events	= x86_schedule_events,
+	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
+	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
+	.event_map		= zhaoxin_pmu_event_map,
+	.max_events		= ARRAY_SIZE(zx_pmon_event_map),
+	.apic			= 1,
+	/*
+	 * For zxd/zxe, read/write operation for PMCx MSR is 48 bits.
+	 */
+	.max_period		= (1ULL << 47) - 1,
+	.get_event_constraints	= zhaoxin_get_event_constraints,
+
+	.format_attrs		= zx_arch_formats_attr,
+	.events_sysfs_show	= zhaoxin_event_sysfs_show,
+};
+
+static const struct { int id; char *name; } zx_arch_events_map[] __initconst = {
+	{ PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" },
+	{ PERF_COUNT_HW_INSTRUCTIONS, "instructions" },
+	{ PERF_COUNT_HW_BUS_CYCLES, "bus cycles" },
+	{ PERF_COUNT_HW_CACHE_REFERENCES, "cache references" },
+	{ PERF_COUNT_HW_CACHE_MISSES, "cache misses" },
+	{ PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" },
+	{ PERF_COUNT_HW_BRANCH_MISSES, "branch misses" },
+};
+
+static __init void zhaoxin_arch_events_quirk(void)
+{
+	int bit;
+
+	/* disable event that reported as not presend by cpuid */
+	for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(zx_arch_events_map)) {
+		zx_pmon_event_map[zx_arch_events_map[bit].id] = 0;
+		pr_warn("CPUID marked event: \'%s\' unavailable\n",
+			zx_arch_events_map[bit].name);
+	}
+}
+
+__init int zhaoxin_pmu_init(void)
+{
+	union cpuid10_edx edx;
+	union cpuid10_eax eax;
+	union cpuid10_ebx ebx;
+	struct event_constraint *c;
+	unsigned int unused;
+	int version;
+
+	pr_info("Welcome to zhaoxin pmu!\n");
+
+	/*
+	 * Check whether the Architectural PerfMon supports
+	 * hw_event or not.
+	 */
+	cpuid(10, &eax.full, &ebx.full, &unused, &edx.full);
+
+	if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT - 1)
+		return -ENODEV;
+
+	version = eax.split.version_id;
+	if (version != 2)
+		return -ENODEV;
+
+	x86_pmu = zhaoxin_pmu;
+	pr_info("Version check pass!\n");
+
+	x86_pmu.version			= version;
+	x86_pmu.num_counters		= eax.split.num_counters;
+	x86_pmu.cntval_bits		= eax.split.bit_width;
+	x86_pmu.cntval_mask		= (1ULL << eax.split.bit_width) - 1;
+	x86_pmu.events_maskl		= ebx.full;
+	x86_pmu.events_mask_len		= eax.split.mask_length;
+
+	x86_pmu.num_counters_fixed = edx.split.num_counters_fixed;
+	x86_add_quirk(zhaoxin_arch_events_quirk);
+
+	switch (boot_cpu_data.x86) {
+	case 0x06:
+		if (boot_cpu_data.x86_model == 0x0f || boot_cpu_data.x86_model == 0x19) {
+
+			x86_pmu.max_period = x86_pmu.cntval_mask >> 1;
+
+			/* Clearing status works only if the global control is enable on zxc. */
+			x86_pmu.enabled_ack = 1;
+
+			x86_pmu.event_constraints = zxc_event_constraints;
+			zx_pmon_event_map[PERF_COUNT_HW_INSTRUCTIONS] = 0;
+			zx_pmon_event_map[PERF_COUNT_HW_CACHE_REFERENCES] = 0;
+			zx_pmon_event_map[PERF_COUNT_HW_CACHE_MISSES] = 0;
+			zx_pmon_event_map[PERF_COUNT_HW_BUS_CYCLES] = 0;
+
+			pr_cont("ZXC events, ");
+			break;
+		}
+		return -ENODEV;
+
+	case 0x07:
+		zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+			X86_CONFIG(.event = 0x01, .umask = 0x01, .inv = 0x01, .cmask = 0x01);
+
+		zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+			X86_CONFIG(.event = 0x0f, .umask = 0x04, .inv = 0, .cmask = 0);
+
+		switch (boot_cpu_data.x86_model) {
+		case 0x1b:
+			memcpy(hw_cache_event_ids, zxd_hw_cache_event_ids,
+			       sizeof(hw_cache_event_ids));
+
+			x86_pmu.event_constraints = zxd_event_constraints;
+
+			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0700;
+			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x0709;
+
+			pr_cont("ZXD events, ");
+			break;
+		case 0x3b:
+			memcpy(hw_cache_event_ids, zxe_hw_cache_event_ids,
+			       sizeof(hw_cache_event_ids));
+
+			x86_pmu.event_constraints = zxd_event_constraints;
+
+			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0028;
+			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x0029;
+
+			pr_cont("ZXE events, ");
+			break;
+		default:
+			return -ENODEV;
+		}
+		break;
+
+	default:
+		return -ENODEV;
+	}
+
+	x86_pmu.intel_ctrl = (1 << (x86_pmu.num_counters)) - 1;
+	x86_pmu.intel_ctrl |= ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
+
+	if (x86_pmu.event_constraints) {
+		for_each_event_constraint(c, x86_pmu.event_constraints) {
+			c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
+			c->weight += x86_pmu.num_counters;
+		}
+	}
+
+	return 0;
+}
+
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 9556930..a548d91 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -63,6 +63,10 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
 		case 15:
 			return msr - MSR_P4_BPU_PERFCTR0;
 		}
+		break;
+	case X86_VENDOR_ZHAOXIN:
+	case X86_VENDOR_CENTAUR:
+		return msr - MSR_ARCH_PERFMON_PERFCTR0;
 	}
 	return 0;
 }
@@ -92,6 +96,10 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
 		case 15:
 			return msr - MSR_P4_BSU_ESCR0;
 		}
+		break;
+	case X86_VENDOR_ZHAOXIN:
+	case X86_VENDOR_CENTAUR:
+		return msr - MSR_ARCH_PERFMON_EVENTSEL0;
 	}
 	return 0;
 
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/perf: Add hardware performance events support for Zhaoxin CPU.
  2020-03-31 10:18 ` Peter Zijlstra
@ 2020-04-08  7:20   ` CodyYao-oc
  0 siblings, 0 replies; 15+ messages in thread
From: CodyYao-oc @ 2020-04-08  7:20 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, acme, mark.rutland, alexander.shishkin, jolsa, namhyung,
	tglx, bp, hpa, x86, linux-kernel, cooperyan

On 2020/3/31 下午6:18, Peter Zijlstra wrote:
> On Tue, Mar 31, 2020 at 05:39:59PM +0800, CodyYao-oc wrote:
>> Zhaoxin CPU has provided facilities for monitoring performance
>> via PMU(Performance Monitor Unit), but the functionality is unused so far.
>> Therefore, add support for zhaoxin pmu to make performance related
>> hardware events available.
> 
> This looks like an Intel Architectural PMU v2 or so, is that correct? Do
> you have a link to documentation for your CPU?
> 
I'm sorry for such a late reply. Yes, the usage method is the same as 
Intel PMU v2, but there is no online document at present. For your 
convenience, provide some event descriptions as follows:

-----------------------------------------------------------------------------------------------------------------------------------
Event                      | Event  | Umask |          Description
                            | Select |       |
-----------------------------------------------------------------------------------------------------------------------------------
cpu-cycles                 |  82h   |  00h  | unhalt core clock
instructions               |  00h   |  00h  | number of instructions at 
retirement.
cache-references           |  15h   |  05h  | number of fillq pushs at 
the current cycle.
cache-misses               |  1ah   |  05h  | number of l2 miss pushed 
by fillq.
branch-instructions        |  28h   |  00h  | counts the number of 
branch instructions retired.
branch-misses              |  29h   |  00h  | mispredicted branch 
instructions at retirement.
bus-cycles                 |  83h   |  00h  | unhalt bus clock
stalled-cycles-frontend    |  01h   |  01h  | Increments each cycle the 
# of Uops issued by the RAT to RS.
stalled-cycles-backend     |  0fh   |  04h  | RS0/1/2/3/45 empty
L1-dcache-loads            |  68h   |  05h  | number of retire/commit load.
L1-dcache-load-misses      |  4bh   |  05h  | retired load uops whose 
data source followed an L1 miss.
L1-dcache-stores           |  69h   |  06h  | number of retire/commit 
Store,no LEA
L1-dcache-store-misses     |  62h   |  05h  | cache lines in M state 
evicted out of L1D due to Snoop HitM or dirty line replacement.
L1-icache-loads            |  00h   |  03h  | number of l1i cache access 
for valid normal fetch,including un-cacheable access.
L1-icache-load-misses      |  01h   |  03h  | number of l1i cache miss 
for valid normal fetch,including un-cacheable miss.
L1-icache-prefetches       |  0ah   |  03h  | number of prefetch.
L1-icache-prefetch-misses  |  0bh   |  03h  | number of prefetch miss.
dTLB-loads                 |  68h   |  05h  | number of retire/commit load
dTLB-load-misses           |  2ch   |  05h  | number of load operations 
miss all level tlbs and cause a tablewalk.
dTLB-stores                |  69h   |  06h  | number of retire/commit 
Store,no LEA
dTLB-store-misses          |  30h   |  05h  | number of store operations 
miss all level tlbs and cause a tablewalk.
dTLB-prefetches            |  64h   |  05h  | number of hardware pte 
prefetch requests dispatched out of the prefetch FIFO.
dTLB-prefetch-misses       |  65h   |  05h  | number of hardware pte 
prefetch requests miss the l1d data cache.
iTLB-load                  |  00h   |  00h  | actually counter instructions.
iTLB-load-misses           |  34h   |  05h  | number of code operations 
miss all level tlbs and cause a tablewalk.
-----------------------------------------------------------------------------------------------------------------------------------


>> +static void zhaoxin_pmu_disable_all(void)
>> +{
>> +	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
>> +}
>> +
>> +static void zhaoxin_pmu_enable_all(int added)
>> +{
>> +	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
>> +}
>> +
>> +static inline u64 zhaoxin_pmu_get_status(void)
>> +{
>> +	u64 status;
>> +
>> +	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
>> +
>> +	return status;
>> +}
>> +
>> +static inline void zhaoxin_pmu_ack_status(u64 ack)
>> +{
>> +	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
>> +}
> 
>> +static int zhaoxin_pmu_handle_irq(struct pt_regs *regs)
>> +{
>> +	struct perf_sample_data data;
>> +	struct cpu_hw_events *cpuc;
>> +	int bit;
>> +	u64 status;
>> +	bool is_zxc;
>> +	int handled = 0;
>> +
>> +	cpuc = this_cpu_ptr(&cpu_hw_events);
>> +	apic_write(APIC_LVTPC, APIC_DM_NMI);
>> +	zhaoxin_pmu_disable_all();
>> +	status = zhaoxin_pmu_get_status();
>> +	if (!status)
>> +		goto done;
>> +
>> +	if (boot_cpu_data.x86 == 0x06 &&
>> +		(boot_cpu_data.x86_model == 0x0f ||
>> +			boot_cpu_data.x86_model == 0x19))
>> +		is_zxc = true;
>> +again:
>> +
>> +	/*Clearing status works only if the global control is enable on zxc.*/
>> +	if (is_zxc)
>> +		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
>> +
>> +	zhaoxin_pmu_ack_status(status);
>> +
>> +	if (is_zxc)
>> +		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
> 
> That's an unfortunate errata; perhaps write it like so:
> 

Thank you very much for your advice, I have changed the code.

> static inline void zxc_pmu_ack_status(u64 status)
> {
> 	/*
> 	 * ZXC needs global control enabled in order to clear status bits.
> 	 */
> 	zhaoxin_pmu_enable_all(0);
> 	zhaoxin_pmu_ack_status(status);
> 	zhaoxin_pmu_disable_all();
> }
> 
> 	if (is_zxc)
> 		zxc_pmu_ack_status(status);
> 	else
> 		zhaoxin_pmu_ack_status(status);
> 
> Alternatively; you can do a whole zxc specific handle_irq() and move the
> get/ack status before disable_all(). If you do that, then factor this:
> 
>> +	/*
>> +	 * CondChgd bit 63 doesn't mean any overflow status. Ignore
>> +	 * and clear the bit.
>> +	 */
>> +	if (__test_and_clear_bit(63, (unsigned long *)&status)) {
>> +		if (!status)
>> +			goto done;
>> +	}
>> +
>> +	for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
>> +		struct perf_event *event = cpuc->events[bit];
>> +
>> +		handled++;
>> +
>> +		if (!test_bit(bit, cpuc->active_mask))
>> +			continue;
>> +
>> +		x86_perf_event_update(event);
>> +		perf_sample_data_init(&data, 0, event->hw.last_period);
>> +
>> +		if (!x86_perf_event_set_period(event))
>> +			continue;
>> +
>> +		if (perf_event_overflow(event, &data, regs))
>> +			x86_pmu_stop(event, 0);
>> +	}
> 
> bit into it's own function so you don't have to duplicate it. Then the
> two handle_irq() functions only differ in the status handling.
> 
>> +
>> +	/*
>> +	 * Repeat if there is more work to be done:
>> +	 */
>> +	status = zhaoxin_pmu_get_status();
>> +	if (status)
>> +		goto again;
>> +
>> +done:
>> +	zhaoxin_pmu_enable_all(0);
>> +	return handled;
>> +}
> .
> 


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] x86/perf: Add hardware performance events support for Zhaoxin CPU.
  2020-03-31  9:39 CodyYao-oc
@ 2020-03-31 10:18 ` Peter Zijlstra
  2020-04-08  7:20   ` CodyYao-oc
  0 siblings, 1 reply; 15+ messages in thread
From: Peter Zijlstra @ 2020-03-31 10:18 UTC (permalink / raw)
  To: CodyYao-oc
  Cc: mingo, acme, mark.rutland, alexander.shishkin, jolsa, namhyung,
	tglx, bp, hpa, x86, linux-kernel, cooperyan

On Tue, Mar 31, 2020 at 05:39:59PM +0800, CodyYao-oc wrote:
> Zhaoxin CPU has provided facilities for monitoring performance
> via PMU(Performance Monitor Unit), but the functionality is unused so far.
> Therefore, add support for zhaoxin pmu to make performance related
> hardware events available.

This looks like an Intel Architectural PMU v2 or so, is that correct? Do
you have a link to documentation for your CPU?

> +static void zhaoxin_pmu_disable_all(void)
> +{
> +	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
> +}
> +
> +static void zhaoxin_pmu_enable_all(int added)
> +{
> +	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
> +}
> +
> +static inline u64 zhaoxin_pmu_get_status(void)
> +{
> +	u64 status;
> +
> +	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
> +
> +	return status;
> +}
> +
> +static inline void zhaoxin_pmu_ack_status(u64 ack)
> +{
> +	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
> +}

> +static int zhaoxin_pmu_handle_irq(struct pt_regs *regs)
> +{
> +	struct perf_sample_data data;
> +	struct cpu_hw_events *cpuc;
> +	int bit;
> +	u64 status;
> +	bool is_zxc;
> +	int handled = 0;
> +
> +	cpuc = this_cpu_ptr(&cpu_hw_events);
> +	apic_write(APIC_LVTPC, APIC_DM_NMI);
> +	zhaoxin_pmu_disable_all();
> +	status = zhaoxin_pmu_get_status();
> +	if (!status)
> +		goto done;
> +
> +	if (boot_cpu_data.x86 == 0x06 &&
> +		(boot_cpu_data.x86_model == 0x0f ||
> +			boot_cpu_data.x86_model == 0x19))
> +		is_zxc = true;
> +again:
> +
> +	/*Clearing status works only if the global control is enable on zxc.*/
> +	if (is_zxc)
> +		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
> +
> +	zhaoxin_pmu_ack_status(status);
> +
> +	if (is_zxc)
> +		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);

That's an unfortunate errata; perhaps write it like so:

static inline void zxc_pmu_ack_status(u64 status)
{
	/*
	 * ZXC needs global control enabled in order to clear status bits.
	 */
	zhaoxin_pmu_enable_all(0);
	zhaoxin_pmu_ack_status(status);
	zhaoxin_pmu_disable_all();
}

	if (is_zxc)
		zxc_pmu_ack_status(status);
	else
		zhaoxin_pmu_ack_status(status);

Alternatively; you can do a whole zxc specific handle_irq() and move the
get/ack status before disable_all(). If you do that, then factor this:

> +	/*
> +	 * CondChgd bit 63 doesn't mean any overflow status. Ignore
> +	 * and clear the bit.
> +	 */
> +	if (__test_and_clear_bit(63, (unsigned long *)&status)) {
> +		if (!status)
> +			goto done;
> +	}
> +
> +	for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
> +		struct perf_event *event = cpuc->events[bit];
> +
> +		handled++;
> +
> +		if (!test_bit(bit, cpuc->active_mask))
> +			continue;
> +
> +		x86_perf_event_update(event);
> +		perf_sample_data_init(&data, 0, event->hw.last_period);
> +
> +		if (!x86_perf_event_set_period(event))
> +			continue;
> +
> +		if (perf_event_overflow(event, &data, regs))
> +			x86_pmu_stop(event, 0);
> +	}

bit into it's own function so you don't have to duplicate it. Then the
two handle_irq() functions only differ in the status handling.

> +
> +	/*
> +	 * Repeat if there is more work to be done:
> +	 */
> +	status = zhaoxin_pmu_get_status();
> +	if (status)
> +		goto again;
> +
> +done:
> +	zhaoxin_pmu_enable_all(0);
> +	return handled;
> +}

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH] x86/perf: Add hardware performance events support for Zhaoxin CPU.
@ 2020-03-31  9:39 CodyYao-oc
  2020-03-31 10:18 ` Peter Zijlstra
  0 siblings, 1 reply; 15+ messages in thread
From: CodyYao-oc @ 2020-03-31  9:39 UTC (permalink / raw)
  To: peterz, mingo, acme, mark.rutland, alexander.shishkin, jolsa,
	namhyung, tglx, bp, hpa, x86, linux-kernel
  Cc: cooperyan, CodyYao-oc

Zhaoxin CPU has provided facilities for monitoring performance
via PMU(Performance Monitor Unit), but the functionality is unused so far.
Therefore, add support for zhaoxin pmu to make performance related
hardware events available.

Signed-off-by: CodyYao-oc <CodyYao-oc@zhaoxin.com>
---
 arch/x86/events/Makefile               |   2 +
 arch/x86/events/core.c                 |   4 +
 arch/x86/events/perf_event.h           |   9 +
 arch/x86/events/zhaoxin/Makefile       |   2 +
 arch/x86/events/zhaoxin/core.c         | 618 +++++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/perfctr-watchdog.c |   6 +
 6 files changed, 641 insertions(+)
 create mode 100644 arch/x86/events/zhaoxin/Makefile
 create mode 100644 arch/x86/events/zhaoxin/core.c

diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile
index 9e07f55..6f1d1fd 100644
--- a/arch/x86/events/Makefile
+++ b/arch/x86/events/Makefile
@@ -3,3 +3,5 @@ obj-y					+= core.o probe.o
 obj-y					+= amd/
 obj-$(CONFIG_X86_LOCAL_APIC)            += msr.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= intel/
+obj-$(CONFIG_CPU_SUP_CENTAUR)		+= zhaoxin/
+obj-$(CONFIG_CPU_SUP_ZHAOXIN)		+= zhaoxin/
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 3bb738f..e76048e 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1839,6 +1839,10 @@ static int __init init_hw_perf_events(void)
 		err = amd_pmu_init();
 		x86_pmu.name = "HYGON";
 		break;
+	case X86_VENDOR_ZHAOXIN:
+	case X86_VENDOR_CENTAUR:
+		err = zhaoxin_pmu_init();
+		break;
 	default:
 		err = -ENOTSUPP;
 	}
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index f1cd1ca..f6bbdca 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -1133,3 +1133,12 @@ static inline int is_ht_workaround_enabled(void)
 	return 0;
 }
 #endif /* CONFIG_CPU_SUP_INTEL */
+
+#if ((defined CONFIG_CPU_SUP_CENTAUR) || (defined CONFIG_CPU_SUP_ZHAOXIN))
+int zhaoxin_pmu_init(void);
+#else
+static inline int zhaoxin_pmu_init(void)
+{
+	return 0;
+}
+#endif /*CONFIG_CPU_SUP_CENTAUR or CONFIG_CPU_SUP_ZHAOXIN*/
diff --git a/arch/x86/events/zhaoxin/Makefile b/arch/x86/events/zhaoxin/Makefile
new file mode 100644
index 0000000..642c1174
--- /dev/null
+++ b/arch/x86/events/zhaoxin/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-y	+= core.o
diff --git a/arch/x86/events/zhaoxin/core.c b/arch/x86/events/zhaoxin/core.c
new file mode 100644
index 0000000..442251a
--- /dev/null
+++ b/arch/x86/events/zhaoxin/core.c
@@ -0,0 +1,618 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Per cpu state
+ *
+ * Used to coordinate shared registers among events on a single PMU.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/stddef.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/nmi.h>
+
+#include <asm/cpufeature.h>
+#include <asm/hardirq.h>
+#include <asm/apic.h>
+
+#include "../perf_event.h"
+
+/*
+ * Zhaoxin PerfMon, used on zxc and later.
+ */
+static u64 zx_pmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = {
+
+	[PERF_COUNT_HW_CPU_CYCLES]        = 0x0082,
+	[PERF_COUNT_HW_INSTRUCTIONS]      = 0x00c0,
+	[PERF_COUNT_HW_CACHE_REFERENCES]  = 0x0515,
+	[PERF_COUNT_HW_CACHE_MISSES]      = 0x051a,
+	[PERF_COUNT_HW_BUS_CYCLES]        = 0x0083,
+};
+
+static struct event_constraint zxc_event_constraints[] __read_mostly = {
+
+	FIXED_EVENT_CONSTRAINT(0x0082, 1), /* unhalted core clock cycles */
+	EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint zxd_event_constraints[] __read_mostly = {
+
+	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* retired instructions */
+	FIXED_EVENT_CONSTRAINT(0x0082, 1), /* unhalted core clock cycles */
+	FIXED_EVENT_CONSTRAINT(0x0083, 2), /* unhalted bus clock cycles */
+	EVENT_CONSTRAINT_END
+};
+
+static __initconst const u64 zxd_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+[C(L1D)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0042,
+		[C(RESULT_MISS)] = 0x0538,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = 0x0043,
+		[C(RESULT_MISS)] = 0x0562,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(L1I)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0300,
+		[C(RESULT_MISS)] = 0x0301,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = 0x030a,
+		[C(RESULT_MISS)] = 0x030b,
+	},
+},
+[C(LL)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(DTLB)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0042,
+		[C(RESULT_MISS)] = 0x052c,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = 0x0043,
+		[C(RESULT_MISS)] = 0x0530,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = 0x0564,
+		[C(RESULT_MISS)] = 0x0565,
+	},
+},
+[C(ITLB)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x00c0,
+		[C(RESULT_MISS)] = 0x0534,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(BPU)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0700,
+		[C(RESULT_MISS)] = 0x0709,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(NODE)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+};
+
+static __initconst const u64 zxe_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+[C(L1D)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0568,
+		[C(RESULT_MISS)] = 0x054b,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = 0x0669,
+		[C(RESULT_MISS)] = 0x0562,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(L1I)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0300,
+		[C(RESULT_MISS)] = 0x0301,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = 0x030a,
+		[C(RESULT_MISS)] = 0x030b,
+	},
+},
+[C(LL)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0,
+		[C(RESULT_MISS)] = 0x0,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = 0x0,
+		[C(RESULT_MISS)] = 0x0,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = 0x0,
+		[C(RESULT_MISS)] = 0x0,
+	},
+},
+[C(DTLB)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0568,
+		[C(RESULT_MISS)] = 0x052c,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = 0x0669,
+		[C(RESULT_MISS)] = 0x0530,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = 0x0564,
+		[C(RESULT_MISS)] = 0x0565,
+	},
+},
+[C(ITLB)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x00c0,
+		[C(RESULT_MISS)] = 0x0534,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(BPU)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x0028,
+		[C(RESULT_MISS)] = 0x0029,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+[C(NODE)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = -1,
+		[C(RESULT_MISS)] = -1,
+	},
+},
+};
+
+static void zhaoxin_pmu_disable_all(void)
+{
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+}
+
+static void zhaoxin_pmu_enable_all(int added)
+{
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+}
+
+static inline u64 zhaoxin_pmu_get_status(void)
+{
+	u64 status;
+
+	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+
+	return status;
+}
+
+static inline void zhaoxin_pmu_ack_status(u64 ack)
+{
+	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
+}
+
+static void zhaoxin_pmu_disable_fixed(struct hw_perf_event *hwc)
+{
+	int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
+	u64 ctrl_val, mask;
+
+	mask = 0xfULL << (idx * 4);
+
+	rdmsrl(hwc->config_base, ctrl_val);
+	ctrl_val &= ~mask;
+	wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static void zhaoxin_pmu_disable_event(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+		zhaoxin_pmu_disable_fixed(hwc);
+		return;
+	}
+
+	x86_pmu_disable_event(event);
+}
+
+static void zhaoxin_pmu_enable_fixed(struct hw_perf_event *hwc)
+{
+	int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
+	u64 ctrl_val, bits, mask;
+
+	/*
+	 * Enable IRQ generation (0x8),
+	 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
+	 * if requested:
+	 */
+	bits = 0x8ULL;
+	if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
+		bits |= 0x2;
+	if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+		bits |= 0x1;
+
+	bits <<= (idx * 4);
+	mask = 0xfULL << (idx * 4);
+
+	rdmsrl(hwc->config_base, ctrl_val);
+	ctrl_val &= ~mask;
+	ctrl_val |= bits;
+	wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static void zhaoxin_pmu_enable_event(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+		zhaoxin_pmu_enable_fixed(hwc);
+		return;
+	}
+
+	__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
+}
+
+/*
+ * This handler is triggered by the local APIC, so the APIC IRQ handling
+ * rules apply:
+ */
+static int zhaoxin_pmu_handle_irq(struct pt_regs *regs)
+{
+	struct perf_sample_data data;
+	struct cpu_hw_events *cpuc;
+	int bit;
+	u64 status;
+	bool is_zxc;
+	int handled = 0;
+
+	cpuc = this_cpu_ptr(&cpu_hw_events);
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+	zhaoxin_pmu_disable_all();
+	status = zhaoxin_pmu_get_status();
+	if (!status)
+		goto done;
+
+	if (boot_cpu_data.x86 == 0x06 &&
+		(boot_cpu_data.x86_model == 0x0f ||
+			boot_cpu_data.x86_model == 0x19))
+		is_zxc = true;
+again:
+
+	/*Clearing status works only if the global control is enable on zxc.*/
+	if (is_zxc)
+		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+
+	zhaoxin_pmu_ack_status(status);
+
+	if (is_zxc)
+		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+
+	inc_irq_stat(apic_perf_irqs);
+
+	/*
+	 * CondChgd bit 63 doesn't mean any overflow status. Ignore
+	 * and clear the bit.
+	 */
+	if (__test_and_clear_bit(63, (unsigned long *)&status)) {
+		if (!status)
+			goto done;
+	}
+
+	for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
+		struct perf_event *event = cpuc->events[bit];
+
+		handled++;
+
+		if (!test_bit(bit, cpuc->active_mask))
+			continue;
+
+		x86_perf_event_update(event);
+		perf_sample_data_init(&data, 0, event->hw.last_period);
+
+		if (!x86_perf_event_set_period(event))
+			continue;
+
+		if (perf_event_overflow(event, &data, regs))
+			x86_pmu_stop(event, 0);
+	}
+
+	/*
+	 * Repeat if there is more work to be done:
+	 */
+	status = zhaoxin_pmu_get_status();
+	if (status)
+		goto again;
+
+done:
+	zhaoxin_pmu_enable_all(0);
+	return handled;
+}
+
+static u64 zhaoxin_pmu_event_map(int hw_event)
+{
+	return zx_pmon_event_map[hw_event];
+}
+
+static struct event_constraint *
+zhaoxin_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+			struct perf_event *event)
+{
+	struct event_constraint *c;
+
+	if (x86_pmu.event_constraints) {
+		for_each_event_constraint(c, x86_pmu.event_constraints) {
+			if ((event->hw.config & c->cmask) == c->code)
+				return c;
+		}
+	}
+
+	return &unconstrained;
+}
+
+PMU_FORMAT_ATTR(event,	"config:0-7");
+PMU_FORMAT_ATTR(umask,	"config:8-15");
+PMU_FORMAT_ATTR(edge,	"config:18");
+PMU_FORMAT_ATTR(inv,	"config:23");
+PMU_FORMAT_ATTR(cmask,	"config:24-31");
+
+static struct attribute *zx_arch_formats_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_umask.attr,
+	&format_attr_edge.attr,
+	&format_attr_inv.attr,
+	&format_attr_cmask.attr,
+	NULL,
+};
+
+static ssize_t zhaoxin_event_sysfs_show(char *page, u64 config)
+{
+	u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT);
+
+	return x86_event_sysfs_show(page, config, event);
+}
+
+static const struct x86_pmu zhaoxin_pmu __initconst = {
+	.name			= "zhaoxin",
+	.handle_irq		= zhaoxin_pmu_handle_irq,
+	.disable_all		= zhaoxin_pmu_disable_all,
+	.enable_all		= zhaoxin_pmu_enable_all,
+	.enable			= zhaoxin_pmu_enable_event,
+	.disable		= zhaoxin_pmu_disable_event,
+	.hw_config		= x86_pmu_hw_config,
+	.schedule_events	= x86_schedule_events,
+	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
+	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
+	.event_map		= zhaoxin_pmu_event_map,
+	.max_events		= ARRAY_SIZE(zx_pmon_event_map),
+	.apic			= 1,
+	/*
+	 * For zxd/zxe, read/write operation for PMCx MSR is 48 bits.
+	 */
+	.max_period		= (1ULL << 47) - 1,
+	.get_event_constraints	= zhaoxin_get_event_constraints,
+
+	.format_attrs		= zx_arch_formats_attr,
+	.events_sysfs_show	= zhaoxin_event_sysfs_show,
+};
+
+static const struct { int id; char *name; } zx_arch_events_map[] __initconst = {
+	{ PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" },
+	{ PERF_COUNT_HW_INSTRUCTIONS, "instructions" },
+	{ PERF_COUNT_HW_BUS_CYCLES, "bus cycles" },
+	{ PERF_COUNT_HW_CACHE_REFERENCES, "cache references" },
+	{ PERF_COUNT_HW_CACHE_MISSES, "cache misses" },
+	{ PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" },
+	{ PERF_COUNT_HW_BRANCH_MISSES, "branch misses" },
+};
+
+static __init void zhaoxin_arch_events_quirk(void)
+{
+	int bit;
+
+	/* disable event that reported as not presend by cpuid */
+	for_each_set_bit(bit, x86_pmu.events_mask,
+			ARRAY_SIZE(zx_arch_events_map)) {
+
+		zx_pmon_event_map[zx_arch_events_map[bit].id] = 0;
+		pr_warn("CPUID marked event: \'%s\' unavailable\n",
+				zx_arch_events_map[bit].name);
+	}
+}
+
+__init int zhaoxin_pmu_init(void)
+{
+	union cpuid10_edx edx;
+	union cpuid10_eax eax;
+	union cpuid10_ebx ebx;
+	struct event_constraint *c;
+	unsigned int unused;
+	int version;
+
+	pr_info("Welcome to zhaoxin pmu!\n");
+
+	/*
+	 * Check whether the Architectural PerfMon supports
+	 * hw_event or not.
+	 */
+	cpuid(10, &eax.full, &ebx.full, &unused, &edx.full);
+
+	if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT - 1)
+		return -ENODEV;
+
+	version = eax.split.version_id;
+	if (version == 2) {
+		x86_pmu = zhaoxin_pmu;
+		pr_info("Version check pass!\n");
+	} else
+		return -ENODEV;
+
+	x86_pmu.version			= version;
+	x86_pmu.num_counters		= eax.split.num_counters;
+	x86_pmu.cntval_bits		= eax.split.bit_width;
+	x86_pmu.cntval_mask		= (1ULL << eax.split.bit_width) - 1;
+	x86_pmu.events_maskl		= ebx.full;
+	x86_pmu.events_mask_len		= eax.split.mask_length;
+
+	x86_pmu.num_counters_fixed = edx.split.num_counters_fixed;
+	x86_add_quirk(zhaoxin_arch_events_quirk);
+
+	switch (boot_cpu_data.x86) {
+	case 0x06:
+		if (boot_cpu_data.x86_model == 0x0f ||
+			boot_cpu_data.x86_model == 0x19) {
+
+			x86_pmu.max_period = x86_pmu.cntval_mask >> 1;
+
+			x86_pmu.event_constraints = zxc_event_constraints;
+			zx_pmon_event_map[PERF_COUNT_HW_INSTRUCTIONS] = 0;
+			zx_pmon_event_map[PERF_COUNT_HW_CACHE_REFERENCES] = 0;
+			zx_pmon_event_map[PERF_COUNT_HW_CACHE_MISSES] = 0;
+			zx_pmon_event_map[PERF_COUNT_HW_BUS_CYCLES] = 0;
+
+			pr_cont("ZXC events, ");
+		} else
+			return -ENODEV;
+		break;
+	case 0x07:
+		zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+		X86_CONFIG(.event = 0x01, .umask = 0x01, .inv = 0x01, .cmask = 0x01);
+
+		zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+		X86_CONFIG(.event = 0x0f, .umask = 0x04, .inv = 0, .cmask = 0);
+
+		switch (boot_cpu_data.x86_model) {
+		case 0x1b:
+			memcpy(hw_cache_event_ids, zxd_hw_cache_event_ids,
+				sizeof(hw_cache_event_ids));
+
+			x86_pmu.event_constraints = zxd_event_constraints;
+
+			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]
+				= 0x0700;
+			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES]
+				= 0x0709;
+
+			pr_cont("ZXD events, ");
+			break;
+		case 0x3b:
+			memcpy(hw_cache_event_ids, zxe_hw_cache_event_ids,
+				sizeof(hw_cache_event_ids));
+
+			x86_pmu.event_constraints = zxd_event_constraints;
+
+			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]
+				= 0x0028;
+			zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES]
+				= 0x0029;
+
+			pr_cont("ZXE events, ");
+			break;
+		default:
+			return -ENODEV;
+		}
+		break;
+	default:
+		return -ENODEV;
+	}
+
+	x86_pmu.intel_ctrl = (1 << (x86_pmu.num_counters)) - 1;
+	x86_pmu.intel_ctrl |=
+		((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
+
+	if (x86_pmu.event_constraints) {
+		for_each_event_constraint(c, x86_pmu.event_constraints) {
+			c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
+			c->weight += x86_pmu.num_counters;
+		}
+	}
+
+	return 0;
+}
+
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 9556930..63a5828 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -63,6 +63,9 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
 		case 15:
 			return msr - MSR_P4_BPU_PERFCTR0;
 		}
+	case X86_VENDOR_ZHAOXIN:
+	case X86_VENDOR_CENTAUR:
+		return msr - MSR_ARCH_PERFMON_PERFCTR0;
 	}
 	return 0;
 }
@@ -92,6 +95,9 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
 		case 15:
 			return msr - MSR_P4_BSU_ESCR0;
 		}
+	case X86_VENDOR_ZHAOXIN:
+	case X86_VENDOR_CENTAUR:
+		return msr - MSR_ARCH_PERFMON_EVENTSEL0;
 	}
 	return 0;
 
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2020-05-06 12:56 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-04-13  3:14 [PATCH] x86/perf: Add hardware performance events support for Zhaoxin CPU CodyYao-oc
2020-04-15  9:36 ` Borislav Petkov
2020-04-16  6:16   ` CodyYao-oc
2020-04-15 10:23 ` Peter Zijlstra
2020-04-15 10:31   ` Peter Zijlstra
2020-04-16  7:36     ` CodyYao-oc
2020-05-06 12:55     ` CodyYao-oc
2020-04-16  6:26   ` CodyYao-oc
2020-05-01 18:22 ` [tip: perf/core] " tip-bot2 for CodyYao-oc
  -- strict thread matches above, loose matches on Subject: below --
2020-04-27 12:30 [PATCH] " CodyYao-oc
2020-04-19 13:10 CodyYao-oc
2020-04-26  3:04 ` CodyYao-oc
2020-03-31  9:39 CodyYao-oc
2020-03-31 10:18 ` Peter Zijlstra
2020-04-08  7:20   ` CodyYao-oc

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).