* [RFC PATCH 2/3 v3] perf: Implement Nehalem uncore pmu
@ 2010-12-02  5:20 Lin Ming
  2010-12-02  5:57 ` Lin Ming
                   ` (4 more replies)
  0 siblings, 5 replies; 26+ messages in thread
From: Lin Ming @ 2010-12-02  5:20 UTC (permalink / raw)
  To: Peter Zijlstra, Stephane Eranian, Andi Kleen
  Cc: Ingo Molnar, Frederic Weisbecker, Arjan van de Ven, lkml

Changelogs of v3:

- Allocate uncore data with kmalloc_node, like AMD NB stuff. (Peter
Zijlstra)

- per-task uncore event is not allowed. Simply set pmu::task_ctx_nr =
perf_invalid_context. (Peter Zijlstra)

- Route interrupts to the first core that accesses the uncore PMU. (Stephane
Eranian)

- Check CPUID signatures with boot_cpu_data. (Andi Kleen)

- Remove unneeded include files. (Andi Kleen)

For background on the Nehalem uncore PMU, see Intel SDM Volume 3B,
Section 30.6.2, "Performance Monitoring Facility in the Uncore".

1. Data structure

struct intel_uncore {
        int id;  /* uncore id */
        int refcnt; /* reference count */

        struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
        unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
        int n_events;
        int nmi_core; /* the core to handle NMI */
        struct spinlock lock;
};

struct intel_uncore is the per-socket structure, allocated with
kmalloc_node in the same way as the AMD NB code.
"lock" protects adding and deleting events on the uncore PMU.

2. Uncore PMU NMI handling

As suggested by Stephane and Peter, all uncore interrupts are routed to
the first core that accesses the uncore PMU.

Signed-off-by: Lin Ming <ming.m.lin@intel.com>
---
 arch/x86/include/asm/msr-index.h              |    1 +
 arch/x86/include/asm/perf_event.h             |    5 +
 arch/x86/kernel/cpu/Makefile                  |    1 +
 arch/x86/kernel/cpu/perf_event.c              |    6 +-
 arch/x86/kernel/cpu/perf_event_intel_uncore.c |  543 +++++++++++++++++++++++++
 arch/x86/kernel/cpu/perf_event_intel_uncore.h |   69 ++++
 include/linux/perf_event.h                    |    1 +
 7 files changed, 621 insertions(+), 5 deletions(-)
 create mode 100644 arch/x86/kernel/cpu/perf_event_intel_uncore.c
 create mode 100644 arch/x86/kernel/cpu/perf_event_intel_uncore.h

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 6b89f5e..a1cc40b 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -81,6 +81,7 @@
 #define DEBUGCTLMSR_BTS_OFF_OS		(1UL <<  9)
 #define DEBUGCTLMSR_BTS_OFF_USR		(1UL << 10)
 #define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI	(1UL << 11)
+#define DEBUGCTLMSR_ENABLE_UNCORE_PMI	(1UL << 13)
 
 #define MSR_IA32_MC0_CTL		0x00000400
 #define MSR_IA32_MC0_STATUS		0x00000401
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index d9d4dae..ab5d0bb 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -75,6 +75,10 @@ union cpuid10_edx {
 	unsigned int full;
 };
 
+struct pmu_nmi_state {
+	unsigned int	marked;
+	int		handled;
+};
 
 /*
  * Fixed-purpose performance events:
@@ -126,6 +130,7 @@ union cpuid10_edx {
 
 #ifdef CONFIG_PERF_EVENTS
 extern void perf_events_lapic_init(void);
+extern void init_uncore_pmu(void);
 
 #define PERF_EVENT_INDEX_OFFSET			0
 
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 3f0ebe4..db4bf99 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_CPU_SUP_TRANSMETA_32)	+= transmeta.o
 obj-$(CONFIG_CPU_SUP_UMC_32)		+= umc.o
 
 obj-$(CONFIG_PERF_EVENTS)		+= perf_event.o
+obj-$(CONFIG_PERF_EVENTS)		+= perf_event_intel_uncore.o
 
 obj-$(CONFIG_X86_MCE)			+= mcheck/
 obj-$(CONFIG_MTRR)			+= mtrr/
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 7202762..243805e 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1210,11 +1210,6 @@ void perf_events_lapic_init(void)
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
 }
 
-struct pmu_nmi_state {
-	unsigned int	marked;
-	int		handled;
-};
-
 static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_nmi);
 
 static int __kprobes
@@ -1362,6 +1357,7 @@ int __init init_hw_perf_events(void)
 
 	switch (boot_cpu_data.x86_vendor) {
 	case X86_VENDOR_INTEL:
+		init_uncore_pmu();
 		err = intel_pmu_init();
 		break;
 	case X86_VENDOR_AMD:
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
new file mode 100644
index 0000000..d2c10d8
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -0,0 +1,543 @@
+#include "perf_event_intel_uncore.h"
+
+static DEFINE_PER_CPU(struct cpu_uncore_events, cpu_uncore_events);
+static DEFINE_RAW_SPINLOCK(intel_uncore_lock);
+
+static bool uncore_pmu_initialized;
+static atomic_t active_uncore_events;
+
+static void uncore_pmu_enable_fixed_event(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	wrmsrl(hwc->config_base, MSR_UNCORE_FIXED_EN | MSR_UNCORE_FIXED_PMI);
+}
+
+static void uncore_pmu_enable_event(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (hwc->idx == UNCORE_FIXED_EVENT_IDX)
+		uncore_pmu_enable_fixed_event(event);
+	else
+		wrmsrl(hwc->config_base + hwc->idx, hwc->config | UNCORE_EVENTSEL_ENABLE);
+}
+
+static void uncore_pmu_disable_fixed_event(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	wrmsrl(hwc->config_base, 0);
+}
+
+static void uncore_pmu_disable_event(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (hwc->idx == UNCORE_FIXED_EVENT_IDX)
+		uncore_pmu_disable_fixed_event(event);
+	else
+		wrmsrl(hwc->config_base + hwc->idx, hwc->config);
+}
+
+static void uncore_pmu_enable_all(int nmi_core)
+{
+	u64 ctrl;
+
+	ctrl = ((1 << UNCORE_NUM_GENERAL_COUNTERS) - 1) | MSR_UNCORE_PERF_GLOBAL_CTRL_EN_FC0;
+
+	/* Route all interrupts to the first core that accesses uncore */
+	ctrl |= 1ULL << (48 + nmi_core);
+
+	wrmsrl(MSR_UNCORE_PERF_GLOBAL_CTRL, ctrl);
+}
+
+static void uncore_pmu_disable_all(void)
+{
+	wrmsrl(MSR_UNCORE_PERF_GLOBAL_CTRL, 0);
+}
+
+static void uncore_perf_event_destroy(struct perf_event *event)
+{
+	atomic_dec(&active_uncore_events);
+}
+
+static int uncore_pmu_event_init(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (!uncore_pmu_initialized)
+		return -ENOENT;
+
+	switch (event->attr.type) {
+	case PERF_TYPE_UNCORE:
+		/*
+		 * The uncore PMU measures at all privilege levels all the time,
+		 * so it doesn't make sense to specify any exclude bits.
+		 */
+		if (event->attr.exclude_user || event->attr.exclude_kernel
+			|| event->attr.exclude_hv || event->attr.exclude_idle)
+			return -ENOENT;
+		break;
+
+	default:
+		return -ENOENT;
+	}
+
+	if (!hwc->sample_period) {
+		hwc->sample_period = (1ULL << UNCORE_CNTVAL_BITS) - 1;
+		hwc->last_period = hwc->sample_period;
+		local64_set(&hwc->period_left, hwc->sample_period);
+	}
+
+	atomic_inc(&active_uncore_events);
+
+	event->destroy = uncore_perf_event_destroy;
+
+	hwc->idx = -1;
+	hwc->config = (event->attr.config & UNCORE_RAW_EVENT_MASK) | UNCORE_EVENTSEL_PMI;
+	if ((hwc->config & UNCORE_EVENTSEL_EVENT) == UNCORE_FIXED_EVENT) {
+		hwc->config_base = MSR_UNCORE_FIXED_CTR_CTRL;
+		hwc->event_base = MSR_UNCORE_FIXED_CTR0;
+	} else {
+		hwc->config_base = MSR_UNCORE_PERFEVTSEL0;
+		hwc->event_base = MSR_UNCORE_PMC0;
+	}
+
+	return 0;
+}
+
+static int
+uncore_perf_event_set_period(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	s64 left = local64_read(&hwc->period_left);
+	s64 period = hwc->sample_period;
+	u64 max_period = (1ULL << UNCORE_CNTVAL_BITS) - 1;
+	int ret = 0, idx = hwc->idx;
+
+	/*
+	 * If we are way outside a reasonable range then just skip forward:
+	 */
+	if (unlikely(left <= -period)) {
+		left = period;
+		local64_set(&hwc->period_left, left);
+		hwc->last_period = period;
+		ret = 1;
+	}
+
+	if (unlikely(left <= 0)) {
+		left += period;
+		local64_set(&hwc->period_left, left);
+		hwc->last_period = period;
+		ret = 1;
+	}
+
+	if (left > max_period)
+		left = max_period;
+
+	/*
+	 * The hw event starts counting from this event offset,
+	 * mark it to be able to extract future deltas:
+	 */
+	local64_set(&hwc->prev_count, (u64)-left);
+
+	if (idx == UNCORE_FIXED_EVENT_IDX)
+		idx = 0;
+	wrmsrl(hwc->event_base + idx, (u64)(-left) & max_period);
+
+	perf_event_update_userpage(event);
+
+	return ret;
+}
+
+static void uncore_pmu_start(struct perf_event *event, int flags)
+{
+	if (flags & PERF_EF_RELOAD)
+		uncore_perf_event_set_period(event);
+
+	uncore_pmu_enable_event(event);
+
+	perf_event_update_userpage(event);
+}
+
+static void uncore_pmu_stop(struct perf_event *event, int flags)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
+
+	uncore_pmu_disable_event(event);
+
+	if (flags & PERF_EF_UPDATE) {
+		if (idx == UNCORE_FIXED_EVENT_IDX)
+			hwc->idx = 0;
+		x86_perf_event_update(event, UNCORE_CNTVAL_BITS);
+		hwc->idx = idx;
+	}
+}
+
+static int uncore_pmu_add(struct perf_event *event, int flags)
+{
+	struct cpu_uncore_events *cpuc = &__get_cpu_var(cpu_uncore_events);
+	struct intel_uncore *uncore = cpuc->intel_uncore;
+	int nmi_core;
+	int ret = 1;
+	int i = 0, fixed = 0;
+
+	spin_lock(&uncore->lock);
+
+	if ((event->attr.config & UNCORE_EVENTSEL_EVENT) == UNCORE_FIXED_EVENT) {
+		i = UNCORE_FIXED_EVENT_IDX;
+		fixed = 1;
+	}
+	for (; i < X86_PMC_IDX_MAX; i++) {
+		if (!fixed && i == UNCORE_NUM_GENERAL_COUNTERS)
+			break;
+		if (!uncore->events[i]) {
+			uncore->events[i] = event;
+			uncore->n_events++;
+
+			event->hw.idx = i;
+			__set_bit(i, uncore->active_mask);
+			if (flags & PERF_EF_START)
+				uncore_pmu_start(event, PERF_EF_RELOAD);
+			ret = 0;
+			break;
+		}
+
+		if (i == UNCORE_FIXED_EVENT_IDX)
+			break;
+	}
+
+	if (uncore->n_events == 1) {
+		nmi_core = topology_core_id(raw_smp_processor_id());
+		uncore->nmi_core = nmi_core;
+		uncore_pmu_enable_all(nmi_core);
+	}
+
+	spin_unlock(&uncore->lock);
+
+	return ret;
+}
+
+static void uncore_pmu_del(struct perf_event *event, int flags)
+{
+	struct cpu_uncore_events *cpuc = &__get_cpu_var(cpu_uncore_events);
+	struct intel_uncore *uncore = cpuc->intel_uncore;
+	struct hw_perf_event *hwc = &event->hw;
+	int i;
+
+	spin_lock(&uncore->lock);
+
+	for (i = 0; i < X86_PMC_IDX_MAX; i++) {
+		if (uncore->events[i] == event) {
+			uncore->events[hwc->idx] = NULL;
+			uncore->n_events--;
+
+			__clear_bit(i, uncore->active_mask);
+			uncore_pmu_stop(event, PERF_EF_UPDATE);
+			break;
+		}
+	}
+
+	if (uncore->n_events == 0)
+		uncore_pmu_disable_all();
+
+	spin_unlock(&uncore->lock);
+}
+
+static void uncore_pmu_read(struct perf_event *event)
+{
+	x86_perf_event_update(event, UNCORE_CNTVAL_BITS);
+}
+
+static struct pmu uncore_pmu = {
+	.event_init	= uncore_pmu_event_init,
+	.add		= uncore_pmu_add,
+	.del		= uncore_pmu_del,
+	.start		= uncore_pmu_start,
+	.stop		= uncore_pmu_stop,
+	.read		= uncore_pmu_read,
+};
+
+
+static inline u64 uncore_pmu_get_status(void)
+{
+	u64 status;
+
+	rdmsrl(MSR_UNCORE_PERF_GLOBAL_STATUS, status);
+
+	return status;
+}
+
+static inline void uncore_pmu_ack_status(u64 ack)
+{
+	wrmsrl(MSR_UNCORE_PERF_GLOBAL_OVF_CTRL, ack);
+}
+
+static int uncore_pmu_save_and_restart(struct perf_event *event)
+{
+	x86_perf_event_update(event, UNCORE_CNTVAL_BITS);
+	return uncore_perf_event_set_period(event);
+}
+
+static int uncore_pmu_handle_irq(struct pt_regs *regs)
+{
+	struct cpu_uncore_events *cpuc = &__get_cpu_var(cpu_uncore_events);
+	struct intel_uncore *uncore = cpuc->intel_uncore;
+	struct perf_sample_data data;
+	int bit;
+	u64 status;
+	int handled = 0;
+
+	uncore_pmu_disable_all();
+
+	status = uncore_pmu_get_status();
+	if (!status) {
+		uncore_pmu_enable_all(uncore->nmi_core);
+		return 0;
+	}
+
+again:
+	uncore_pmu_ack_status(status);
+
+	for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
+		struct perf_event *event = uncore->events[bit];
+
+		handled++;
+
+		if (!test_bit(bit, uncore->active_mask))
+			continue;
+
+		if (!uncore_pmu_save_and_restart(event))
+			continue;
+
+		data.period = event->hw.last_period;
+
+		if (perf_event_overflow(event, 1, &data, regs))
+			uncore_pmu_stop(event, 0);
+	}
+
+	/*
+	 * Repeat if there is more work to be done:
+	 */
+	status = uncore_pmu_get_status();
+	if (status)
+		goto again;
+
+	uncore_pmu_enable_all(uncore->nmi_core);
+	return handled;
+}
+
+/* Copy from perf_event_nmi_handler */
+
+static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_uncore_nmi);
+
+static int __kprobes
+perf_event_uncore_nmi_handler(struct notifier_block *self,
+			 unsigned long cmd, void *__args)
+{
+	struct die_args *args = __args;
+	unsigned int this_nmi;
+	int handled;
+
+	if (!atomic_read(&active_uncore_events))
+		return NOTIFY_DONE;
+
+	switch (cmd) {
+	case DIE_NMI:
+	case DIE_NMI_IPI:
+		break;
+	case DIE_NMIUNKNOWN:
+		this_nmi = percpu_read(irq_stat.__nmi_count);
+		if (this_nmi != __get_cpu_var(pmu_uncore_nmi).marked)
+			/* let the kernel handle the unknown nmi */
+			return NOTIFY_DONE;
+		/*
+		 * This one is a PMU back-to-back nmi. Two events
+		 * trigger 'simultaneously' raising two back-to-back
+		 * NMIs. If the first NMI handles both, the latter
+		 * will be empty and daze the CPU. So, we drop it to
+		 * avoid false-positive 'unknown nmi' messages.
+		 */
+		return NOTIFY_STOP;
+	default:
+		return NOTIFY_DONE;
+	}
+
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+
+	handled = uncore_pmu_handle_irq(args->regs);
+	if (!handled)
+		return NOTIFY_DONE;
+
+	this_nmi = percpu_read(irq_stat.__nmi_count);
+	if ((handled > 1) ||
+		/* the next nmi could be a back-to-back nmi */
+	    ((__get_cpu_var(pmu_uncore_nmi).marked == this_nmi) &&
+	     (__get_cpu_var(pmu_uncore_nmi).handled > 1))) {
+		/*
+		 * We could have two subsequent back-to-back nmis: The
+		 * first handles more than one counter, the 2nd
+		 * handles only one counter and the 3rd handles no
+		 * counter.
+		 *
+		 * This is the 2nd nmi because the previous was
+		 * handling more than one counter. We will mark the
+		 * next (3rd) and then drop it if unhandled.
+		 */
+		__get_cpu_var(pmu_uncore_nmi).marked	= this_nmi + 1;
+		__get_cpu_var(pmu_uncore_nmi).handled	= handled;
+	}
+
+	return NOTIFY_STOP;
+}
+
+static __read_mostly struct notifier_block perf_event_uncore_nmi_notifier = {
+	.notifier_call		= perf_event_uncore_nmi_handler,
+	.next			= NULL,
+	.priority		= 1
+};
+
+static struct intel_uncore *alloc_uncore(int cpu, int uncore_id)
+{
+	struct intel_uncore *uncore;
+
+	uncore = kmalloc_node(sizeof(struct intel_uncore), GFP_KERNEL | __GFP_ZERO,
+			  cpu_to_node(cpu));
+	if (!uncore)
+		return NULL;
+
+	uncore->id = uncore_id;
+	spin_lock_init(&uncore->lock);
+
+	return uncore;
+}
+
+static int uncore_pmu_cpu_prepare(int cpu)
+{
+	struct cpu_uncore_events *cpuc = &per_cpu(cpu_uncore_events, cpu);
+
+	WARN_ON_ONCE(cpuc->intel_uncore);
+
+	if (boot_cpu_data.x86_max_cores < 2)
+		return NOTIFY_OK;
+
+	cpuc->intel_uncore = alloc_uncore(cpu, -1);
+	if (!cpuc->intel_uncore)
+		return NOTIFY_BAD;
+
+	return NOTIFY_OK;
+}
+
+static void uncore_pmu_cpu_starting(int cpu)
+{
+	struct cpu_uncore_events *cpuc = &per_cpu(cpu_uncore_events, cpu);
+	struct intel_uncore *uncore;
+	int i, uncore_id;
+	u64 val;
+
+	if (boot_cpu_data.x86_max_cores < 2)
+		return;
+
+	/*
+	 * PMI delivery due to an uncore counter overflow is enabled by
+	 * setting IA32_DEBUG_CTL.Offcore_PMI_EN to 1.
+	 */
+	rdmsrl(MSR_IA32_DEBUGCTLMSR, val);
+	wrmsrl(MSR_IA32_DEBUGCTLMSR, val | DEBUGCTLMSR_ENABLE_UNCORE_PMI);
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+
+	uncore_id = topology_physical_package_id(cpu);
+	WARN_ON_ONCE(uncore_id == BAD_APICID);
+
+	raw_spin_lock(&intel_uncore_lock);
+
+	for_each_online_cpu(i) {
+		uncore = per_cpu(cpu_uncore_events, i).intel_uncore;
+		if (WARN_ON_ONCE(!uncore))
+			continue;
+
+		if (uncore->id == uncore_id) {
+			kfree(cpuc->intel_uncore);
+			cpuc->intel_uncore = uncore;
+			break;
+		}
+	}
+
+	cpuc->intel_uncore->id = uncore_id;
+	cpuc->intel_uncore->refcnt++;
+
+	raw_spin_unlock(&intel_uncore_lock);
+}
+
+static void uncore_pmu_cpu_dead(int cpu)
+{
+	struct cpu_uncore_events *cpuhw;
+
+	if (boot_cpu_data.x86_max_cores < 2)
+		return;
+
+	cpuhw = &per_cpu(cpu_uncore_events, cpu);
+
+	raw_spin_lock(&intel_uncore_lock);
+
+	if (cpuhw->intel_uncore) {
+		struct intel_uncore *uncore = cpuhw->intel_uncore;
+
+		if (uncore->id == -1 || --uncore->refcnt == 0)
+			kfree(uncore);
+
+		cpuhw->intel_uncore = NULL;
+	}
+
+	raw_spin_unlock(&intel_uncore_lock);
+}
+
+static int __cpuinit
+uncore_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (long)hcpu;
+	int ret = NOTIFY_OK;
+
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_UP_PREPARE:
+		ret = uncore_pmu_cpu_prepare(cpu);
+		break;
+
+	case CPU_STARTING:
+		uncore_pmu_cpu_starting(cpu);
+		break;
+
+	case CPU_DYING:
+		uncore_pmu_cpu_dead(cpu);
+		break;
+
+	default:
+		break;
+	}
+
+	return ret;
+}
+
+void __init init_uncore_pmu(void)
+{
+	u8 family, model;
+
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+		return;
+
+	/* Check CPUID signatures: 06_1AH, 06_1EH, 06_1FH */
+	family = boot_cpu_data.x86;
+	model = boot_cpu_data.x86_model;
+	if (family != 6 || (model != 0x1A && model != 0x1E && model != 0x1F))
+		return;
+
+	pr_cont("Nehalem uncore pmu, ");
+
+	perf_pmu_register(&uncore_pmu);
+	register_die_notifier(&perf_event_uncore_nmi_notifier);
+	perf_cpu_notifier(uncore_pmu_notifier);
+	uncore_pmu_initialized = true;
+	return;
+}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
new file mode 100644
index 0000000..03266a1
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -0,0 +1,69 @@
+#include <linux/perf_event.h>
+#include <linux/kprobes.h>
+#include <linux/hardirq.h>
+#include <linux/slab.h>
+
+#define MSR_UNCORE_PERF_GLOBAL_CTRL	0x391
+#define MSR_UNCORE_PERF_GLOBAL_STATUS	0x392
+#define MSR_UNCORE_PERF_GLOBAL_OVF_CTRL	0x393
+#define MSR_UNCORE_FIXED_CTR0		0x394
+#define MSR_UNCORE_FIXED_CTR_CTRL	0x395
+#define MSR_UNCORE_ADDR_OPCODE_MATCH	0x396
+
+#define MSR_UNCORE_PMC0			0x3b0
+
+#define MSR_UNCORE_PERFEVTSEL0		0x3c0
+
+#define MSR_UNCORE_PERF_GLOBAL_CTRL_EN_FC0 (1ULL << 32)
+#define MSR_UNCORE_PERF_GLOBAL_CTRL_PMI_CORE0 (1ULL << 48)
+#define MSR_UNCORE_PERF_GLOBAL_CTRL_PMI_FRZ (1ULL << 63)
+
+#define MSR_UNCORE_PERF_GLOBAL_STATUS_OVF_PMI	(1ULL << 61)
+#define MSR_UNCORE_PERF_GLOBAL_STATUS_CHG       (1ULL << 63)
+
+#define MSR_UNCORE_FIXED_EN	(1ULL << 0)
+#define MSR_UNCORE_FIXED_PMI	(1ULL << 2)
+
+#define UNCORE_EVENTSEL_EVENT			0x000000FFULL
+#define UNCORE_EVENTSEL_UMASK			0x0000FF00ULL
+#define UNCORE_EVENTSEL_OCC_CTR_RST		(1ULL << 17)
+#define UNCORE_EVENTSEL_EDGE			(1ULL << 18)
+#define UNCORE_EVENTSEL_PMI			(1ULL << 20)
+#define UNCORE_EVENTSEL_ENABLE			(1ULL << 22)
+#define UNCORE_EVENTSEL_INV			(1ULL << 23)
+#define UNCORE_EVENTSEL_CMASK			0xFF000000ULL
+
+#define UNCORE_RAW_EVENT_MASK		\
+	(UNCORE_EVENTSEL_EVENT |	\
+	 UNCORE_EVENTSEL_UMASK |	\
+	 UNCORE_EVENTSEL_EDGE  |	\
+	 UNCORE_EVENTSEL_INV   |	\
+	 UNCORE_EVENTSEL_CMASK)
+
+#define UNCORE_CNTVAL_BITS	48
+
+/* 8 general purpose counters + 1 fixed-function counter */
+#define UNCORE_NUM_GENERAL_COUNTERS 8
+#define UNCORE_NUM_FIXED_COUNTERS 1
+#define UNCORE_NUM_COUNTERS (UNCORE_NUM_GENERAL_COUNTERS + UNCORE_NUM_FIXED_COUNTERS)
+
+/* TBD: fix event config value passed by userspace */
+#define UNCORE_FIXED_EVENT 0xFF
+#define UNCORE_FIXED_EVENT_IDX 32
+
+struct intel_uncore {
+	int id;  /* uncore id */
+	int refcnt; /* reference count */
+
+	struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
+	unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+	int n_events;
+	int nmi_core; /* the core to handle NMI */
+	struct spinlock lock;
+};
+
+struct cpu_uncore_events {
+	struct intel_uncore *intel_uncore;
+};
+
+extern u64 x86_perf_event_update(struct perf_event *event, int cntval_bits);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index adf6d99..cf2cb49 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -32,6 +32,7 @@ enum perf_type_id {
 	PERF_TYPE_HW_CACHE			= 3,
 	PERF_TYPE_RAW				= 4,
 	PERF_TYPE_BREAKPOINT			= 5,
+	PERF_TYPE_UNCORE			= 6,
 
 	PERF_TYPE_MAX,				/* non-ABI */
 };
-- 
1.5.3






* Re: [RFC PATCH 2/3 v3] perf: Implement Nehalem uncore pmu
  2010-12-02  5:20 [RFC PATCH 2/3 v3] perf: Implement Nehalem uncore pmu Lin Ming
@ 2010-12-02  5:57 ` Lin Ming
  2010-12-07  6:15 ` Lin Ming
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 26+ messages in thread
From: Lin Ming @ 2010-12-02  5:57 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Stephane Eranian, Andi Kleen, Ingo Molnar, Frederic Weisbecker,
	Arjan van de Ven, lkml

On Thu, 2010-12-02 at 13:20 +0800, Lin Ming wrote:
> Changelogs of v3:
> 
> - Allocate uncore data with kmalloc_node, like AMD NB stuff. (Peter
> Zijlstra)
> 
> - per-task uncore event is not allowed. Simply set pmu::task_ctx_nr =
> perf_invalid_context. (Peter Zijlstra)

<snip>

> +
> +static struct pmu uncore_pmu = {
> +	.event_init	= uncore_pmu_event_init,
> +	.add		= uncore_pmu_add,
> +	.del		= uncore_pmu_del,
> +	.start		= uncore_pmu_start,
> +	.stop		= uncore_pmu_stop,
> +	.read		= uncore_pmu_read,
> +};

Sorry, I sent out an old version; the additional code below is needed
to disallow per-task uncore events.

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index d2c10d8..d2a22ba 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -252,6 +252,8 @@ static void uncore_pmu_read(struct perf_event *event)
 }
 
 static struct pmu uncore_pmu = {
+	.task_ctx_nr	= perf_invalid_context, /* per-task uncore event is not allowed */
+
 	.event_init	= uncore_pmu_event_init,
 	.add		= uncore_pmu_add,
 	.del		= uncore_pmu_del,




* Re: [RFC PATCH 2/3 v3] perf: Implement Nehalem uncore pmu
  2010-12-02  5:20 [RFC PATCH 2/3 v3] perf: Implement Nehalem uncore pmu Lin Ming
  2010-12-02  5:57 ` Lin Ming
@ 2010-12-07  6:15 ` Lin Ming
  2010-12-09 19:17   ` Peter Zijlstra
  2010-12-09 19:21 ` Peter Zijlstra
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 26+ messages in thread
From: Lin Ming @ 2010-12-07  6:15 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Stephane Eranian, Andi Kleen, Ingo Molnar, Frederic Weisbecker,
	Arjan van de Ven, lkml

On Thu, 2010-12-02 at 13:20 +0800, Lin Ming wrote:
> +
> +static int
> +uncore_perf_event_set_period(struct perf_event *event)
> +{
> +	struct hw_perf_event *hwc = &event->hw;
> +	s64 left = local64_read(&hwc->period_left);
> +	s64 period = hwc->sample_period;
> +	u64 max_period = (1ULL << UNCORE_CNTVAL_BITS) - 1;
> +	int ret = 0, idx = hwc->idx;
> +
> +	/*
> +	 * If we are way outside a reasonable range then just skip forward:
> +	 */
> +	if (unlikely(left <= -period)) {
> +		left = period;
> +		local64_set(&hwc->period_left, left);
> +		hwc->last_period = period;
> +		ret = 1;
> +	}
> +
> +	if (unlikely(left <= 0)) {
> +		left += period;
> +		local64_set(&hwc->period_left, left);
> +		hwc->last_period = period;
> +		ret = 1;
> +	}
> +
> +	if (left > max_period)
> +		left = max_period;
> +
> +	/*
> +	 * The hw event starts counting from this event offset,
> +	 * mark it to be able to extra future deltas:
> +	 */
> +	local64_set(&hwc->prev_count, (u64)-left);

All uncore pmu interrupts from a socket are routed to one of the four
cores, so local64_set seems not correct here.

But hwc->prev_count is defined as local64_t, any idea how to set it
correctly?

Or is it OK if local64_set is always executed in the same cpu?





* Re: [RFC PATCH 2/3 v3] perf: Implement Nehalem uncore pmu
  2010-12-07  6:15 ` Lin Ming
@ 2010-12-09 19:17   ` Peter Zijlstra
  0 siblings, 0 replies; 26+ messages in thread
From: Peter Zijlstra @ 2010-12-09 19:17 UTC (permalink / raw)
  To: Lin Ming
  Cc: Stephane Eranian, Andi Kleen, Ingo Molnar, Frederic Weisbecker,
	Arjan van de Ven, lkml

On Tue, 2010-12-07 at 14:15 +0800, Lin Ming wrote:
> On Thu, 2010-12-02 at 13:20 +0800, Lin Ming wrote:
> > +
> > +static int
> > +uncore_perf_event_set_period(struct perf_event *event)
> > +{
> > +	struct hw_perf_event *hwc = &event->hw;
> > +	s64 left = local64_read(&hwc->period_left);
> > +	s64 period = hwc->sample_period;
> > +	u64 max_period = (1ULL << UNCORE_CNTVAL_BITS) - 1;
> > +	int ret = 0, idx = hwc->idx;
> > +
> > +	/*
> > +	 * If we are way outside a reasonable range then just skip forward:
> > +	 */
> > +	if (unlikely(left <= -period)) {
> > +		left = period;
> > +		local64_set(&hwc->period_left, left);
> > +		hwc->last_period = period;
> > +		ret = 1;
> > +	}
> > +
> > +	if (unlikely(left <= 0)) {
> > +		left += period;
> > +		local64_set(&hwc->period_left, left);
> > +		hwc->last_period = period;
> > +		ret = 1;
> > +	}
> > +
> > +	if (left > max_period)
> > +		left = max_period;
> > +
> > +	/*
> > +	 * The hw event starts counting from this event offset,
> > +	 * mark it to be able to extra future deltas:
> > +	 */
> > +	local64_set(&hwc->prev_count, (u64)-left);
> 
> All uncore pmu interrupts from a socket are routed to one of the four
> cores, so local64_set seems not correct here.
> 
> But hwc->prev_count is defined as local64_t, any idea how to set it
> correctly?
> 
> Or is it OK if local64_set is always executed in the same cpu?

Yes, the local_t bits work as expected when they're always accessed by
the same cpu.


* Re: [RFC PATCH 2/3 v3] perf: Implement Nehalem uncore pmu
  2010-12-02  5:20 [RFC PATCH 2/3 v3] perf: Implement Nehalem uncore pmu Lin Ming
  2010-12-02  5:57 ` Lin Ming
  2010-12-07  6:15 ` Lin Ming
@ 2010-12-09 19:21 ` Peter Zijlstra
  2010-12-09 20:15   ` Stephane Eranian
                     ` (2 more replies)
  2010-12-09 19:24 ` Peter Zijlstra
  2011-01-13 17:14 ` Stephane Eranian
  4 siblings, 3 replies; 26+ messages in thread
From: Peter Zijlstra @ 2010-12-09 19:21 UTC (permalink / raw)
  To: Lin Ming
  Cc: Stephane Eranian, Andi Kleen, Ingo Molnar, Frederic Weisbecker,
	Arjan van de Ven, lkml

On Thu, 2010-12-02 at 13:20 +0800, Lin Ming wrote:
> +       /* Check CPUID signatures: 06_1AH, 06_1EH, 06_1FH */
> +       family = boot_cpu_data.x86;
> +       model = boot_cpu_data.x86_model;
> +       if (family != 6 || (model != 0x1A && model != 0x1E && model != 0x1F))
> +               return; 

So that's 26, 30 and 31? Curiously
arch/x86/kernel/cpu/perf_event_intel.c does have 31.

Also, could you match the style from perf_event_intel.c?
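
For illustration, a minimal sketch of that check rewritten in the switch-on-model
style used by intel_pmu_init(); the case values below simply mirror the three
models quoted above and are not meant to be an exhaustive list:

	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
	    boot_cpu_data.x86 != 6)
		return;

	switch (boot_cpu_data.x86_model) {
	case 26: /* 06_1AH - Nehalem-EP */
	case 30: /* 06_1EH - Nehalem */
	case 31: /* 06_1FH - Nehalem */
		break;
	default:
		return;
	}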


* Re: [RFC PATCH 2/3 v3] perf: Implement Nehalem uncore pmu
  2010-12-02  5:20 [RFC PATCH 2/3 v3] perf: Implement Nehalem uncore pmu Lin Ming
                   ` (2 preceding siblings ...)
  2010-12-09 19:21 ` Peter Zijlstra
@ 2010-12-09 19:24 ` Peter Zijlstra
  2010-12-10  8:28   ` Lin Ming
  2011-01-13 17:14 ` Stephane Eranian
  4 siblings, 1 reply; 26+ messages in thread
From: Peter Zijlstra @ 2010-12-09 19:24 UTC (permalink / raw)
  To: Lin Ming
  Cc: Stephane Eranian, Andi Kleen, Ingo Molnar, Frederic Weisbecker,
	Arjan van de Ven, lkml

On Thu, 2010-12-02 at 13:20 +0800, Lin Ming wrote:
> +++ b/arch/x86/kernel/cpu/perf_event.c

> @@ -1362,6 +1357,7 @@ int __init init_hw_perf_events(void)
>  
>         switch (boot_cpu_data.x86_vendor) {
>         case X86_VENDOR_INTEL:
> +               init_uncore_pmu();
>                 err = intel_pmu_init();
>                 break;
>         case X86_VENDOR_AMD: 

Since you're otherwise fully separated from the regular cpu driver,
could you also grow your own early_initcall() ?
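
For illustration, a minimal sketch of what a separate early_initcall() might look
like; the wrapper name is made up, and whether early_initcall() is the right init
level for perf_pmu_register() would still need to be checked:

static int __init uncore_pmu_initcall(void)
{
	/* init_uncore_pmu() already bails out on non-Intel / non-Nehalem CPUs */
	init_uncore_pmu();
	return 0;
}
early_initcall(uncore_pmu_initcall);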


* Re: [RFC PATCH 2/3 v3] perf: Implement Nehalem uncore pmu
  2010-12-09 19:21 ` Peter Zijlstra
@ 2010-12-09 20:15   ` Stephane Eranian
  2010-12-09 20:19     ` Peter Zijlstra
  2010-12-09 23:46   ` Stephane Eranian
  2010-12-10  8:28   ` Lin Ming
  2 siblings, 1 reply; 26+ messages in thread
From: Stephane Eranian @ 2010-12-09 20:15 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Lin Ming, Andi Kleen, Ingo Molnar, Frederic Weisbecker,
	Arjan van de Ven, lkml

On Thu, Dec 9, 2010 at 8:21 PM, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> On Thu, 2010-12-02 at 13:20 +0800, Lin Ming wrote:
>> +       /* Check CPUID signatures: 06_1AH, 06_1EH, 06_1FH */
>> +       family = boot_cpu_data.x86;
>> +       model = boot_cpu_data.x86_model;
>> +       if (family != 6 || (model != 0x1A && model != 0x1E && model != 0x1F))
>> +               return;
>
> So that's 26, 30 and 31? Curiously
> arch/x86/kernel/cpu/perf_event_intel.c does have 31.
>
It is also missing model 44 (0x2c).


* Re: [RFC PATCH 2/3 v3] perf: Implement Nehalem uncore pmu
  2010-12-09 20:15   ` Stephane Eranian
@ 2010-12-09 20:19     ` Peter Zijlstra
  2010-12-09 20:27       ` Stephane Eranian
  0 siblings, 1 reply; 26+ messages in thread
From: Peter Zijlstra @ 2010-12-09 20:19 UTC (permalink / raw)
  To: Stephane Eranian
  Cc: Lin Ming, Andi Kleen, Ingo Molnar, Frederic Weisbecker,
	Arjan van de Ven, lkml

On Thu, 2010-12-09 at 21:15 +0100, Stephane Eranian wrote:
> On Thu, Dec 9, 2010 at 8:21 PM, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> > On Thu, 2010-12-02 at 13:20 +0800, Lin Ming wrote:
> >> +       /* Check CPUID signatures: 06_1AH, 06_1EH, 06_1FH */
> >> +       family = boot_cpu_data.x86;
> >> +       model = boot_cpu_data.x86_model;
> >> +       if (family != 6 || (model != 0x1A && model != 0x1E && model != 0x1F))
> >> +               return;
> >
> > So that's 26, 30 and 31? Curiously
> > arch/x86/kernel/cpu/perf_event_intel.c does have 31.

That was clearly meant to say: doesn't.. Does Intel have an exhaustive
model list somewhere?

> It is also missing model 44 (0x2c).

Right.. but if the westmere uncore is the same, then its also missing
37.

The -EX chips have a different uncore, right?


* Re: [RFC PATCH 2/3 v3] perf: Implement Nehalem uncore pmu
  2010-12-09 20:19     ` Peter Zijlstra
@ 2010-12-09 20:27       ` Stephane Eranian
  0 siblings, 0 replies; 26+ messages in thread
From: Stephane Eranian @ 2010-12-09 20:27 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Lin Ming, Andi Kleen, Ingo Molnar, Frederic Weisbecker,
	Arjan van de Ven, lkml

On Thu, Dec 9, 2010 at 9:19 PM, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> On Thu, 2010-12-09 at 21:15 +0100, Stephane Eranian wrote:
>> On Thu, Dec 9, 2010 at 8:21 PM, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
>> > On Thu, 2010-12-02 at 13:20 +0800, Lin Ming wrote:
>> >> +       /* Check CPUID signatures: 06_1AH, 06_1EH, 06_1FH */
>> >> +       family = boot_cpu_data.x86;
>> >> +       model = boot_cpu_data.x86_model;
>> >> +       if (family != 6 || (model != 0x1A && model != 0x1E && model != 0x1F))
>> >> +               return;
>> >
>> > So that's 26, 30 and 31? Curiously
>> > arch/x86/kernel/cpu/perf_event_intel.c does have 31.
>
> That was clearly meant to say: doesn't.. Does Intel have an exhaustive
> model list somewhere?
>
>> It is also missing model 44 (0x2c).
>
Uncore logic is the same on NHM and WSM.
Events may be different, but I have not seen a
public document that describes that.

> Right.. but if the westmere uncore is the same, then its also missing
> 37.
>
yes, 37 is also missing.

> The -EX chips have a different uncore, right?
>
True. I don't know if the programming is completely different
but it would not surprise me.
Documentation is at:
http://www.intel.com/assets/en_US/pdf/designguide/323535.pdf


* Re: [RFC PATCH 2/3 v3] perf: Implement Nehalem uncore pmu
  2010-12-09 19:21 ` Peter Zijlstra
  2010-12-09 20:15   ` Stephane Eranian
@ 2010-12-09 23:46   ` Stephane Eranian
  2010-12-10  8:31     ` Lin Ming
  2010-12-10 10:47     ` Peter Zijlstra
  2010-12-10  8:28   ` Lin Ming
  2 siblings, 2 replies; 26+ messages in thread
From: Stephane Eranian @ 2010-12-09 23:46 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Lin Ming, Andi Kleen, Ingo Molnar, Frederic Weisbecker,
	Arjan van de Ven, lkml

Hi,

So I have tested this patch a bit on WSM and as I expected there
are issues with sampling.

When HT is on, both siblings CPUs get the interrupt. The HW does not
allow you to only point interrupts to a single HT thread (CPU).

I did verify that indeed both threads get the interrupt and that you have a
race condition. Both sibling CPUs stop uncore, get the status. They may get
the same overflow status. Both will pass the uncore->active_mask because
it's shared among siblings cores. Thus,  you have a race for the whole
interrupt handler execution.

You need some serialization in there. But the patch does not address this.
The problem is different from the back-to-back interrupt issue that
Don worked on.
The per-cpu marked/handled trick cannot work to avoid this problem.

You cannot simply say "the lowest indexed" CPU of a sibling pair
handles the interrupt
because you don't know if this in an uncore intr, core interrupt or
something else. You
need to check. That means each HT thread needs to check uncore
ovfl_status. IF the
status is zero, then return. Otherwise, you need to do a 2nd level
check before you can
execute the handler. You need to know if the sibling CPU has already
"consumed" that
interrupt.

I think you need some sort of generation counter per physical core and
per HT thread.
On interrupt, you could do something along the line of:
      if (mycpu->intr_count == mysibling->intr_count) {
          then mycpu->intr_count++
          execute intr_handler()
      } else {
          mycpu->intr_count++
          return;
      }
Of course, the above needs some atomicity and ad locking (but I don't
think you can
use locks in NMI context).

This makes me wonder if vectoring uncore to NMI is really needed,
given you cannot
correlated to an IP, incl. a kernel IP. If we were to vector to a
dedicated (lower prio)
vector, then we could use the trick of saying the lowest indexed CPU in a pair
handles the interrupt (because we would already know this is an uncore
interrupt).
This would be much simpler. Price: not samples in kernel's critical
section. But those
are useless anyway with uncore events.

- uncore_get_status().
  PERF_GLOBAL_STATUS contains more than overflow
  status; bits 61-63 need to be masked off (see the sketch after these
  comments).

- uncore_pmu_cpu_prepare()
  I don't understand the x86_max_cores < 2 test. If you run your
  NHM/WSM in single-core mode, you still have the uncore to deal with;
  thus, you need cpuc->intel_uncore initialized, don't you?

- I assume that the reason the uncore->refcnt management is not
  using atomic ops is that the whole CPU hotplug path is serialized, right?
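
A minimal sketch of the masking mentioned in the first comment above; the mask
name is made up for illustration:

#define UNCORE_GLOBAL_STATUS_NON_OVF_BITS	(7ULL << 61)	/* bits 61-63 */

static inline u64 uncore_pmu_get_status(void)
{
	u64 status;

	rdmsrl(MSR_UNCORE_PERF_GLOBAL_STATUS, status);

	/* keep only the per-counter overflow bits */
	return status & ~UNCORE_GLOBAL_STATUS_NON_OVF_BITS;
}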


* Re: [RFC PATCH 2/3 v3] perf: Implement Nehalem uncore pmu
  2010-12-09 19:21 ` Peter Zijlstra
  2010-12-09 20:15   ` Stephane Eranian
  2010-12-09 23:46   ` Stephane Eranian
@ 2010-12-10  8:28   ` Lin Ming
  2 siblings, 0 replies; 26+ messages in thread
From: Lin Ming @ 2010-12-10  8:28 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Stephane Eranian, Andi Kleen, Ingo Molnar, Frederic Weisbecker,
	Arjan van de Ven, lkml

On Fri, 2010-12-10 at 03:21 +0800, Peter Zijlstra wrote:
> On Thu, 2010-12-02 at 13:20 +0800, Lin Ming wrote:
> > +       /* Check CPUID signatures: 06_1AH, 06_1EH, 06_1FH */
> > +       family = boot_cpu_data.x86;
> > +       model = boot_cpu_data.x86_model;
> > +       if (family != 6 || (model != 0x1A && model != 0x1E && model != 0x1F))
> > +               return; 
> 
> So that's 26, 30 and 31? Curiously
> arch/x86/kernel/cpu/perf_event_intel.c does have 31.
> 
> Also, could you match the style from perf_event_intel.c?

OK.




* Re: [RFC PATCH 2/3 v3] perf: Implement Nehalem uncore pmu
  2010-12-09 19:24 ` Peter Zijlstra
@ 2010-12-10  8:28   ` Lin Ming
  0 siblings, 0 replies; 26+ messages in thread
From: Lin Ming @ 2010-12-10  8:28 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Stephane Eranian, Andi Kleen, Ingo Molnar, Frederic Weisbecker,
	Arjan van de Ven, lkml

On Fri, 2010-12-10 at 03:24 +0800, Peter Zijlstra wrote:
> On Thu, 2010-12-02 at 13:20 +0800, Lin Ming wrote:
> > +++ b/arch/x86/kernel/cpu/perf_event.c
> 
> > @@ -1362,6 +1357,7 @@ int __init init_hw_perf_events(void)
> >  
> >         switch (boot_cpu_data.x86_vendor) {
> >         case X86_VENDOR_INTEL:
> > +               init_uncore_pmu();
> >                 err = intel_pmu_init();
> >                 break;
> >         case X86_VENDOR_AMD: 
> 
> Since you're otherwise fully separated from the regular cpu driver,
> could you also grow your own early_initcall() ?

OK, will use early_initcall().




* Re: [RFC PATCH 2/3 v3] perf: Implement Nehalem uncore pmu
  2010-12-09 23:46   ` Stephane Eranian
@ 2010-12-10  8:31     ` Lin Ming
  2010-12-10 10:47     ` Peter Zijlstra
  1 sibling, 0 replies; 26+ messages in thread
From: Lin Ming @ 2010-12-10  8:31 UTC (permalink / raw)
  To: Stephane Eranian
  Cc: Peter Zijlstra, Andi Kleen, Ingo Molnar, Frederic Weisbecker,
	Arjan van de Ven, lkml

Thanks for your great comments.

Let me read it carefully, and then reply back.

Lin Ming

On Fri, 2010-12-10 at 07:46 +0800, Stephane Eranian wrote:
> Hi,
> 
> So I have tested this patch a bit on WSM and as I expected there
> are issues with sampling.
> 
> When HT is on, both siblings CPUs get the interrupt. The HW does not
> allow you to only point interrupts to a single HT thread (CPU).
> 
> I did verify that indeed both threads get the interrupt and that you have a
> race condition. Both sibling CPUs stop uncore, get the status. They may get
> the same overflow status. Both will pass the uncore->active_mask because
> it's shared among siblings cores. Thus,  you have a race for the whole
> interrupt handler execution.
> 
> You need some serialization in there. But the patch does not address this.
> The problem is different from the back-to-back interrupt issue that
> Don worked on.
> The per-cpu marked/handled trick cannot work to avoid this problem.
> 
> You cannot simply say "the lowest indexed" CPU of a sibling pair
> handles the interrupt
> because you don't know if this in an uncore intr, core interrupt or
> something else. You
> need to check. That means each HT thread needs to check uncore
> ovfl_status. IF the
> status is zero, then return. Otherwise, you need to do a 2nd level
> check before you can
> execute the handler. You need to know if the sibling CPU has already
> "consumed" that
> interrupt.
> 
> I think you need some sort of generation counter per physical core and
> per HT thread.
> On interrupt, you could do something along the line of:
>       if (mycpu->intr_count == mysibling->intr_count) {
>           then mycpu->intr_count++
>           execute intr_handler()
>       } else {
>           mycpu->intr_count++
>           return;
>       }
> Of course, the above needs some atomicity and ad locking (but I don't
> think you can
> use locks in NMI context).
> 
> This makes me wonder if vectoring uncore to NMI is really needed,
> given you cannot
> correlated to an IP, incl. a kernel IP. If we were to vector to a
> dedicated (lower prio)
> vector, then we could use the trick of saying the lowest indexed CPU in a pair
> handles the interrupt (because we would already know this is an uncore
> interrupt).
> This would be much simpler. Price: not samples in kernel's critical
> section. But those
> are useless anyway with uncore events.
> 
> - uncore_get_status().
>   PERF_GLOBAL_STATUS contains more than overflow
>   status, bit 61-63 need to be masked off.
> 
> - uncore_pmu_cpu_prepare()
>   I don't understand the x86_max_cores < 2 test. If you run your
>   NHM/WSM is single core mode, you still have uncore to deal with
>   thus, you need cpuc->intel_uncore initialized, don't you?
> 
> - I assume that the reason the uncore->refcnt management is not
>   using atomic ops because the whole CPU hotplug is serialized, right?




* Re: [RFC PATCH 2/3 v3] perf: Implement Nehalem uncore pmu
  2010-12-09 23:46   ` Stephane Eranian
  2010-12-10  8:31     ` Lin Ming
@ 2010-12-10 10:47     ` Peter Zijlstra
  2010-12-10 10:52       ` Peter Zijlstra
  2010-12-11  5:49       ` Stephane Eranian
  1 sibling, 2 replies; 26+ messages in thread
From: Peter Zijlstra @ 2010-12-10 10:47 UTC (permalink / raw)
  To: Stephane Eranian
  Cc: Lin Ming, Andi Kleen, Ingo Molnar, Frederic Weisbecker,
	Arjan van de Ven, lkml

On Fri, 2010-12-10 at 00:46 +0100, Stephane Eranian wrote:
> Hi,
> 
> So I have tested this patch a bit on WSM and as I expected there
> are issues with sampling.
> 
> When HT is on, both siblings CPUs get the interrupt. The HW does not
> allow you to only point interrupts to a single HT thread (CPU).

Egads, how ugly :/

> I did verify that indeed both threads get the interrupt and that you have a
> race condition. Both sibling CPUs stop uncore, get the status. They may get
> the same overflow status. Both will pass the uncore->active_mask because
> it's shared among siblings cores. Thus,  you have a race for the whole
> interrupt handler execution.
> 
> You need some serialization in there. But the patch does not address this.
> The problem is different from the back-to-back interrupt issue that
> Don worked on.
> The per-cpu marked/handled trick cannot work to avoid this problem.
> 
> You cannot simply say "the lowest indexed" CPU of a sibling pair
> handles the interrupt
> because you don't know if this in an uncore intr, core interrupt or
> something else. You
> need to check. That means each HT thread needs to check uncore
> ovfl_status. IF the
> status is zero, then return. Otherwise, you need to do a 2nd level
> check before you can
> execute the handler. You need to know if the sibling CPU has already
> "consumed" that
> interrupt.
> 
> I think you need some sort of generation counter per physical core and
> per HT thread.
> On interrupt, you could do something along the line of:
>       if (mycpu->intr_count == mysibling->intr_count) {
>           then mycpu->intr_count++
>           execute intr_handler()
>       } else {
>           mycpu->intr_count++
>           return;
>       }
> Of course, the above needs some atomicity and ad locking 

Does that guarantee that the same sibling handles all interrupts? Since
a lot of the infrastructure uses local*_t we're not good with cross-cpu
stuff.

Damn what a mess.. we need to serialize enough for both cpus to at least
see the overflow bit.. maybe something like:


struct intel_percore {
   ...
   atomic_t uncore_barrier;
};

void uncore_barrier(void)
{
	struct intel_percore *percore = this_cpu_ptr(cpu_hw_events)->percore;
	int armed;

	armed = atomic_cmpxchg(&percore->uncore_barrier, 0, 1) == 0;
	if (armed) {
		/* we armed it, now wait for completion */
		while (atomic_read(&percore->uncore_barrier))
			cpu_relax();
	} else {
		/* our sibling must have, decrement it */
		if (atomic_cmpxchg(&percore->uncore_barrier, 1, 0) != 1)
			BUG();
	}
}

Then have something like:

handle_uncore_interrupt()
{
	u64 overflow = rdmsrl(MSR_UNCORE_PERF_GLOBAL_OVF_STATUS);
	int cpu;

	if (!overflow)
		return 0; /* not our interrupt to handle */

	uncore_barrier(); /* wait so our sibling will also observe the overflow */

	cpu = smp_processor_id();
	if (cpu != cpumask_first(topology_thread_cpumask(cpu)))
		return 1; /* our sibling will handle it, eat the NMI */

	/* OK, we've got an overflow and we're the first CPU in the thread mask */

	... do fancy stuff ...

	return 1; /* we handled it, eat the NMI */
}


> (but I don't think you can use locks in NMI context).

You can, as long as they're never used from !NMI, its icky, but it
works.

> This makes me wonder if vectoring uncore to NMI is really needed,
> given you cannot
> correlated to an IP, incl. a kernel IP. If we were to vector to a
> dedicated (lower prio)
> vector, then we could use the trick of saying the lowest indexed CPU in a pair
> handles the interrupt (because we would already know this is an uncore
> interrupt).
> This would be much simpler. Price: not samples in kernel's critical
> section. But those
> are useless anyway with uncore events.

But the uncore uses the same PMI line, right? You cannot point the
uncore to another vector. /me goes find the docs -- ok, its too early in
the morning to clearly parse that...

Besides, people asked for the sampling thing didn't they (also we need
it to fold the count to avoid overflow on 48bit). Also the PAPI people
even want per-task uncore counters because that's the only thing PAPI
can do.


* Re: [RFC PATCH 2/3 v3] perf: Implement Nehalem uncore pmu
  2010-12-10 10:47     ` Peter Zijlstra
@ 2010-12-10 10:52       ` Peter Zijlstra
  2010-12-10 15:11         ` Cyrill Gorcunov
  2010-12-11  5:49       ` Stephane Eranian
  1 sibling, 1 reply; 26+ messages in thread
From: Peter Zijlstra @ 2010-12-10 10:52 UTC (permalink / raw)
  To: Stephane Eranian
  Cc: Lin Ming, Andi Kleen, Ingo Molnar, Frederic Weisbecker,
	Arjan van de Ven, lkml, Cyrill Gorcunov

On Fri, 2010-12-10 at 11:47 +0100, Peter Zijlstra wrote:
> On Fri, 2010-12-10 at 00:46 +0100, Stephane Eranian wrote:
> > Hi,
> > 
> > So I have tested this patch a bit on WSM and as I expected there
> > are issues with sampling.
> > 
> > When HT is on, both siblings CPUs get the interrupt. The HW does not
> > allow you to only point interrupts to a single HT thread (CPU).
> 
> Egads, how ugly :/
> 
> > I did verify that indeed both threads get the interrupt and that you have a
> > race condition. Both sibling CPUs stop uncore, get the status. They may get
> > the same overflow status. Both will pass the uncore->active_mask because
> > it's shared among siblings cores. Thus,  you have a race for the whole
> > interrupt handler execution.
> > 
> > You need some serialization in there. But the patch does not address this.
> > The problem is different from the back-to-back interrupt issue that
> > Don worked on.
> > The per-cpu marked/handled trick cannot work to avoid this problem.
> > 
> > You cannot simply say "the lowest indexed" CPU of a sibling pair
> > handles the interrupt
> > because you don't know if this in an uncore intr, core interrupt or
> > something else. You
> > need to check. That means each HT thread needs to check uncore
> > ovfl_status. IF the
> > status is zero, then return. Otherwise, you need to do a 2nd level
> > check before you can
> > execute the handler. You need to know if the sibling CPU has already
> > "consumed" that
> > interrupt.
> > 
> > I think you need some sort of generation counter per physical core and
> > per HT thread.
> > On interrupt, you could do something along the line of:
> >       if (mycpu->intr_count == mysibling->intr_count) {
> >           then mycpu->intr_count++
> >           execute intr_handler()
> >       } else {
> >           mycpu->intr_count++
> >           return;
> >       }
> > Of course, the above needs some atomicity and ad locking 
> 
> Does that guarantee that the same sibling handles all interrupts? Since
> a lot of the infrastructure uses local*_t we're not good with cross-cpu
> stuff.
> 
> Damn what a mess.. we need to serialize enough for both cpus to at least
> see the overflow bit.. maybe something like:
> 
> 
> struct intel_percore {
>    ...
>    atomic_t uncore_barrier;
> };
> 
> void uncore_barrier(void)
> {
> 	struct intel_percore *percore = this_cpu_ptr(cpu_hw_events)->percore;
> 	int armed;
> 
> 	armed = atomic_cmpxchg(&percore->uncore_barrier, 0, 1) == 0;
> 	if (armed) {
> 		/* we armed, it, now wait for completion */
> 		while (atomic_read(&percore->uncore_barrier))
> 			cpu_relax();
> 	} else {
> 		/* our sibling must have, decrement it */
> 		if (atomic_cmpxchg(&percore->uncore_barrier, 1, 0) != 1)
> 			BUG();
> 	}
> }
> 
> Then have something like:
> 
> handle_uncore_interrupt()
> {
> 	u64 overflow = rdmsrl(MSR_UNCORE_PERF_GLOBAL_OVF_STATUS);
> 	int cpu;
> 
> 	if (!overflow)
> 		return 0; /* not our interrupt to handle */
> 
> 	uncore_barrier(); /* wait so our sibling will also observe the overflow */
> 
> 	cpu = smp_processor_id();
> 	if (cpu != cpumask_first(topology_thread_cpumask(cpu)))
> 		return 1; /* our sibling will handle it, eat the NMI */
> 
> 	/* OK, we've got an overflow and we're the first CPU in the thread mask */
> 
> 	... do fancy stuff ...
> 
> 	return 1; /* we handled it, eat the NMI */
> }

That would of course need to also grow some smarts to detect if there is
only 1 sibling online.

CC'ed Cyrill as P4 might have something similar.


* Re: [RFC PATCH 2/3 v3] perf: Implement Nehalem uncore pmu
  2010-12-10 10:52       ` Peter Zijlstra
@ 2010-12-10 15:11         ` Cyrill Gorcunov
  0 siblings, 0 replies; 26+ messages in thread
From: Cyrill Gorcunov @ 2010-12-10 15:11 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Stephane Eranian, Lin Ming, Andi Kleen, Ingo Molnar,
	Frederic Weisbecker, Arjan van de Ven, lkml

On Fri, Dec 10, 2010 at 11:52:15AM +0100, Peter Zijlstra wrote:
> On Fri, 2010-12-10 at 11:47 +0100, Peter Zijlstra wrote:
> > On Fri, 2010-12-10 at 00:46 +0100, Stephane Eranian wrote:
> > > Hi,
> > > 
> > > So I have tested this patch a bit on WSM and as I expected there
> > > are issues with sampling.
> > > 
> > > When HT is on, both siblings CPUs get the interrupt. The HW does not
> > > allow you to only point interrupts to a single HT thread (CPU).
> > 
> > Egads, how ugly :/
> >

crap...
...
 
> 
> That would of course need to also grow some smarts to detect if there is
> only 1 sibling online.
> 
> CC'ed Cyrill as P4 might have something similar.
> 

Yes, Don was testing p4 stuff for me and has reported this issue as well.
I'll check what I can do. Thanks for CC'ing me.

  Cyrill


* Re: [RFC PATCH 2/3 v3] perf: Implement Nehalem uncore pmu
  2010-12-10 10:47     ` Peter Zijlstra
  2010-12-10 10:52       ` Peter Zijlstra
@ 2010-12-11  5:49       ` Stephane Eranian
  2010-12-13  8:27         ` Lin Ming
  1 sibling, 1 reply; 26+ messages in thread
From: Stephane Eranian @ 2010-12-11  5:49 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Lin Ming, Andi Kleen, Ingo Molnar, Frederic Weisbecker,
	Arjan van de Ven, lkml

Hi,

Ok, so I have an explanation for what we are seeing. In fact, what
bothered me in all
of this is that I did not recall ever running into this problem of
double-interrupt with HT
when I implemented the perfmon support for uncore sampling. The reason
is in fact
real simple.

You are getting interrupts on both threads because you have enabled uncore PMU
on all CPUs, in uncore_cpu_starting():
+       rdmsrl(MSR_IA32_DEBUGCTLMSR, val);
+       wrmsrl(MSR_IA32_DEBUGCTLMSR, val | DEBUGCTLMSR_ENABLE_UNCORE_PMI);

You need to do that only on ONE of the two siblings. In fact, you want
that only ONCE
per socket. You can do this on the first CPU to  use the uncore to add
something a bit
more dynamic and to make sure you have some control over where the
overhead is applied.
Once you do that, only one CPU/socket will get the interrupt and all
will be fine.
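
A minimal sketch of that change, assuming the DEBUGCTL write is moved out of
uncore_pmu_cpu_starting() and into uncore_pmu_add(), so it happens only on the
CPU that first uses the socket's uncore (nmi_core and val are locals; this is an
assumption about the shape of the fix, not the final patch):

	if (uncore->n_events == 1) {
		nmi_core = topology_core_id(raw_smp_processor_id());
		uncore->nmi_core = nmi_core;

		/* enable the uncore PMI on this logical CPU only */
		rdmsrl(MSR_IA32_DEBUGCTLMSR, val);
		wrmsrl(MSR_IA32_DEBUGCTLMSR, val | DEBUGCTLMSR_ENABLE_UNCORE_PMI);

		uncore_pmu_enable_all(nmi_core);
	}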



On Fri, Dec 10, 2010 at 11:47 AM, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> On Fri, 2010-12-10 at 00:46 +0100, Stephane Eranian wrote:
>> Hi,
>>
>> So I have tested this patch a bit on WSM and as I expected there
>> are issues with sampling.
>>
>> When HT is on, both siblings CPUs get the interrupt. The HW does not
>> allow you to only point interrupts to a single HT thread (CPU).
>
> Egads, how ugly :/
>
>> I did verify that indeed both threads get the interrupt and that you have a
>> race condition. Both sibling CPUs stop uncore, get the status. They may get
>> the same overflow status. Both will pass the uncore->active_mask because
>> it's shared among siblings cores. Thus,  you have a race for the whole
>> interrupt handler execution.
>>
>> You need some serialization in there. But the patch does not address this.
>> The problem is different from the back-to-back interrupt issue that
>> Don worked on.
>> The per-cpu marked/handled trick cannot work to avoid this problem.
>>
>> You cannot simply say "the lowest indexed" CPU of a sibling pair
>> handles the interrupt
>> because you don't know if this in an uncore intr, core interrupt or
>> something else. You
>> need to check. That means each HT thread needs to check uncore
>> ovfl_status. IF the
>> status is zero, then return. Otherwise, you need to do a 2nd level
>> check before you can
>> execute the handler. You need to know if the sibling CPU has already
>> "consumed" that
>> interrupt.
>>
>> I think you need some sort of generation counter per physical core and
>> per HT thread.
>> On interrupt, you could do something along the line of:
>>       if (mycpu->intr_count == mysibling->intr_count) {
>>           then mycpu->intr_count++
>>           execute intr_handler()
>>       } else {
>>           mycpu->intr_count++
>>           return;
>>       }
>> Of course, the above needs some atomicity and ad locking
>
> Does that guarantee that the same sibling handles all interrupts? Since
> a lot of the infrastructure uses local*_t we're not good with cross-cpu
> stuff.
>
> Damn what a mess.. we need to serialize enough for both cpus to at least
> see the overflow bit.. maybe something like:
>
>
> struct intel_percore {
>   ...
>   atomic_t uncore_barrier;
> };
>
> void uncore_barrier(void)
> {
>        struct intel_percore *percore = this_cpu_ptr(cpu_hw_events)->percore;
>        int armed;
>
>        armed = atomic_cmpxchg(&percore->uncore_barrier, 0, 1) == 0;
>        if (armed) {
>                /* we armed, it, now wait for completion */
>                while (atomic_read(&percore->uncore_barrier))
>                        cpu_relax();
>        } else {
>                /* our sibling must have, decrement it */
>                if (atomic_cmpxchg(&percore->uncore_barrier, 1, 0) != 1)
>                        BUG();
>        }
> }
>
> Then have something like:
>
> handle_uncore_interrupt()
> {
>        u64 overflow = rdmsrl(MSR_UNCORE_PERF_GLOBAL_OVF_STATUS);
>        int cpu;
>
>        if (!overflow)
>                return 0; /* not our interrupt to handle */
>
>        uncore_barrier(); /* wait so our sibling will also observe the overflow */
>
>        cpu = smp_processor_id();
>        if (cpu != cpumask_first(topology_thread_cpumask(cpu)))
>                return 1; /* our sibling will handle it, eat the NMI */
>
>        /* OK, we've got an overflow and we're the first CPU in the thread mask */
>
>        ... do fancy stuff ...
>
>        return 1; /* we handled it, eat the NMI */
> }
>
>
>> (but I don't think you can use locks in NMI context).
>
> You can, as long as they're never used from !NMI, its icky, but it
> works.
>
>> This makes me wonder if vectoring uncore to NMI is really needed,
>> given you cannot
>> correlated to an IP, incl. a kernel IP. If we were to vector to a
>> dedicated (lower prio)
>> vector, then we could use the trick of saying the lowest indexed CPU in a pair
>> handles the interrupt (because we would already know this is an uncore
>> interrupt).
>> This would be much simpler. Price: not samples in kernel's critical
>> section. But those
>> are useless anyway with uncore events.
>
> But the uncore uses the same PMI line, right? You cannot point the
> uncore to another vector. /me goes find the docs -- ok, its too early in
> the morning to clearly parse that...
>
> Besides, people asked for the sampling thing didn't they (also we need
> it to fold the count to avoid overflow on 48bit). Also the PAPI people
> even want per-task uncore counters because that's the only thing PAPI
> can do.
>


* Re: [RFC PATCH 2/3 v3] perf: Implement Nehalem uncore pmu
  2010-12-11  5:49       ` Stephane Eranian
@ 2010-12-13  8:27         ` Lin Ming
  2010-12-13 16:42           ` Stephane Eranian
  0 siblings, 1 reply; 26+ messages in thread
From: Lin Ming @ 2010-12-13  8:27 UTC (permalink / raw)
  To: Stephane Eranian
  Cc: Peter Zijlstra, Andi Kleen, Ingo Molnar, Frederic Weisbecker,
	Arjan van de Ven, lkml

On Sat, 2010-12-11 at 13:49 +0800, Stephane Eranian wrote:
> Hi,
> 
> Ok, so I have an explanation for what we are seeing. In fact, what
> bothered me in all of this is that I did not recall ever running into
> this problem of double interrupts with HT when I implemented the perfmon
> support for uncore sampling. The reason is in fact really simple.
> 
> You are getting interrupts on both threads because you have enabled uncore PMU
> on all CPUs, in uncore_cpu_starting():
> +       rdmsrl(MSR_IA32_DEBUGCTLMSR, val);
> +       wrmsrl(MSR_IA32_DEBUGCTLMSR, val | DEBUGCTLMSR_ENABLE_UNCORE_PMI);

Yeah! Thanks for the catch.

> 
> You need to do that only on ONE of the two siblings. In fact, you want
> that only ONCE per socket. You can do this on the first CPU to use the
> uncore, to add something a bit more dynamic and to make sure you have
> some control over where the overhead is applied. Once you do that, only
> one CPU per socket will get the interrupt and all will be fine.

I'll update the patches.

Thanks,
Lin Ming

> 
> 
> 
> On Fri, Dec 10, 2010 at 11:47 AM, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> > On Fri, 2010-12-10 at 00:46 +0100, Stephane Eranian wrote:
> >> Hi,
> >>
> >> So I have tested this patch a bit on WSM and as I expected there
> >> are issues with sampling.
> >>
> >> When HT is on, both siblings CPUs get the interrupt. The HW does not
> >> allow you to only point interrupts to a single HT thread (CPU).
> >
> > Egads, how ugly :/
> >
> >> I did verify that indeed both threads get the interrupt and that you have a
> >> race condition. Both sibling CPUs stop uncore, get the status. They may get
> >> the same overflow status. Both will pass the uncore->active_mask because
> >> it's shared among siblings cores. Thus,  you have a race for the whole
> >> interrupt handler execution.
> >>
> >> You need some serialization in there. But the patch does not address this.
> >> The problem is different from the back-to-back interrupt issue that
> >> Don worked on.
> >> The per-cpu marked/handled trick cannot work to avoid this problem.
> >>
> >> You cannot simply say "the lowest indexed" CPU of a sibling pair
> >> handles the interrupt
> >> because you don't know if this in an uncore intr, core interrupt or
> >> something else. You
> >> need to check. That means each HT thread needs to check uncore
> >> ovfl_status. IF the
> >> status is zero, then return. Otherwise, you need to do a 2nd level
> >> check before you can
> >> execute the handler. You need to know if the sibling CPU has already
> >> "consumed" that
> >> interrupt.
> >>
> >> I think you need some sort of generation counter per physical core and
> >> per HT thread.
> >> On interrupt, you could do something along the line of:
> >>       if (mycpu->intr_count == mysibling->intr_count) {
> >>           then mycpu->intr_count++
> >>           execute intr_handler()
> >>       } else {
> >>           mycpu->intr_count++
> >>           return;
> >>       }
> >> Of course, the above needs some atomicity and ad locking
> >
> > Does that guarantee that the same sibling handles all interrupts? Since
> > a lot of the infrastructure uses local*_t we're not good with cross-cpu
> > stuff.
> >
> > Damn what a mess.. we need to serialize enough for both cpus to at least
> > see the overflow bit.. maybe something like:
> >
> >
> > struct intel_percore {
> >   ...
> >   atomic_t uncore_barrier;
> > };
> >
> > void uncore_barrier(void)
> > {
> >        struct intel_percore *percore = this_cpu_ptr(cpu_hw_events)->percore;
> >        int armed;
> >
> >        armed = atomic_cmpxchg(&percore->uncore_barrier, 0, 1) == 0;
> >        if (armed) {
> >                /* we armed, it, now wait for completion */
> >                while (atomic_read(&percore->uncore_barrier))
> >                        cpu_relax();
> >        } else {
> >                /* our sibling must have, decrement it */
> >                if (atomic_cmpxchg(&percore->uncore_barrier, 1, 0) != 1)
> >                        BUG();
> >        }
> > }
> >
> > Then have something like:
> >
> > handle_uncore_interrupt()
> > {
> >        u64 overflow = rdmsrl(MSR_UNCORE_PERF_GLOBAL_OVF_STATUS);
> >        int cpu;
> >
> >        if (!overflow)
> >                return 0; /* not our interrupt to handle */
> >
> >        uncore_barrier(); /* wait so our sibling will also observe the overflow */
> >
> >        cpu = smp_processor_id();
> >        if (cpu != cpumask_first(topology_thread_cpumask(cpu)))
> >                return 1; /* our sibling will handle it, eat the NMI */
> >
> >        /* OK, we've got an overflow and we're the first CPU in the thread mask */
> >
> >        ... do fancy stuff ...
> >
> >        return 1; /* we handled it, eat the NMI */
> > }
> >
> >
> >> (but I don't think you can use locks in NMI context).
> >
> > You can, as long as they're never used from !NMI, its icky, but it
> > works.
> >
> >> This makes me wonder if vectoring uncore to NMI is really needed,
> >> given you cannot
> >> correlated to an IP, incl. a kernel IP. If we were to vector to a
> >> dedicated (lower prio)
> >> vector, then we could use the trick of saying the lowest indexed CPU in a pair
> >> handles the interrupt (because we would already know this is an uncore
> >> interrupt).
> >> This would be much simpler. Price: not samples in kernel's critical
> >> section. But those
> >> are useless anyway with uncore events.
> >
> > But the uncore uses the same PMI line, right? You cannot point the
> > uncore to another vector. /me goes find the docs -- ok, its too early in
> > the morning to clearly parse that...
> >
> > Besides, people asked for the sampling thing didn't they (also we need
> > it to fold the count to avoid overflow on 48bit). Also the PAPI people
> > even want per-task uncore counters because that's the only thing PAPI
> > can do.
> >



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC PATCH 2/3 v3] perf: Implement Nehalem uncore pmu
  2010-12-13  8:27         ` Lin Ming
@ 2010-12-13 16:42           ` Stephane Eranian
  2010-12-13 16:51             ` Andi Kleen
  0 siblings, 1 reply; 26+ messages in thread
From: Stephane Eranian @ 2010-12-13 16:42 UTC (permalink / raw)
  To: Lin Ming
  Cc: Peter Zijlstra, Andi Kleen, Ingo Molnar, Frederic Weisbecker,
	Arjan van de Ven, lkml

On Mon, Dec 13, 2010 at 9:27 AM, Lin Ming <ming.m.lin@intel.com> wrote:
> On Sat, 2010-12-11 at 13:49 +0800, Stephane Eranian wrote:
>> Hi,
>>
>> Ok, so I have an explanation for what we are seeing. In fact, what
>> bothered me in all
>> of this is that I did not recall ever running into this problem of
>> double-interrupt with HT
>> when I implemented the perfmon support for uncore sampling. The reason
>> is in fact
>> real simple.
>>
>> You are getting interrupts on both threads because you have enabled uncore PMU
>> on all CPUs, in uncore_cpu_starting():
>> +       rdmsrl(MSR_IA32_DEBUGCTLMSR, val);
>> +       wrmsrl(MSR_IA32_DEBUGCTLMSR, val | DEBUGCTLMSR_ENABLE_UNCORE_PMI);
>
> Yeah! Thanks for the catch.
>
I think there you want something like:

if (cpu == cpumask_first(topology_core_siblings(cpu))) {
	rdmsrl(MSR_IA32_DEBUGCTLMSR, val);
	wrmsrl(MSR_IA32_DEBUGCTLMSR, val | DEBUGCTLMSR_ENABLE_UNCORE_PMI);
}

I have verified that with something like that in place, you get only one
interrupt. But it seems some counter initialization may be missing
somewhere, because when running perf stat I got bogus counts back from the
uncore fixed counter.

>>
>> You need to do that only on ONE of the two siblings. In fact, you want
>> that only ONCE
>> per socket. You can do this on the first CPU to  use the uncore to add
>> something a bit
>> more dynamic and to make sure you have some control over where the
>> overhead is applied.
>> Once you do that, only one CPU/socket will get the interrupt and all
>> will be fine.
>
> I'll update the patches.
>
> Thanks,
> Lin Ming
>
>>
>>
>>
>> On Fri, Dec 10, 2010 at 11:47 AM, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
>> > On Fri, 2010-12-10 at 00:46 +0100, Stephane Eranian wrote:
>> >> Hi,
>> >>
>> >> So I have tested this patch a bit on WSM and as I expected there
>> >> are issues with sampling.
>> >>
>> >> When HT is on, both siblings CPUs get the interrupt. The HW does not
>> >> allow you to only point interrupts to a single HT thread (CPU).
>> >
>> > Egads, how ugly :/
>> >
>> >> I did verify that indeed both threads get the interrupt and that you have a
>> >> race condition. Both sibling CPUs stop uncore, get the status. They may get
>> >> the same overflow status. Both will pass the uncore->active_mask because
>> >> it's shared among siblings cores. Thus,  you have a race for the whole
>> >> interrupt handler execution.
>> >>
>> >> You need some serialization in there. But the patch does not address this.
>> >> The problem is different from the back-to-back interrupt issue that
>> >> Don worked on.
>> >> The per-cpu marked/handled trick cannot work to avoid this problem.
>> >>
>> >> You cannot simply say "the lowest indexed" CPU of a sibling pair
>> >> handles the interrupt
>> >> because you don't know if this in an uncore intr, core interrupt or
>> >> something else. You
>> >> need to check. That means each HT thread needs to check uncore
>> >> ovfl_status. IF the
>> >> status is zero, then return. Otherwise, you need to do a 2nd level
>> >> check before you can
>> >> execute the handler. You need to know if the sibling CPU has already
>> >> "consumed" that
>> >> interrupt.
>> >>
>> >> I think you need some sort of generation counter per physical core and
>> >> per HT thread.
>> >> On interrupt, you could do something along the line of:
>> >>       if (mycpu->intr_count == mysibling->intr_count) {
>> >>           then mycpu->intr_count++
>> >>           execute intr_handler()
>> >>       } else {
>> >>           mycpu->intr_count++
>> >>           return;
>> >>       }
>> >> Of course, the above needs some atomicity and ad locking
>> >
>> > Does that guarantee that the same sibling handles all interrupts? Since
>> > a lot of the infrastructure uses local*_t we're not good with cross-cpu
>> > stuff.
>> >
>> > Damn what a mess.. we need to serialize enough for both cpus to at least
>> > see the overflow bit.. maybe something like:
>> >
>> >
>> > struct intel_percore {
>> >   ...
>> >   atomic_t uncore_barrier;
>> > };
>> >
>> > void uncore_barrier(void)
>> > {
>> >        struct intel_percore *percore = this_cpu_ptr(cpu_hw_events)->percore;
>> >        int armed;
>> >
>> >        armed = atomic_cmpxchg(&percore->uncore_barrier, 0, 1) == 0;
>> >        if (armed) {
>> >                /* we armed, it, now wait for completion */
>> >                while (atomic_read(&percore->uncore_barrier))
>> >                        cpu_relax();
>> >        } else {
>> >                /* our sibling must have, decrement it */
>> >                if (atomic_cmpxchg(&percore->uncore_barrier, 1, 0) != 1)
>> >                        BUG();
>> >        }
>> > }
>> >
>> > Then have something like:
>> >
>> > handle_uncore_interrupt()
>> > {
>> >        u64 overflow = rdmsrl(MSR_UNCORE_PERF_GLOBAL_OVF_STATUS);
>> >        int cpu;
>> >
>> >        if (!overflow)
>> >                return 0; /* not our interrupt to handle */
>> >
>> >        uncore_barrier(); /* wait so our sibling will also observe the overflow */
>> >
>> >        cpu = smp_processor_id();
>> >        if (cpu != cpumask_first(topology_thread_cpumask(cpu)))
>> >                return 1; /* our sibling will handle it, eat the NMI */
>> >
>> >        /* OK, we've got an overflow and we're the first CPU in the thread mask */
>> >
>> >        ... do fancy stuff ...
>> >
>> >        return 1; /* we handled it, eat the NMI */
>> > }
>> >
>> >
>> >> (but I don't think you can use locks in NMI context).
>> >
>> > You can, as long as they're never used from !NMI, its icky, but it
>> > works.
>> >
>> >> This makes me wonder if vectoring uncore to NMI is really needed,
>> >> given you cannot
>> >> correlated to an IP, incl. a kernel IP. If we were to vector to a
>> >> dedicated (lower prio)
>> >> vector, then we could use the trick of saying the lowest indexed CPU in a pair
>> >> handles the interrupt (because we would already know this is an uncore
>> >> interrupt).
>> >> This would be much simpler. Price: not samples in kernel's critical
>> >> section. But those
>> >> are useless anyway with uncore events.
>> >
>> > But the uncore uses the same PMI line, right? You cannot point the
>> > uncore to another vector. /me goes find the docs -- ok, its too early in
>> > the morning to clearly parse that...
>> >
>> > Besides, people asked for the sampling thing didn't they (also we need
>> > it to fold the count to avoid overflow on 48bit). Also the PAPI people
>> > even want per-task uncore counters because that's the only thing PAPI
>> > can do.
>> >
>
>
>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC PATCH 2/3 v3] perf: Implement Nehalem uncore pmu
  2010-12-13 16:42           ` Stephane Eranian
@ 2010-12-13 16:51             ` Andi Kleen
  2010-12-13 19:04               ` Stephane Eranian
  0 siblings, 1 reply; 26+ messages in thread
From: Andi Kleen @ 2010-12-13 16:51 UTC (permalink / raw)
  To: Stephane Eranian
  Cc: Lin Ming, Peter Zijlstra, Andi Kleen, Ingo Molnar,
	Frederic Weisbecker, Arjan van de Ven, lkml

> I think there you want something like:
> 
> if (cpu == cpumask_first(topology_core_siblings(cpu))) {
> 	rdmsrl(MSR_IA32_DEBUGCTLMSR, val);
> 	wrmsrl(MSR_IA32_DEBUGCTLMSR, val | DEBUGCTLMSR_ENABLE_UNCORE_PMI);
> }

You have to be careful with hotplug. The code would need to be rerun
on hotplug. Otherwise the interrupt may end up on a downed CPU.
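
Roughly, something along these lines would be needed (untested sketch using
the old cpu notifier API; the helper names and the notifier wiring are my
assumptions, not code from the patch, and it skips tracking whether the
dying CPU actually is the current PMI target):

/* needs <linux/cpu.h>, <linux/smp.h>, <asm/msr.h> */
static void uncore_arm_pmi(void *unused)
{
	u64 val;

	rdmsrl(MSR_IA32_DEBUGCTLMSR, val);
	wrmsrl(MSR_IA32_DEBUGCTLMSR, val | DEBUGCTLMSR_ENABLE_UNCORE_PMI);
}

static int __cpuinit uncore_cpu_notify(struct notifier_block *nb,
				       unsigned long action, void *hcpu)
{
	int cpu = (long)hcpu;
	int target;

	if ((action & ~CPU_TASKS_FROZEN) != CPU_DOWN_PREPARE)
		return NOTIFY_OK;

	/* pick any other online CPU in the same package as the new target */
	target = cpumask_any_but(topology_core_cpumask(cpu), cpu);
	if (target < nr_cpu_ids)
		smp_call_function_single(target, uncore_arm_pmi, NULL, 1);

	/* the target bits in UNC_GLOBAL_CTRL would need the same treatment */
	return NOTIFY_OK;
}

This would be registered once with register_cpu_notifier() at init time.
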
-Andi

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC PATCH 2/3 v3] perf: Implement Nehalem uncore pmu
  2010-12-13 16:51             ` Andi Kleen
@ 2010-12-13 19:04               ` Stephane Eranian
  0 siblings, 0 replies; 26+ messages in thread
From: Stephane Eranian @ 2010-12-13 19:04 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Lin Ming, Peter Zijlstra, Ingo Molnar, Frederic Weisbecker,
	Arjan van de Ven, lkml

On Mon, Dec 13, 2010 at 5:51 PM, Andi Kleen <andi@firstfloor.org> wrote:
>> I think there you want something like:
>>
>> if (cpu == cpumask_first(topology_core_siblings(cpu))) {
>> 	rdmsrl(MSR_IA32_DEBUGCTLMSR, val);
>> 	wrmsrl(MSR_IA32_DEBUGCTLMSR, val | DEBUGCTLMSR_ENABLE_UNCORE_PMI);
>> }
>
> You have to be careful with hotplug. The code would need to be rerun
> on hotplug. Otherwise the interrupt may end up on a downed CPU.

That is true.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC PATCH 2/3 v3] perf: Implement Nehalem uncore pmu
  2010-12-02  5:20 [RFC PATCH 2/3 v3] perf: Implement Nehalem uncore pmu Lin Ming
                   ` (3 preceding siblings ...)
  2010-12-09 19:24 ` Peter Zijlstra
@ 2011-01-13 17:14 ` Stephane Eranian
  2011-01-17  1:29   ` Lin Ming
  4 siblings, 1 reply; 26+ messages in thread
From: Stephane Eranian @ 2011-01-13 17:14 UTC (permalink / raw)
  To: Lin Ming
  Cc: Peter Zijlstra, Andi Kleen, Ingo Molnar, Frederic Weisbecker,
	Arjan van de Ven, lkml

Lin,

On Thu, Dec 2, 2010 at 6:20 AM, Lin Ming <ming.m.lin@intel.com> wrote:
> +static void uncore_pmu_enable_all(int nmi_core)
> +{
> +       u64 ctrl;
> +
> +       ctrl = ((1 << UNCORE_NUM_GENERAL_COUNTERS) - 1) | MSR_UNCORE_PERF_GLOBAL_CTRL_EN_FC0;
> +
> +       /* Route all interrupts to the first core that accesses uncore */
> +       ctrl |= 1ULL << (48 + nmi_core);
> +
> +       wrmsrl(MSR_UNCORE_PERF_GLOBAL_CTRL, ctrl);
> +}

Are you sure nmi_core is always between 0-3 on a 4-core system and 0-5
on a 6-core system?
In other words, is that what topology_core_id(raw_smp_processor_id()) returns?

> Note that, unfortunately, I have not seen documentation saying that on a
> 6-core system UNC_GLOBAL_CTRL has 6 interrupt target bits, but it would
> make sense.


Otherwise, you will get a kernel panic when you wrmsr UNC_GLOBAL_CTRL.
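
Purely as an illustration (a sketch, not what the patch does), programming
the register via wrmsr_safe() would turn that #GP into an error return
instead of a panic; the constants are the ones the patch already defines:

static int uncore_pmu_enable_all_checked(int nmi_core)
{
	u64 ctrl = ((1 << UNCORE_NUM_GENERAL_COUNTERS) - 1) |
		   MSR_UNCORE_PERF_GLOBAL_CTRL_EN_FC0;

	ctrl |= 1ULL << (48 + nmi_core);

	/* returns non-zero instead of faulting if a reserved bit is set */
	return wrmsr_safe(MSR_UNCORE_PERF_GLOBAL_CTRL,
			  (u32)ctrl, (u32)(ctrl >> 32));
}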

> +
> +       if (uncore->n_events == 1) {
> +               nmi_core = topology_core_id(raw_smp_processor_id());
> +               uncore->nmi_core = nmi_core;
> +               uncore_pmu_enable_all(nmi_core);
> +       }

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC PATCH 2/3 v3] perf: Implement Nehalem uncore pmu
  2011-01-13 17:14 ` Stephane Eranian
@ 2011-01-17  1:29   ` Lin Ming
  2011-01-17  8:44     ` Stephane Eranian
  0 siblings, 1 reply; 26+ messages in thread
From: Lin Ming @ 2011-01-17  1:29 UTC (permalink / raw)
  To: Stephane Eranian
  Cc: Peter Zijlstra, Andi Kleen, Ingo Molnar, Frederic Weisbecker,
	Arjan van de Ven, lkml

On Fri, 2011-01-14 at 01:14 +0800, Stephane Eranian wrote:
> Lin,

Hi, Stephane, 

Sorry for the late response, I'm just back from vacation.

> 
> On Thu, Dec 2, 2010 at 6:20 AM, Lin Ming <ming.m.lin@intel.com> wrote:
> > +static void uncore_pmu_enable_all(int nmi_core)
> > +{
> > +       u64 ctrl;
> > +
> > +       ctrl = ((1 << UNCORE_NUM_GENERAL_COUNTERS) - 1) | MSR_UNCORE_PERF_GLOBAL_CTRL_EN_FC0;
> > +
> > +       /* Route all interrupts to the first core that accesses uncore */
> > +       ctrl |= 1ULL << (48 + nmi_core);
> > +
> > +       wrmsrl(MSR_UNCORE_PERF_GLOBAL_CTRL, ctrl);
> > +}
> 
> Are you sure nmi_core is always between 0-3 on a 4-core system and 0-5
> on a 6-core system?
> In other words, is that what topology_core_id(raw_smp_processor_id()) returns?

I just had a look at a 6-core system; the core ids are not 0-5:

$ cat /proc/cpuinfo |grep "core id"
core id		: 0
core id		: 1
core id		: 2
core id		: 8
core id		: 9
core id		: 10

So we'd better route all the interrupts to the first core of the socket.

Thanks for the catch.
Lin Ming

> 
> Note that, unfortunately, I have not seen documentation that says on
> 6-core system
> UNC_GLOBAL_CTRL has 6 interrupt target bits, but it would make sense.
> 
> 
> Otherwise, you will get a kernel panic when you wrmsr UNC_GLOBAL_CTRL.
> 
> > +
> > +       if (uncore->n_events == 1) {
> > +               nmi_core = topology_core_id(raw_smp_processor_id());
> > +               uncore->nmi_core = nmi_core;
> > +               uncore_pmu_enable_all(nmi_core);
> > +       }



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC PATCH 2/3 v3] perf: Implement Nehalem uncore pmu
  2011-01-17  1:29   ` Lin Ming
@ 2011-01-17  8:44     ` Stephane Eranian
  2011-01-17 10:51       ` Lin Ming
  0 siblings, 1 reply; 26+ messages in thread
From: Stephane Eranian @ 2011-01-17  8:44 UTC (permalink / raw)
  To: Lin Ming
  Cc: Peter Zijlstra, Andi Kleen, Ingo Molnar, Frederic Weisbecker,
	Arjan van de Ven, lkml

On Mon, Jan 17, 2011 at 2:29 AM, Lin Ming <ming.m.lin@intel.com> wrote:
> On Fri, 2011-01-14 at 01:14 +0800, Stephane Eranian wrote:
>> Lin,
>
> Hi, Stephane,
>
> Sorry for late response, I'm just back from vacation.
>
>>
>> On Thu, Dec 2, 2010 at 6:20 AM, Lin Ming <ming.m.lin@intel.com> wrote:
>> > +static void uncore_pmu_enable_all(int nmi_core)
>> > +{
>> > +       u64 ctrl;
>> > +
>> > +       ctrl = ((1 << UNCORE_NUM_GENERAL_COUNTERS) - 1) | MSR_UNCORE_PERF_GLOBAL_CTRL_EN_FC0;
>> > +
>> > +       /* Route all interrupts to the first core that accesses uncore */
>> > +       ctrl |= 1ULL << (48 + nmi_core);
>> > +
>> > +       wrmsrl(MSR_UNCORE_PERF_GLOBAL_CTRL, ctrl);
>> > +}
>>
>> Are you sure nmi_core is always between 0-3 on a 4-core system and 0-5
>> on a 6-core system?
>> In other words, is that what topology_core_id(raw_smp_processor_id()) returns?
>
> I just have a look at a 6-core system, the core id is not 0-5
>
> $ cat /proc/cpuinfo |grep "core id"
> core id         : 0
> core id         : 1
> core id         : 2
> core id         : 8
> core id         : 9
> core id         : 10
>
> So we'd better route all the interrupts to the first core of the socket.
>
I recently realized the issue with 0,1,2,8,9,10. At the time I wrote the
perfmon support for uncore, those systems did not exist. Sparse APIC ids
are a major pain for uncore PMU interrupt routing, given the way
UNC_GLOBAL_CTRL works.

Unfortunately, routing to core 0 (core_cpu_id=0) won't be enough in the
presence of CPU hotplug. Imagine I disable the first three cores. Now the
CPUs you have to play with are 8,9,10. You need to remap them to a number
between 0 and 5.
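
Untested sketch of one possible remap (the helper is made up, and the
assumption that the interrupt target bits follow the package's boot-time
core enumeration is mine -- I have not seen it documented): record, at
init time and before any hotplug, each core's position within its package
in core-id order, and use that as nmi_core.

static int uncore_core_slot[NR_CPUS];	/* dense 0..n-1 slot, per cpu */

static void uncore_init_core_slots(void)
{
	int cpu;

	for_each_online_cpu(cpu) {
		int i, slot = 0;

		for_each_cpu(i, topology_core_cpumask(cpu)) {
			/* count each core of the package only once */
			if (cpumask_first(topology_thread_cpumask(i)) != i)
				continue;
			if (topology_core_id(i) < topology_core_id(cpu))
				slot++;
		}
		uncore_core_slot[cpu] = slot;
	}
}

With that, 0,1,2,8,9,10 map to 0-5 no matter which cores are later taken
offline, because the table is frozen at init time.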


> Thanks for the catch.
> Lin Ming
>
>>
>> Note that, unfortunately, I have not seen documentation that says on
>> 6-core system
>> UNC_GLOBAL_CTRL has 6 interrupt target bits, but it would make sense.
>>
>>
>> Otherwise, you will get a kernel panic when you wrmsr UNC_GLOBAL_CTRL.
>>
>> > +
>> > +       if (uncore->n_events == 1) {
>> > +               nmi_core = topology_core_id(raw_smp_processor_id());
>> > +               uncore->nmi_core = nmi_core;
>> > +               uncore_pmu_enable_all(nmi_core);
>> > +       }
>
>
>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC PATCH 2/3 v3] perf: Implement Nehalem uncore pmu
  2011-01-17  8:44     ` Stephane Eranian
@ 2011-01-17 10:51       ` Lin Ming
  2011-01-17 10:56         ` Stephane Eranian
  0 siblings, 1 reply; 26+ messages in thread
From: Lin Ming @ 2011-01-17 10:51 UTC (permalink / raw)
  To: Stephane Eranian
  Cc: Peter Zijlstra, Andi Kleen, Ingo Molnar, Frederic Weisbecker,
	Arjan van de Ven, lkml

On Mon, 2011-01-17 at 16:44 +0800, Stephane Eranian wrote:
> On Mon, Jan 17, 2011 at 2:29 AM, Lin Ming <ming.m.lin@intel.com> wrote:
> > On Fri, 2011-01-14 at 01:14 +0800, Stephane Eranian wrote:
> >> Lin,
> >
> > Hi, Stephane,
> >
> > Sorry for late response, I'm just back from vacation.
> >
> >>
> >> On Thu, Dec 2, 2010 at 6:20 AM, Lin Ming <ming.m.lin@intel.com> wrote:
> >> > +static void uncore_pmu_enable_all(int nmi_core)
> >> > +{
> >> > +       u64 ctrl;
> >> > +
> >> > +       ctrl = ((1 << UNCORE_NUM_GENERAL_COUNTERS) - 1) | MSR_UNCORE_PERF_GLOBAL_CTRL_EN_FC0;
> >> > +
> >> > +       /* Route all interrupts to the first core that accesses uncore */
> >> > +       ctrl |= 1ULL << (48 + nmi_core);
> >> > +
> >> > +       wrmsrl(MSR_UNCORE_PERF_GLOBAL_CTRL, ctrl);
> >> > +}
> >>
> >> Are you sure nmi_core is always between 0-3 on a 4-core system and 0-5
> >> on a 6-core system?
> >> In other words, is that what topology_core_id(raw_smp_processor_id()) returns?
> >
> > I just have a look at a 6-core system, the core id is not 0-5
> >
> > $ cat /proc/cpuinfo |grep "core id"
> > core id         : 0
> > core id         : 1
> > core id         : 2
> > core id         : 8
> > core id         : 9
> > core id         : 10
> >
> > So we'd better route all the interrupts to the first core of the socket.
> >
> I recently realized the issue with 0,1,2,8,9,10. At the time I wrote the perfmon
> support for uncore, those systems did not exist. Sparse APIC id is a major pain
> for uncore PMU interrupt routine given the way UNC_GLOBAL_CTRL works.
> 
> Unfortunately, routing to core 0 (core_cpu_id=0) won't be enough in the
> presence of HOTPLUG CPU. Imagine I disable the first three 3 cores.
> Now you the cpu you have to play with are 8,9,10. You need to remap
> to a number between 0-5.

Good idea.

And I have confirmed that there are 6 interrupt target bits on my 6-core
Westmere machine, although the documentation does not say this:

u64 val = 0x3FULL << 48;
wrmsrl(MSR_UNCORE_PERF_GLOBAL_CTRL, val);

Thanks,
Lin Ming

> 
> 
> > Thanks for the catch.
> > Lin Ming
> >
> >>
> >> Note that, unfortunately, I have not seen documentation that says on
> >> 6-core system
> >> UNC_GLOBAL_CTRL has 6 interrupt target bits, but it would make sense.
> >>
> >>
> >> Otherwise, you will get a kernel panic when you wrmsr UNC_GLOBAL_CTRL.
> >>
> >> > +
> >> > +       if (uncore->n_events == 1) {
> >> > +               nmi_core = topology_core_id(raw_smp_processor_id());
> >> > +               uncore->nmi_core = nmi_core;
> >> > +               uncore_pmu_enable_all(nmi_core);
> >> > +       }
> >
> >
> >



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC PATCH 2/3 v3] perf: Implement Nehalem uncore pmu
  2011-01-17 10:51       ` Lin Ming
@ 2011-01-17 10:56         ` Stephane Eranian
  0 siblings, 0 replies; 26+ messages in thread
From: Stephane Eranian @ 2011-01-17 10:56 UTC (permalink / raw)
  To: Lin Ming
  Cc: Peter Zijlstra, Andi Kleen, Ingo Molnar, Frederic Weisbecker,
	Arjan van de Ven, lkml

On Mon, Jan 17, 2011 at 11:51 AM, Lin Ming <ming.m.lin@intel.com> wrote:
> On Mon, 2011-01-17 at 16:44 +0800, Stephane Eranian wrote:
>> On Mon, Jan 17, 2011 at 2:29 AM, Lin Ming <ming.m.lin@intel.com> wrote:
>> > On Fri, 2011-01-14 at 01:14 +0800, Stephane Eranian wrote:
>> >> Lin,
>> >
>> > Hi, Stephane,
>> >
>> > Sorry for late response, I'm just back from vacation.
>> >
>> >>
>> >> On Thu, Dec 2, 2010 at 6:20 AM, Lin Ming <ming.m.lin@intel.com> wrote:
>> >> > +static void uncore_pmu_enable_all(int nmi_core)
>> >> > +{
>> >> > +       u64 ctrl;
>> >> > +
>> >> > +       ctrl = ((1 << UNCORE_NUM_GENERAL_COUNTERS) - 1) | MSR_UNCORE_PERF_GLOBAL_CTRL_EN_FC0;
>> >> > +
>> >> > +       /* Route all interrupts to the first core that accesses uncore */
>> >> > +       ctrl |= 1ULL << (48 + nmi_core);
>> >> > +
>> >> > +       wrmsrl(MSR_UNCORE_PERF_GLOBAL_CTRL, ctrl);
>> >> > +}
>> >>
>> >> Are you sure nmi_core is always between 0-3 on a 4-core system and 0-5
>> >> on a 6-core system?
>> >> In other words, is that what topology_core_id(raw_smp_processor_id()) returns?
>> >
>> > I just have a look at a 6-core system, the core id is not 0-5
>> >
>> > $ cat /proc/cpuinfo |grep "core id"
>> > core id         : 0
>> > core id         : 1
>> > core id         : 2
>> > core id         : 8
>> > core id         : 9
>> > core id         : 10
>> >
>> > So we'd better route all the interrupts to the first core of the socket.
>> >
>> I recently realized the issue with 0,1,2,8,9,10. At the time I wrote the perfmon
>> support for uncore, those systems did not exist. Sparse APIC id is a major pain
>> for uncore PMU interrupt routine given the way UNC_GLOBAL_CTRL works.
>>
>> Unfortunately, routing to core 0 (core_cpu_id=0) won't be enough in the
>> presence of HOTPLUG CPU. Imagine I disable the first three 3 cores.
>> Now you the cpu you have to play with are 8,9,10. You need to remap
>> to a number between 0-5.
>
> Good idea.
>
> And I have confirmed that there are 6 interrupt target bits on my 6-core
> Westmere machine, although documentation does not say this.
>
> u64 val=0x3FULL << 48;
> wrmsrl(MSR_UNCORE_PERF_GLOBAL_CTRL, val);
>
Yes, at least, this part looks good ;->


> Thanks,
> Lin Ming
>
>>
>>
>> > Thanks for the catch.
>> > Lin Ming
>> >
>> >>
>> >> Note that, unfortunately, I have not seen documentation that says on
>> >> 6-core system
>> >> UNC_GLOBAL_CTRL has 6 interrupt target bits, but it would make sense.
>> >>
>> >>
>> >> Otherwise, you will get a kernel panic when you wrmsr UNC_GLOBAL_CTRL.
>> >>
>> >> > +
>> >> > +       if (uncore->n_events == 1) {
>> >> > +               nmi_core = topology_core_id(raw_smp_processor_id());
>> >> > +               uncore->nmi_core = nmi_core;
>> >> > +               uncore_pmu_enable_all(nmi_core);
>> >> > +       }
>> >
>> >
>> >
>
>
>

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2011-01-17 10:56 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-12-02  5:20 [RFC PATCH 2/3 v3] perf: Implement Nehalem uncore pmu Lin Ming
2010-12-02  5:57 ` Lin Ming
2010-12-07  6:15 ` Lin Ming
2010-12-09 19:17   ` Peter Zijlstra
2010-12-09 19:21 ` Peter Zijlstra
2010-12-09 20:15   ` Stephane Eranian
2010-12-09 20:19     ` Peter Zijlstra
2010-12-09 20:27       ` Stephane Eranian
2010-12-09 23:46   ` Stephane Eranian
2010-12-10  8:31     ` Lin Ming
2010-12-10 10:47     ` Peter Zijlstra
2010-12-10 10:52       ` Peter Zijlstra
2010-12-10 15:11         ` Cyrill Gorcunov
2010-12-11  5:49       ` Stephane Eranian
2010-12-13  8:27         ` Lin Ming
2010-12-13 16:42           ` Stephane Eranian
2010-12-13 16:51             ` Andi Kleen
2010-12-13 19:04               ` Stephane Eranian
2010-12-10  8:28   ` Lin Ming
2010-12-09 19:24 ` Peter Zijlstra
2010-12-10  8:28   ` Lin Ming
2011-01-13 17:14 ` Stephane Eranian
2011-01-17  1:29   ` Lin Ming
2011-01-17  8:44     ` Stephane Eranian
2011-01-17 10:51       ` Lin Ming
2011-01-17 10:56         ` Stephane Eranian
