From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path:
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
	id S933875AbbBITRy (ORCPT ); Mon, 9 Feb 2015 14:17:54 -0500
Received: from mga14.intel.com ([192.55.52.115]:53592 "EHLO mga14.intel.com"
	rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP
	id S933756AbbBITRw (ORCPT ); Mon, 9 Feb 2015 14:17:52 -0500
X-ExtLoop1: 1
X-IronPort-AV: E=Sophos;i="5.09,545,1418112000"; d="scan'208";a="683251227"
From: Andi Kleen
To: peterz@infradead.org
Cc: linux-kernel@vger.kernel.org, kan.liang@intel.com, Andi Kleen
Subject: [PATCH 1/3] perf, x86: Add new cache events table for Haswell
Date: Mon, 9 Feb 2015 11:17:44 -0800
Message-Id: <1423509466-24430-1-git-send-email-andi@firstfloor.org>
X-Mailer: git-send-email 1.9.3
Sender: linux-kernel-owner@vger.kernel.org
List-ID:
X-Mailing-List: linux-kernel@vger.kernel.org

From: Andi Kleen

Haswell offcore events are quite different from Sandy Bridge.
Add a new table to handle Haswell properly.

Note that the offcore bits listed in the SDM are not quite correct
(this is currently being fixed). An up-to-date list of bits is in
the patch.

The basic setup is similar to Sandy Bridge. The prefetch columns have
been removed, as prefetch counting is not very reliable on Haswell.
One L1 event that is no longer in the event list has also been removed.

- data reads do not include code reads (comparable to earlier Sandy
  Bridge tables)
- data counts include speculative execution (except L1 write, dtlb, bpu)
- remote node access includes remote memory, remote cache, and remote mmio
- prefetches are not included in the counts for consistency (different
  from Sandy Bridge, which includes prefetches in the remote node)

The events with additional caveats have references to the specification
update.

v2: Change name of variable.
v3: Based on older Broadwell patch, but now just for Haswell. Various
    fixes to make the events more similar to the older Sandy Bridge
    tables.

Signed-off-by: Andi Kleen
---
 arch/x86/kernel/cpu/perf_event_intel.c | 197 ++++++++++++++++++++++++++++++++-
 1 file changed, 195 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 498b6d9..387f580 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -415,6 +415,199 @@ static __initconst const u64 snb_hw_cache_event_ids
 };
 
+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes both remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts.
+ * The events with additional caveats have references to the specification update.
+ */
+
+#define HSW_DEMAND_DATA_RD		BIT(0)
+#define HSW_DEMAND_RFO			BIT(1)
+#define HSW_PF_L2_RFO			BIT(5)
+#define HSW_PF_L3_RFO			BIT(8)
+#define HSW_ALL_RFO			(HSW_DEMAND_RFO| \
+					 HSW_PF_L2_RFO|HSW_PF_L3_RFO)
+#define HSW_ANY_RESPONSE		BIT(16)
+#define HSW_SUPPLIER_NONE		BIT(17)
+#define HSW_L3_MISS_LOCAL		BIT(22)
+#define HSW_L3_MISS_REMOTE_HOP0		BIT(27)
+#define HSW_L3_MISS_REMOTE_HOP1		BIT(28)
+#define HSW_L3_MISS_REMOTE_HOP2P	BIT(29)
+#define HSW_L3_MISS			(HSW_L3_MISS_LOCAL| \
+					 HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+					 HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_SNOOP_NONE			BIT(31)
+#define HSW_SNOOP_NOT_NEEDED		BIT(32)
+#define HSW_SNOOP_MISS			BIT(33)
+#define HSW_SNOOP_HIT_NO_FWD		BIT(34)
+#define HSW_SNOOP_HIT_WITH_FWD		BIT(35)
+#define HSW_SNOOP_HITM			BIT(36)
+#define HSW_SNOOP_NON_DRAM		BIT(37)
+#define HSW_ANY_SNOOP			(HSW_SNOOP_NONE| \
+					 HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \
+					 HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \
+					 HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM)
+
+static __initconst const u64 hsw_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x81d0,	/* MEM_UOPS_RETIRED.ALL_LOADS, HSM30 */
+		[ C(RESULT_MISS) ] = 0x151,	/* L1D.REPLACEMENT */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x82d0,	/* MEM_UOPS_RETIRED.ALL_STORES, HSM30 */
+		[ C(RESULT_MISS) ] = 0x0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS) ] = 0x0,
+	},
+ },
+ [ C(L1I ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS) ] = 0x280,	/* ICACHE.MISSES */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS) ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS) ] = 0x0,
+	},
+ },
+ [ C(LL ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS) ] = 0x0,
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x81d0,	/* MEM_UOPS_RETIRED.ALL_LOADS, HSM30 */
+		[ C(RESULT_MISS) ] = 0x108,	/* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x82d0,	/* MEM_UOPS_RETIRED.ALL_STORES, HSM30 */
+		[ C(RESULT_MISS) ] = 0x149,	/* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS) ] = 0x0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x6085,	/* ITLB_MISSES.STLB_HIT */
+		[ C(RESULT_MISS) ] = 0x185,	/* ITLB_MISSES.MISS_CAUSES_A_WALK */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS) ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS) ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0xc4,	/* BR_INST_RETIRED.ALL_BRANCHES */
+		[ C(RESULT_MISS) ] = 0xc5,	/* BR_MISP_RETIRED.ALL_BRANCHES */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS) ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS) ] = -1,
+	},
+ },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS) ] = 0x0,
+	},
+ },
+};
+
+static __initconst const u64 hsw_hw_cache_extra_regs
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_DATA_RD|
+				       HSW_ANY_RESPONSE|HSW_ANY_SNOOP|
+				       HSW_SUPPLIER_NONE,
+		[ C(RESULT_MISS) ] = HSW_DEMAND_DATA_RD|
+				       HSW_L3_MISS|HSW_ANY_SNOOP|
+				       HSW_SUPPLIER_NONE,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_RFO|
+				       HSW_ANY_RESPONSE|HSW_ANY_SNOOP|
+				       HSW_SUPPLIER_NONE,
+		[ C(RESULT_MISS) ] = HSW_DEMAND_RFO|HSW_L3_MISS|
+				       HSW_ANY_SNOOP|HSW_SUPPLIER_NONE,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS) ] = 0x0,
+	},
+ },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_DATA_RD|
+				       HSW_L3_MISS_LOCAL|HSW_SUPPLIER_NONE|
+				       HSW_ANY_SNOOP,
+		[ C(RESULT_MISS) ] = HSW_DEMAND_DATA_RD|
+				       HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1|
+				       HSW_L3_MISS_REMOTE_HOP2P|HSW_SUPPLIER_NONE|
+				       HSW_ANY_SNOOP,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_RFO|
+				       HSW_L3_MISS_LOCAL|HSW_SUPPLIER_NONE|
+				       HSW_ANY_SNOOP,
+		[ C(RESULT_MISS) ] = HSW_DEMAND_RFO|
+				       HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1|
+				       HSW_L3_MISS_REMOTE_HOP2P|HSW_SUPPLIER_NONE|
+				       HSW_ANY_SNOOP,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS) ] = 0x0,
+	},
+ },
+};
+
 static __initconst const u64 westmere_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -2546,8 +2739,8 @@ __init int intel_pmu_init(void)
 	case 69: /* 22nm Haswell ULT */
 	case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
 		x86_pmu.late_ack = true;
-		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids));
-		memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+		memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+		memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
 
 		intel_pmu_lbr_init_snb();
-- 
1.9.3
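
[Illustrative usage note, not part of the patch: with the table above in
place, the generic LLC read-access cache event on Haswell resolves to the
OFFCORE_RESPONSE encoding built from the HSW_* bits. A minimal user-space
sketch of requesting that generic event through perf_event_open(2) follows;
the helper name is made up for the example and error handling is minimal.]

/*
 * Open a counter for the generic LL/read/access cache event.
 * The kernel maps it to the hardware event via hsw_hw_cache_event_ids
 * and hsw_hw_cache_extra_regs on Haswell.
 */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>
#include <stdio.h>

static int open_llc_read_access(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HW_CACHE;
	/* cache-id | (op << 8) | (result << 16), per perf_event_open(2) */
	attr.config = PERF_COUNT_HW_CACHE_LL |
		      (PERF_COUNT_HW_CACHE_OP_READ << 8) |
		      (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16);
	attr.exclude_kernel = 1;

	/* Count for the calling thread on any CPU. */
	return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}

int main(void)
{
	uint64_t count;
	int fd = open_llc_read_access();

	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	/* ... workload under measurement would run here ... */
	if (read(fd, &count, sizeof(count)) == (ssize_t)sizeof(count))
		printf("LLC read accesses: %llu\n", (unsigned long long)count);
	close(fd);
	return 0;
}

[The same generic event is what "perf stat -e LLC-loads" requests from the
command line.]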