All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/3] intel_perf_counters: a little tool for dumping performance counters.
@ 2013-03-27  5:06 Kenneth Graunke
  2013-03-27  5:06 ` [PATCH 2/3] intel_perf_counters: Abstract out Ironlake-specific code Kenneth Graunke
  2013-03-27  5:06 ` [PATCH 3/3] intel_perf_counters: Add support for Sandybridge Kenneth Graunke
  0 siblings, 2 replies; 4+ messages in thread
From: Kenneth Graunke @ 2013-03-27  5:06 UTC (permalink / raw)
  To: intel-gfx

From: Eric Anholt <eric@anholt.net>

This reads the GPU's performance counters via MI_REPORT_PERF_COUNT and
prints them in a top-style interface.  While it can be useful in and of
itself, it also documents the performance counters and lets us verify
that they're working.

Currently, it only supports Ironlake.

v2 [Ken]: Rebase on master and fix compilation failures; make it abort
on non-Ironlake platforms to avoid GPU hangs; rename from 'chaps' to
intel_perf_counters since that acronym isn't used any longer; write the
above commit message.
---
 tools/Makefile.am           |   1 +
 tools/intel_perf_counters.c | 175 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 176 insertions(+)
 create mode 100644 tools/intel_perf_counters.c

diff --git a/tools/Makefile.am b/tools/Makefile.am
index bb3328f..e939518 100644
--- a/tools/Makefile.am
+++ b/tools/Makefile.am
@@ -12,6 +12,7 @@ bin_PROGRAMS = 				\
 	intel_gpu_top 			\
 	intel_gpu_time 			\
 	intel_gtt 			\
+	intel_perf_counters		\
 	intel_stepping 			\
 	intel_reg_checker 		\
 	intel_reg_dumper 		\
diff --git a/tools/intel_perf_counters.c b/tools/intel_perf_counters.c
new file mode 100644
index 0000000..53d2ad7
--- /dev/null
+++ b/tools/intel_perf_counters.c
@@ -0,0 +1,175 @@
+/*
+ * Copyright © 2010, 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <eric@anholt.net>
+ */
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <err.h>
+#include <sys/ioctl.h>
+
+#include "drm.h"
+#include "i915_drm.h"
+#include "drmtest.h"
+#include "intel_gpu_tools.h"
+#include "intel_bufmgr.h"
+#include "intel_batchbuffer.h"
+
+#define COUNTER_COUNT 29
+
+const char *counter_name[COUNTER_COUNT] = {
+	"cycles the CS unit is starved",
+	"cycles the CS unit is stalled",
+	"cycles the VF unit is starved",
+	"cycles the VF unit is stalled",
+	"cycles the VS unit is starved",
+	"cycles the VS unit is stalled",
+	"cycles the GS unit is starved",
+	"cycles the GS unit is stalled",
+	"cycles the CL unit is starved",
+	"cycles the CL unit is stalled",
+	"cycles the SF unit is starved",
+	"cycles the SF unit is stalled",
+	"cycles the WZ unit is starved",
+	"cycles the WZ unit is stalled",
+	"Z buffer read/write          ",
+	"cycles each EU was active    ",
+	"cycles each EU was suspended ",
+	"cycles threads loaded all EUs",
+	"cycles filtering active      ",
+	"cycles PS threads executed   ",
+	"subspans written to RC       ",
+	"bytes read for texture reads ",
+	"texels returned from sampler ",
+	"polygons not culled          ",
+	"clocks MASF has valid message",
+	"64b writes/reads from RC     ",
+	"reads on dataport            ",
+	"clocks MASF has valid msg not consumed by sampler",
+	"cycles any EU is stalled for math",
+};
+
+int have_totals = 0;
+uint32_t totals[COUNTER_COUNT];
+uint32_t last_counter[COUNTER_COUNT];
+static drm_intel_bufmgr *bufmgr;
+struct intel_batchbuffer *batch;
+
+/* DW0 */
+#define MI_REPORT_PERF_COUNT ((0x26 << 23) | (3 - 2))
+#define MI_COUNTER_SET_0	(0 << 6)
+#define MI_COUNTER_SET_1	(1 << 6)
+/* DW1 */
+#define MI_COUNTER_ADDRESS_GTT	(1 << 0)
+/* DW2: report ID */
+
+static void
+get_counters(void)
+{
+	int i;
+	drm_intel_bo *stats_bo;
+	uint32_t *stats_result;
+
+	stats_bo = drm_intel_bo_alloc(bufmgr, "stats", 4096, 4096);
+
+	BEGIN_BATCH(6);
+	OUT_BATCH(MI_REPORT_PERF_COUNT | MI_COUNTER_SET_0);
+	OUT_RELOC(stats_bo,
+		  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+		  0);
+	OUT_BATCH(0);
+
+	OUT_BATCH(MI_REPORT_PERF_COUNT | MI_COUNTER_SET_1);
+	OUT_RELOC(stats_bo,
+		  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+		  64);
+	OUT_BATCH(0);
+
+	ADVANCE_BATCH();
+
+	intel_batchbuffer_flush(batch);
+
+	drm_intel_bo_map(stats_bo, 0);
+	stats_result = stats_bo->virtual;
+	/* skip REPORT_ID, TIMESTAMP */
+	stats_result += 3;
+	for (i = 0 ; i < COUNTER_COUNT; i++) {
+		totals[i] += stats_result[i] - last_counter[i];
+		last_counter[i] = stats_result[i];
+	}
+
+	drm_intel_bo_unmap(stats_bo);
+	drm_intel_bo_unreference(stats_bo);
+}
+
+#define STATS_CHECK_FREQUENCY	100
+#define STATS_REPORT_FREQUENCY	2
+
+int
+main(int argc, char **argv)
+{
+	uint32_t devid;
+	int i;
+	char clear_screen[] = {0x1b, '[', 'H',
+			       0x1b, '[', 'J',
+			       0x0};
+	int fd;
+	int l;
+
+	fd = drm_open_any();
+	devid = intel_get_drm_devid(fd);
+
+	bufmgr = drm_intel_bufmgr_gem_init(fd, 4096);
+	drm_intel_bufmgr_gem_enable_reuse(bufmgr);
+	batch = intel_batchbuffer_alloc(bufmgr, devid);
+
+	if (!IS_GEN5(devid)) {
+		printf("This tool is only for Ironlake.\n");
+		abort();
+	}
+
+	for (;;) {
+		for (l = 0; l < STATS_CHECK_FREQUENCY; l++) {
+			printf("%s", clear_screen);
+
+			if (l % (STATS_CHECK_FREQUENCY / STATS_REPORT_FREQUENCY) == 0) {
+				if (have_totals) {
+					for (i = 0; i < COUNTER_COUNT; i++) {
+						printf("%s: %u\n", counter_name[i],
+						       totals[i]);
+						totals[i] = 0;
+					}
+				}
+			}
+
+			get_counters();
+			have_totals = 1;
+
+			usleep(1000000 / STATS_CHECK_FREQUENCY);
+		}
+	}
+
+	return 0;
+}
-- 
1.8.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH 2/3] intel_perf_counters: Abstract out Ironlake-specific code.
  2013-03-27  5:06 [PATCH 1/3] intel_perf_counters: a little tool for dumping performance counters Kenneth Graunke
@ 2013-03-27  5:06 ` Kenneth Graunke
  2013-03-27  5:06 ` [PATCH 3/3] intel_perf_counters: Add support for Sandybridge Kenneth Graunke
  1 sibling, 0 replies; 4+ messages in thread
From: Kenneth Graunke @ 2013-03-27  5:06 UTC (permalink / raw)
  To: intel-gfx

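We want to support this tool on more platforms.  This lays the
groundwork for making that possible: the Ironlake-specific names gain a
gen5 prefix, and main() now selects the counter names, counter count and
get_counters function based on the device, allocating the totals and
last_counter arrays to match.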

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
---
 tools/intel_perf_counters.c | 36 ++++++++++++++++++++++++------------
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/tools/intel_perf_counters.c b/tools/intel_perf_counters.c
index 53d2ad7..fd268b1 100644
--- a/tools/intel_perf_counters.c
+++ b/tools/intel_perf_counters.c
@@ -37,9 +37,9 @@
 #include "intel_bufmgr.h"
 #include "intel_batchbuffer.h"
 
-#define COUNTER_COUNT 29
+#define GEN5_COUNTER_COUNT 29
 
-const char *counter_name[COUNTER_COUNT] = {
+const char *gen5_counter_names[GEN5_COUNTER_COUNT] = {
 	"cycles the CS unit is starved",
 	"cycles the CS unit is stalled",
 	"cycles the VF unit is starved",
@@ -72,13 +72,13 @@ const char *counter_name[COUNTER_COUNT] = {
 };
 
 int have_totals = 0;
-uint32_t totals[COUNTER_COUNT];
-uint32_t last_counter[COUNTER_COUNT];
+uint32_t *totals;
+uint32_t *last_counter;
 static drm_intel_bufmgr *bufmgr;
 struct intel_batchbuffer *batch;
 
 /* DW0 */
-#define MI_REPORT_PERF_COUNT ((0x26 << 23) | (3 - 2))
+#define GEN5_MI_REPORT_PERF_COUNT ((0x26 << 23) | (3 - 2))
 #define MI_COUNTER_SET_0	(0 << 6)
 #define MI_COUNTER_SET_1	(1 << 6)
 /* DW1 */
@@ -86,7 +86,7 @@ struct intel_batchbuffer *batch;
 /* DW2: report ID */
 
 static void
-get_counters(void)
+gen5_get_counters(void)
 {
 	int i;
 	drm_intel_bo *stats_bo;
@@ -95,13 +95,13 @@ get_counters(void)
 	stats_bo = drm_intel_bo_alloc(bufmgr, "stats", 4096, 4096);
 
 	BEGIN_BATCH(6);
-	OUT_BATCH(MI_REPORT_PERF_COUNT | MI_COUNTER_SET_0);
+	OUT_BATCH(GEN5_MI_REPORT_PERF_COUNT | MI_COUNTER_SET_0);
 	OUT_RELOC(stats_bo,
 		  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
 		  0);
 	OUT_BATCH(0);
 
-	OUT_BATCH(MI_REPORT_PERF_COUNT | MI_COUNTER_SET_1);
+	OUT_BATCH(GEN5_MI_REPORT_PERF_COUNT | MI_COUNTER_SET_1);
 	OUT_RELOC(stats_bo,
 		  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
 		  64);
@@ -115,7 +115,7 @@ get_counters(void)
 	stats_result = stats_bo->virtual;
 	/* skip REPORT_ID, TIMESTAMP */
 	stats_result += 3;
-	for (i = 0 ; i < COUNTER_COUNT; i++) {
+	for (i = 0 ; i < GEN5_COUNTER_COUNT; i++) {
 		totals[i] += stats_result[i] - last_counter[i];
 		last_counter[i] = stats_result[i];
 	}
@@ -131,6 +131,9 @@ int
 main(int argc, char **argv)
 {
 	uint32_t devid;
+	int counter_count;
+	const char **counter_name;
+	void (*get_counters)(void);
 	int i;
 	char clear_screen[] = {0x1b, '[', 'H',
 			       0x1b, '[', 'J',
@@ -145,10 +148,16 @@ main(int argc, char **argv)
 	drm_intel_bufmgr_gem_enable_reuse(bufmgr);
 	batch = intel_batchbuffer_alloc(bufmgr, devid);
 
-	if (!IS_GEN5(devid)) {
-		printf("This tool is only for Ironlake.\n");
+	if (IS_GEN5(devid)) {
+		counter_name = gen5_counter_names;
+		counter_count = GEN5_COUNTER_COUNT;
+		get_counters = gen5_get_counters;
+	} else {
+		printf("This tool is not yet supported on your platform.\n");
 		abort();
 	}
+	totals = calloc(counter_count, sizeof(uint32_t));
+	last_counter = calloc(counter_count, sizeof(uint32_t));
 
 	for (;;) {
 		for (l = 0; l < STATS_CHECK_FREQUENCY; l++) {
@@ -156,7 +165,7 @@ main(int argc, char **argv)
 
 			if (l % (STATS_CHECK_FREQUENCY / STATS_REPORT_FREQUENCY) == 0) {
 				if (have_totals) {
-					for (i = 0; i < COUNTER_COUNT; i++) {
+					for (i = 0; i < counter_count; i++) {
 						printf("%s: %u\n", counter_name[i],
 						       totals[i]);
 						totals[i] = 0;
@@ -171,5 +180,8 @@ main(int argc, char **argv)
 		}
 	}
 
+	free(totals);
+	free(last_counter);
+
 	return 0;
 }
-- 
1.8.2

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH 3/3] intel_perf_counters: Add support for Sandybridge.
  2013-03-27  5:06 [PATCH 1/3] intel_perf_counters: a little tool for dumping performance counters Kenneth Graunke
  2013-03-27  5:06 ` [PATCH 2/3] intel_perf_counters: Abstract out Ironlake-specific code Kenneth Graunke
@ 2013-03-27  5:06 ` Kenneth Graunke
  2013-03-27 11:29   ` Daniel Vetter
  1 sibling, 1 reply; 4+ messages in thread
From: Kenneth Graunke @ 2013-03-27  5:06 UTC (permalink / raw)
  To: intel-gfx

While the Sandybridge PRM doesn't have any documentation on the GPU's
performance counters, a lot of information can be gleaned from the older
Ironlake PRM.  Oddly, none of the information documented there actually
appears to apply to Ironlake.  However, it apparently works just great
on Sandybridge.

Since this information has all been publicly available on the internet
for around three years, we can use it.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
---
 tools/intel_perf_counters.c | 146 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 146 insertions(+)

diff --git a/tools/intel_perf_counters.c b/tools/intel_perf_counters.c
index fd268b1..b528361 100644
--- a/tools/intel_perf_counters.c
+++ b/tools/intel_perf_counters.c
@@ -22,9 +22,21 @@
  *
  * Authors:
  *    Eric Anholt <eric@anholt.net>
+ *    Kenneth Graunke <kenneth@whitecape.org>
+ *
+ * While documentation for performance counters is suspiciously missing from the
+ * Sandybridge PRM, they were documented in Volume 1 Part 3 of the Ironlake PRM.
+ *
+ * A lot of the Ironlake PRM actually unintentionally documents Sandybridge
+ * due to mistakes made when updating the documentation for Gen6+.  Many of
+ * these mislabeled sections carried forward to the public documentation.
+ *
+ * The Ironlake PRMs have been publicly available since 2010 and are online at:
+ * https://01.org/linuxgraphics/documentation/2010-intel-core-processor-family
  */
 
 #include <unistd.h>
+#include <stdbool.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <err.h>
@@ -71,6 +83,60 @@ const char *gen5_counter_names[GEN5_COUNTER_COUNT] = {
 	"cycles any EU is stalled for math",
 };
 
+#define GEN6_COUNTER_COUNT 29
+
+/**
+ * Sandybridge: Counter Select = 001
+ * A0   A1   A2   A3   A4   TIMESTAMP RPT_ID
+ * A5   A6   A7   A8   A9   A10  A11  A12
+ * A13  A14  A15  A16  A17  A18  A19  A20
+ * A21  A22  A23  A24  A25  A26  A27  A28
+ */
+const int gen6_counter_format = 1;
+
+/**
+ * Names for aggregating counters A0-A28.
+ *
+ * While the Ironlake PRM clearly documents that there are 29 counters (A0-A28),
+ * it only lists the names for 28 of them; one is missing.  However, careful
+ * examination reveals a pattern: there are five GS counters (Active, Stall,
+ * Core Stall, # threads loaded, and ready but not running time).  There are
+ * also five PS counters, in the same order.  But there are only four VS
+ * counters listed - the number of VS threads loaded is missing.  Presumably,
+ * it exists and is counter 5, and the rest are shifted over one place.
+ */
+const char *gen6_counter_names[GEN6_COUNTER_COUNT] = {
+	[0]  = "Aggregated Core Array Active",
+	[1]  = "Aggregated Core Array Stalled",
+	[2]  = "Vertex Shader Active Time",
+	[3]  = "Vertex Shader Stall Time",
+	[4]  = "Vertex Shader Stall Time - Core Stall",
+	[5]  = "# VS threads loaded",
+	[6]  = "Vertex Shader Ready but not running time",
+	[7]  = "Geometry Shader Active Time",
+	[8]  = "Geometry Shader Stall Time",
+	[9]  = "Geometry Shader Stall Time - Core Stall",
+	[10] = "# GS threads loaded",
+	[11] = "Geometry Shader ready but not running Time",
+	[12] = "Pixel Shader Active Time",
+	[13] = "Pixel Shader Stall Time",
+	[14] = "Pixel Shader Stall Time - Core Stall",
+	[15] = "# PS threads loaded",
+	[16] = "Pixel Shader ready but not running Time",
+	[17] = "Early Z Test Pixels Passing",
+	[18] = "Early Z Test Pixels Failing",
+	[19] = "Early Stencil Test Pixels Passing",
+	[20] = "Early Stencil Test Pixels Failing",
+	[21] = "Pixel Kill Count",
+	[22] = "Alpha Test Pixels Failed",
+	[23] = "Post PS Stencil Pixels Failed",
+	[24] = "Post PS Z buffer Pixels Failed",
+	[25] = "Pixels/samples Written in the frame buffer",
+	[26] = "GPU Busy",
+	[27] = "CL active and not stalled",
+	[28] = "SF active and stalled",
+};
+
 int have_totals = 0;
 uint32_t *totals;
 uint32_t *last_counter;
@@ -85,6 +151,20 @@ struct intel_batchbuffer *batch;
 #define MI_COUNTER_ADDRESS_GTT	(1 << 0)
 /* DW2: report ID */
 
+/**
+ * According to the Sandybridge PRM, Volume 1, Part 1, page 48,
+ * MI_REPORT_PERF_COUNT is now opcode 0x28.  The Ironlake PRM, Volume 1,
+ * Part 3 details how it works.
+ */
+/* DW0 */
+#define GEN6_MI_REPORT_PERF_COUNT (0x28 << 23)
+/* DW1 and 2 are the same as above */
+
+/* OACONTROL exists on Gen6+ but is documented in the Ironlake PRM */
+#define OACONTROL                       0x2360
+# define OACONTROL_COUNTER_SELECT_SHIFT 2
+# define PERFORMANCE_COUNTER_ENABLE     (1 << 0)
+
 static void
 gen5_get_counters(void)
 {
@@ -124,6 +204,45 @@ gen5_get_counters(void)
 	drm_intel_bo_unreference(stats_bo);
 }
 
+static void
+gen6_get_counters(void)
+{
+	int i;
+	drm_intel_bo *stats_bo;
+	uint32_t *stats_result;
+
+	/* Map from counter names to their index in the buffer object */
+	static const int buffer_index[GEN6_COUNTER_COUNT] =
+	{
+		7,   6,  5,  4,  3,
+		15, 14, 13, 12, 11, 10,  9,  8,
+		23, 22, 21, 20, 19, 18, 17, 16,
+		31, 30, 29, 28, 27, 26, 25, 24,
+	};
+
+	stats_bo = drm_intel_bo_alloc(bufmgr, "stats", 4096, 4096);
+
+	BEGIN_BATCH(3);
+	OUT_BATCH(GEN6_MI_REPORT_PERF_COUNT | (3 - 2));
+	OUT_RELOC(stats_bo,
+		  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+		  MI_COUNTER_ADDRESS_GTT);
+	OUT_BATCH(0);
+	ADVANCE_BATCH();
+
+	intel_batchbuffer_flush_on_ring(batch, I915_EXEC_RENDER);
+
+	drm_intel_bo_map(stats_bo, 0);
+	stats_result = stats_bo->virtual;
+	for (i = 0; i < GEN6_COUNTER_COUNT; i++) {
+		totals[i] += stats_result[buffer_index[i]] - last_counter[i];
+		last_counter[i] = stats_result[buffer_index[i]];
+	}
+
+	drm_intel_bo_unmap(stats_bo);
+	drm_intel_bo_unreference(stats_bo);
+}
+
 #define STATS_CHECK_FREQUENCY	100
 #define STATS_REPORT_FREQUENCY	2
 
@@ -131,6 +250,7 @@ int
 main(int argc, char **argv)
 {
 	uint32_t devid;
+	int counter_format;
 	int counter_count;
 	const char **counter_name;
 	void (*get_counters)(void);
@@ -138,6 +258,7 @@ main(int argc, char **argv)
 	char clear_screen[] = {0x1b, '[', 'H',
 			       0x1b, '[', 'J',
 			       0x0};
+	bool oacontrol = true;
 	int fd;
 	int l;
 
@@ -152,10 +273,27 @@ main(int argc, char **argv)
 		counter_name = gen5_counter_names;
 		counter_count = GEN5_COUNTER_COUNT;
 		get_counters = gen5_get_counters;
+		oacontrol = false;
+	} else if (IS_GEN6(devid)) {
+		counter_name = gen6_counter_names;
+		counter_count = GEN6_COUNTER_COUNT;
+		counter_format = gen6_counter_format;
+		get_counters = gen6_get_counters;
 	} else {
 		printf("This tool is not yet supported on your platform.\n");
 		abort();
 	}
+
+	if (oacontrol) {
+		/* Forcewake */
+		intel_register_access_init(intel_get_pci_device(), 0);
+
+		/* Enable performance counters */
+		intel_register_write(OACONTROL,
+			counter_format << OACONTROL_COUNTER_SELECT_SHIFT |
+			PERFORMANCE_COUNTER_ENABLE);
+	}
+
 	totals = calloc(counter_count, sizeof(uint32_t));
 	last_counter = calloc(counter_count, sizeof(uint32_t));
 
@@ -180,6 +318,14 @@ main(int argc, char **argv)
 		}
 	}
 
+	if (oacontrol) {
+		/* Disable performance counters */
+		intel_register_write(OACONTROL, 0);
+
+		/* Forcewake */
+		intel_register_access_fini();
+	}
+
 	free(totals);
 	free(last_counter);
 
-- 
1.8.2

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH 3/3] intel_perf_counters: Add support for Sandybridge.
  2013-03-27  5:06 ` [PATCH 3/3] intel_perf_counters: Add support for Sandybridge Kenneth Graunke
@ 2013-03-27 11:29   ` Daniel Vetter
  0 siblings, 0 replies; 4+ messages in thread
From: Daniel Vetter @ 2013-03-27 11:29 UTC (permalink / raw)
  To: Kenneth Graunke; +Cc: intel-gfx

On Tue, Mar 26, 2013 at 10:06:39PM -0700, Kenneth Graunke wrote:
> While the Sandybridge PRM doesn't have any documentation on the GPU's
> performance counters, a lot of information can be gleaned from the older
> Ironlake PRM.  Oddly, none of the information documented there actually
> appears to apply to Ironlake.  However, it apparently works just great
> on Sandybridge.
> 
> Since this information has all been publicly available on the internet
> for around three years, we can use it.
> 
> Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>

Merged, thanks for the patches.
-Daniel

> ---
>  tools/intel_perf_counters.c | 146 ++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 146 insertions(+)
> 
> diff --git a/tools/intel_perf_counters.c b/tools/intel_perf_counters.c
> index fd268b1..b528361 100644
> --- a/tools/intel_perf_counters.c
> +++ b/tools/intel_perf_counters.c
> @@ -22,9 +22,21 @@
>   *
>   * Authors:
>   *    Eric Anholt <eric@anholt.net>
> + *    Kenneth Graunke <kenneth@whitecape.org>
> + *
> + * While documentation for performance counters is suspiciously missing from the
> + * Sandybridge PRM, they were documented in Volume 1 Part 3 of the Ironlake PRM.
> + *
> + * A lot of the Ironlake PRM actually unintentionally documents Sandybridge
> + * due to mistakes made when updating the documentation for Gen6+.  Many of
> + * these mislabeled sections carried forward to the public documentation.
> + *
> + * The Ironlake PRMs have been publicly available since 2010 and are online at:
> + * https://01.org/linuxgraphics/documentation/2010-intel-core-processor-family
>   */
>  
>  #include <unistd.h>
> +#include <stdbool.h>
>  #include <stdlib.h>
>  #include <stdio.h>
>  #include <err.h>
> @@ -71,6 +83,60 @@ const char *gen5_counter_names[GEN5_COUNTER_COUNT] = {
>  	"cycles any EU is stalled for math",
>  };
>  
> +#define GEN6_COUNTER_COUNT 29
> +
> +/**
> + * Sandybridge: Counter Select = 001
> + * A0   A1   A2   A3   A4   TIMESTAMP RPT_ID
> + * A5   A6   A7   A8   A9   A10  A11  A12
> + * A13  A14  A15  A16  A17  A18  A19  A20
> + * A21  A22  A23  A24  A25  A26  A27  A28
> + */
> +const int gen6_counter_format = 1;
> +
> +/**
> + * Names for aggregating counters A0-A28.
> + *
> + * While the Ironlake PRM clearly documents that there are 29 counters (A0-A28),
> + * it only lists the names for 28 of them; one is missing.  However, careful
> + * examination reveals a pattern: there are five GS counters (Active, Stall,
> + * Core Stall, # threads loaded, and ready but not running time).  There are
> + * also five PS counters, in the same order.  But there are only four VS
> + * counters listed - the number of VS threads loaded is missing.  Presumably,
> + * it exists and is counter 5, and the rest are shifted over one place.
> + */
> +const char *gen6_counter_names[GEN6_COUNTER_COUNT] = {
> +	[0]  = "Aggregated Core Array Active",
> +	[1]  = "Aggregated Core Array Stalled",
> +	[2]  = "Vertex Shader Active Time",
> +	[3]  = "Vertex Shader Stall Time",
> +	[4]  = "Vertex Shader Stall Time - Core Stall",
> +	[5]  = "# VS threads loaded",
> +	[6]  = "Vertex Shader Ready but not running time",
> +	[7]  = "Geometry Shader Active Time",
> +	[8]  = "Geometry Shader Stall Time",
> +	[9]  = "Geometry Shader Stall Time - Core Stall",
> +	[10] = "# GS threads loaded",
> +	[11] = "Geometry Shader ready but not running Time",
> +	[12] = "Pixel Shader Active Time",
> +	[13] = "Pixel Shader Stall Time",
> +	[14] = "Pixel Shader Stall Time - Core Stall",
> +	[15] = "# PS threads loaded",
> +	[16] = "Pixel Shader ready but not running Time",
> +	[17] = "Early Z Test Pixels Passing",
> +	[18] = "Early Z Test Pixels Failing",
> +	[19] = "Early Stencil Test Pixels Passing",
> +	[20] = "Early Stencil Test Pixels Failing",
> +	[21] = "Pixel Kill Count",
> +	[22] = "Alpha Test Pixels Failed",
> +	[23] = "Post PS Stencil Pixels Failed",
> +	[24] = "Post PS Z buffer Pixels Failed",
> +	[25] = "Pixels/samples Written in the frame buffer",
> +	[26] = "GPU Busy",
> +	[27] = "CL active and not stalled",
> +	[28] = "SF active and stalled",
> +};
> +
>  int have_totals = 0;
>  uint32_t *totals;
>  uint32_t *last_counter;
> @@ -85,6 +151,20 @@ struct intel_batchbuffer *batch;
>  #define MI_COUNTER_ADDRESS_GTT	(1 << 0)
>  /* DW2: report ID */
>  
> +/**
> + * According to the Sandybridge PRM, Volume 1, Part 1, page 48,
> + * MI_REPORT_PERF_COUNT is now opcode 0x28.  The Ironlake PRM, Volume 1,
> + * Part 3 details how it works.
> + */
> +/* DW0 */
> +#define GEN6_MI_REPORT_PERF_COUNT (0x28 << 23)
> +/* DW1 and 2 are the same as above */
> +
> +/* OACONTROL exists on Gen6+ but is documented in the Ironlake PRM */
> +#define OACONTROL                       0x2360
> +# define OACONTROL_COUNTER_SELECT_SHIFT 2
> +# define PERFORMANCE_COUNTER_ENABLE     (1 << 0)
> +
>  static void
>  gen5_get_counters(void)
>  {
> @@ -124,6 +204,45 @@ gen5_get_counters(void)
>  	drm_intel_bo_unreference(stats_bo);
>  }
>  
> +static void
> +gen6_get_counters(void)
> +{
> +	int i;
> +	drm_intel_bo *stats_bo;
> +	uint32_t *stats_result;
> +
> +	/* Map from counter names to their index in the buffer object */
> +	static const int buffer_index[GEN6_COUNTER_COUNT] =
> +	{
> +		7,   6,  5,  4,  3,
> +		15, 14, 13, 12, 11, 10,  9,  8,
> +		23, 22, 21, 20, 19, 18, 17, 16,
> +		31, 30, 29, 28, 27, 26, 25, 24,
> +	};
> +
> +	stats_bo = drm_intel_bo_alloc(bufmgr, "stats", 4096, 4096);
> +
> +	BEGIN_BATCH(3);
> +	OUT_BATCH(GEN6_MI_REPORT_PERF_COUNT | (3 - 2));
> +	OUT_RELOC(stats_bo,
> +		  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
> +		  MI_COUNTER_ADDRESS_GTT);
> +	OUT_BATCH(0);
> +	ADVANCE_BATCH();
> +
> +	intel_batchbuffer_flush_on_ring(batch, I915_EXEC_RENDER);
> +
> +	drm_intel_bo_map(stats_bo, 0);
> +	stats_result = stats_bo->virtual;
> +	for (i = 0; i < GEN6_COUNTER_COUNT; i++) {
> +		totals[i] += stats_result[buffer_index[i]] - last_counter[i];
> +		last_counter[i] = stats_result[buffer_index[i]];
> +	}
> +
> +	drm_intel_bo_unmap(stats_bo);
> +	drm_intel_bo_unreference(stats_bo);
> +}
> +
>  #define STATS_CHECK_FREQUENCY	100
>  #define STATS_REPORT_FREQUENCY	2
>  
> @@ -131,6 +250,7 @@ int
>  main(int argc, char **argv)
>  {
>  	uint32_t devid;
> +	int counter_format;
>  	int counter_count;
>  	const char **counter_name;
>  	void (*get_counters)(void);
> @@ -138,6 +258,7 @@ main(int argc, char **argv)
>  	char clear_screen[] = {0x1b, '[', 'H',
>  			       0x1b, '[', 'J',
>  			       0x0};
> +	bool oacontrol = true;
>  	int fd;
>  	int l;
>  
> @@ -152,10 +273,27 @@ main(int argc, char **argv)
>  		counter_name = gen5_counter_names;
>  		counter_count = GEN5_COUNTER_COUNT;
>  		get_counters = gen5_get_counters;
> +		oacontrol = false;
> +	} else if (IS_GEN6(devid)) {
> +		counter_name = gen6_counter_names;
> +		counter_count = GEN6_COUNTER_COUNT;
> +		counter_format = gen6_counter_format;
> +		get_counters = gen6_get_counters;
>  	} else {
>  		printf("This tool is not yet supported on your platform.\n");
>  		abort();
>  	}
> +
> +	if (oacontrol) {
> +		/* Forcewake */
> +		intel_register_access_init(intel_get_pci_device(), 0);
> +
> +		/* Enable performance counters */
> +		intel_register_write(OACONTROL,
> +			counter_format << OACONTROL_COUNTER_SELECT_SHIFT |
> +			PERFORMANCE_COUNTER_ENABLE);
> +	}
> +
>  	totals = calloc(counter_count, sizeof(uint32_t));
>  	last_counter = calloc(counter_count, sizeof(uint32_t));
>  
> @@ -180,6 +318,14 @@ main(int argc, char **argv)
>  		}
>  	}
>  
> +	if (oacontrol) {
> +		/* Disable performance counters */
> +		intel_register_write(OACONTROL, 0);
> +
> +		/* Forcewake */
> +		intel_register_access_fini();
> +	}
> +
>  	free(totals);
>  	free(last_counter);
>  
> -- 
> 1.8.2
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2013-03-27 11:26 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-03-27  5:06 [PATCH 1/3] intel_perf_counters: a little tool for dumping performance counters Kenneth Graunke
2013-03-27  5:06 ` [PATCH 2/3] intel_perf_counters: Abstract out Ironlake-specific code Kenneth Graunke
2013-03-27  5:06 ` [PATCH 3/3] intel_perf_counters: Add support for Sandybridge Kenneth Graunke
2013-03-27 11:29   ` Daniel Vetter

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.