All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC i-g-t] intel-gpu-top: Rewrite the tool to be safe to use
@ 2018-03-28 18:29 ` Tvrtko Ursulin
  0 siblings, 0 replies; 57+ messages in thread
From: Tvrtko Ursulin @ 2018-03-28 18:29 UTC (permalink / raw)
  To: igt-dev; +Cc: Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

intel-gpu-top is a dangerous tool which can hang machines due to unsafe mmio
register access. This patch rewrites it to use only PMU.

Only overall command streamer busyness and GPU global data such as power
and frequencies are included in this new version.

For access to more GPU functional unit level data, an OA metric based tool
like gpu-top should be used instead.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: Petri Latvala <petri.latvala@intel.com>
---
 tools/Makefile.am     |   2 +
 tools/intel_gpu_top.c | 982 +++++++++++++++++++++-----------------------------
 tools/meson.build     |   6 +-
 3 files changed, 413 insertions(+), 577 deletions(-)

diff --git a/tools/Makefile.am b/tools/Makefile.am
index 09b6dbcc3ece..a0b016ddd7ff 100644
--- a/tools/Makefile.am
+++ b/tools/Makefile.am
@@ -28,6 +28,8 @@ intel_aubdump_la_LDFLAGS = -module -avoid-version -no-undefined
 intel_aubdump_la_SOURCES = aubdump.c
 intel_aubdump_la_LIBADD = $(top_builddir)/lib/libintel_tools.la -ldl
 
+intel_gpu_top_LDADD = $(top_builddir)/lib/libigt_perf.la
+
 bin_SCRIPTS = intel_aubdump
 CLEANFILES = $(bin_SCRIPTS)
 
diff --git a/tools/intel_gpu_top.c b/tools/intel_gpu_top.c
index 098e6ce3ff86..4eef634eb436 100644
--- a/tools/intel_gpu_top.c
+++ b/tools/intel_gpu_top.c
@@ -1,6 +1,5 @@
 /*
- * Copyright © 2007 Intel Corporation
- * Copyright © 2011 Intel Corporation
+ * Copyright © 2018 Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -18,701 +17,532 @@
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *    Eric Anholt <eric@anholt.net>
- *    Eugeni Dodonov <eugeni.dodonov@intel.com>
- *
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
  */
 
-#include "config.h"
-
-#include <inttypes.h>
-#include <unistd.h>
-#include <stdlib.h>
 #include <stdio.h>
-#include <err.h>
-#include <sys/ioctl.h>
-#include <sys/time.h>
-#include <sys/wait.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <stdint.h>
+#include <assert.h>
 #include <string.h>
-#ifdef HAVE_TERMIOS_H
-#include <termios.h>
-#endif
-#include "intel_io.h"
-#include "instdone.h"
-#include "intel_reg.h"
-#include "intel_chipset.h"
-#include "drmtest.h"
-
-#define  FORCEWAKE	    0xA18C
-#define  FORCEWAKE_ACK	    0x130090
-
-#define SAMPLES_PER_SEC             10000
-#define SAMPLES_TO_PERCENT_RATIO    (SAMPLES_PER_SEC / 100)
-
-#define MAX_NUM_TOP_BITS            100
-
-#define HAS_STATS_REGS(devid)		IS_965(devid)
-
-struct top_bit {
-	struct instdone_bit *bit;
-	int count;
-} top_bits[MAX_NUM_TOP_BITS];
-struct top_bit *top_bits_sorted[MAX_NUM_TOP_BITS];
-
-static uint32_t instdone, instdone1;
-
-static const char *bars[] = {
-	" ",
-	"▏",
-	"▎",
-	"▍",
-	"▌",
-	"▋",
-	"▊",
-	"▉",
-	"█"
-};
+#include <ctype.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <sys/ioctl.h>
+#include <errno.h>
+#include <math.h>
+#include <locale.h>
+
+#include "igt_perf.h"
 
-enum stats_counts {
-	IA_VERTICES,
-	IA_PRIMITIVES,
-	VS_INVOCATION,
-	GS_INVOCATION,
-	GS_PRIMITIVES,
-	CL_INVOCATION,
-	CL_PRIMITIVES,
-	PS_INVOCATION,
-	PS_DEPTH,
-	STATS_COUNT
+struct pmu_pair {
+	uint64_t cur;
+	uint64_t prev;
 };
 
-const uint32_t stats_regs[STATS_COUNT] = {
-	IA_VERTICES_COUNT_QW,
-	IA_PRIMITIVES_COUNT_QW,
-	VS_INVOCATION_COUNT_QW,
-	GS_INVOCATION_COUNT_QW,
-	GS_PRIMITIVES_COUNT_QW,
-	CL_INVOCATION_COUNT_QW,
-	CL_PRIMITIVES_COUNT_QW,
-	PS_INVOCATION_COUNT_QW,
-	PS_DEPTH_COUNT_QW,
+struct pmu_counter {
+	uint64_t config;
+	unsigned int idx;
+	struct pmu_pair val;
 };
 
-const char *stats_reg_names[STATS_COUNT] = {
-	"vert fetch",
-	"prim fetch",
-	"VS invocations",
-	"GS invocations",
-	"GS prims",
-	"CL invocations",
-	"CL prims",
-	"PS invocations",
-	"PS depth pass",
+struct engine {
+	const char *name;
+	struct pmu_counter busy;
+	struct pmu_counter wait;
+	struct pmu_counter sema;
 };
 
-uint64_t stats[STATS_COUNT];
-uint64_t last_stats[STATS_COUNT];
+struct engines {
+	unsigned int num_engines;
+	unsigned int num_counters;
+	DIR *root;
+	int fd;
+	struct pmu_pair ts;
 
-static unsigned long
-gettime(void)
-{
-    struct timeval t;
-    gettimeofday(&t, NULL);
-    return (t.tv_usec + (t.tv_sec * 1000000));
-}
+	int rapl_fd;
+	double rapl_scale;
 
-static int
-top_bits_sort(const void *a, const void *b)
+	struct pmu_counter freq_req;
+	struct pmu_counter freq_act;
+	struct pmu_counter irq;
+	struct pmu_counter rc6;
+	struct pmu_counter rapl;
+
+	struct engine engine;
+};
+
+static uint64_t
+get_pmu_config(int dirfd, const char *name, const char *counter)
 {
-	struct top_bit * const *bit_a = a;
-	struct top_bit * const *bit_b = b;
-	int a_count = (*bit_a)->count;
-	int b_count = (*bit_b)->count;
+	char buf[128], *p;
+	int fd, ret;
 
-	if (a_count < b_count)
-		return 1;
-	else if (a_count == b_count)
-		return 0;
-	else
+	ret = snprintf(buf, sizeof(buf), "%s-%s", name, counter);
+	if (ret < 0 || ret == sizeof(buf))
 		return -1;
-}
 
-static void
-update_idle_bit(struct top_bit *top_bit)
-{
-	uint32_t reg_val;
+	fd = openat(dirfd, buf, O_RDONLY);
+	if (fd < 0)
+		return -1;
 
-	if (top_bit->bit->reg == INSTDONE_1)
-		reg_val = instdone1;
-	else
-		reg_val = instdone;
+	ret = read(fd, buf, sizeof(buf));
+	close(fd);
+	if (ret <= 0)
+		return -1;
 
-	if ((reg_val & top_bit->bit->bit) == 0)
-		top_bit->count++;
-}
+	p = index(buf, '0');
+	if (!p)
+		return -1;
 
-static void
-print_clock(const char *name, int clock) {
-	if (clock == -1)
-		printf("%s clock: unknown", name);
-	else
-		printf("%s clock: %d Mhz", name, clock);
+	return strtoul(p, NULL, 0);
 }
 
-static int
-print_clock_info(struct pci_device *pci_dev)
+#define engine_ptr(engines, n) \
+	((struct engine *)((unsigned char *)(&engines->engine) + (n) * sizeof(struct engine)))
+
+static struct engines *discover_engines(void)
 {
-	uint32_t devid = pci_dev->device_id;
-	uint16_t gcfgc;
+	const char *sysfs_root = "/sys/devices/i915/events";
+	struct engines *engines;
+	struct dirent *dent;
+	int ret = 0;
+	DIR *d;
 
-	if (IS_GM45(devid)) {
-		int core_clock = -1;
+	engines = malloc(sizeof(struct engines));
+	if (!engines)
+		return NULL;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+	memset(engines, 0, sizeof(*engines));
 
-		switch (gcfgc & 0xf) {
-		case 8:
-			core_clock = 266;
-			break;
-		case 9:
-			core_clock = 320;
-			break;
-		case 11:
-			core_clock = 400;
-			break;
-		case 13:
-			core_clock = 533;
-			break;
-		}
-		print_clock("core", core_clock);
-	} else if (IS_965(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, sampler_clock = -1;
+	engines->num_engines = 0;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+	d = opendir(sysfs_root);
+	if (!d)
+		return NULL;
 
-		switch (gcfgc & 0xf) {
-		case 2:
-			render_clock = 250; sampler_clock = 267;
-			break;
-		case 3:
-			render_clock = 320; sampler_clock = 333;
-			break;
-		case 4:
-			render_clock = 400; sampler_clock = 444;
-			break;
-		case 5:
-			render_clock = 500; sampler_clock = 533;
+	while ((dent = readdir(d)) != NULL) {
+		const char *endswith = "-busy";
+		const unsigned int endlen = strlen(endswith);
+		struct engine *engine =
+				engine_ptr(engines, engines->num_engines);
+		char buf[256];
+
+		if (dent->d_type != DT_REG)
+			continue;
+
+		if (strlen(dent->d_name) >= sizeof(buf)) {
+			ret = -1;
 			break;
 		}
 
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("sampler", sampler_clock);
-	} else if (IS_945(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, display_clock = -1;
+		strcpy(buf, dent->d_name);
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+		/* xxxN-busy */
+		if (strlen(buf) < (endlen + 4))
+			continue;
+		if (strcmp(&buf[strlen(buf) - endlen], endswith))
+			continue;
 
-		switch (gcfgc & 0x7) {
-		case 0:
-			render_clock = 166;
-			break;
-		case 1:
-			render_clock = 200;
-			break;
-		case 3:
-			render_clock = 250;
-			break;
-		case 5:
-			render_clock = 400;
+		memset(engine, 0, sizeof(*engine));
+
+		buf[strlen(buf) - endlen] = 0;
+		engine->name = strdup(buf);
+		if (!engine->name) {
+			ret = -1;
 			break;
 		}
 
-		switch (gcfgc & 0x70) {
-		case 0:
-			display_clock = 200;
+		engine->busy.config = get_pmu_config(dirfd(d), engine->name,
+						     "busy");
+		if (engine->busy.config == -1) {
+			ret = -1;
 			break;
-		case 4:
-			display_clock = 320;
+		}
+
+		engines->num_engines++;
+		engines = realloc(engines, sizeof(struct engines) +
+				  engines->num_engines * sizeof(struct engine));
+		if (!engines) {
+			ret = -ENOMEM;
 			break;
 		}
-		if (gcfgc & (1 << 7))
-		    display_clock = 133;
+	}
 
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("display", display_clock);
-	} else if (IS_915(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, display_clock = -1;
+	if (ret)
+		free(engines);
+	else
+		engines->root = d;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+	return ret == 0 ? engines : NULL;
+}
 
-		switch (gcfgc & 0x7) {
-		case 0:
-			render_clock = 160;
-			break;
-		case 1:
-			render_clock = 190;
-			break;
-		case 4:
-			render_clock = 333;
-			break;
-		}
-		if (gcfgc & (1 << 13))
-		    render_clock = 133;
+static int
+filename_to_buf(const char *filename, char *buf, unsigned int bufsize)
+{
+	int fd;
+	ssize_t ret;
 
-		switch (gcfgc & 0x70) {
-		case 0:
-			display_clock = 190;
-			break;
-		case 4:
-			display_clock = 333;
-			break;
-		}
-		if (gcfgc & (1 << 7))
-		    display_clock = 133;
+	fd = open(filename, O_RDONLY);
+	if (fd < 0)
+		return -1;
 
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("display", display_clock);
-	}
+	ret = read(fd, buf, bufsize - 1);
+	close(fd);
+	if (ret < 1)
+		return -1;
 
+	buf[ret] = '\0';
 
-	printf("\n");
-	return -1;
+	return 0;
 }
 
-#define STATS_LEN (20)
-#define PERCENTAGE_BAR_END	(79 - STATS_LEN)
+static uint64_t filename_to_u64(const char *filename, int base)
+{
+	char buf[64], *b;
 
-static void
-print_percentage_bar(float percent, int cur_line_len)
+	if (filename_to_buf(filename, buf, sizeof(buf)))
+		return 0;
+
+	/*
+	 * Handle both single integer and key=value formats by skipping
+	 * leading non-digits.
+	 */
+	b = buf;
+	while (*b && !isdigit(*b))
+		b++;
+
+	return strtoull(b, NULL, base);
+}
+
+static uint64_t rapl_type_id(void)
 {
-	int bar_avail_len = (PERCENTAGE_BAR_END - cur_line_len - 1) * 8;
-	int bar_len = bar_avail_len * (percent + .5) / 100.0;
-	int i;
+	return filename_to_u64("/sys/devices/power/type", 10);
+}
 
-	for (i = bar_len; i >= 8; i -= 8) {
-		printf("%s", bars[8]);
-		cur_line_len++;
-	}
-	if (i) {
-		printf("%s", bars[i]);
-		cur_line_len++;
-	}
+static uint64_t rapl_gpu_power(void)
+{
+	return filename_to_u64("/sys/devices/power/events/energy-gpu", 0);
+}
+
+static double filename_to_double(const char *filename)
+{
+	char *oldlocale;
+	char buf[80];
+	double v;
+
+	if (filename_to_buf(filename, buf, sizeof(buf)))
+		return 0;
 
-	/* NB: We can't use a field width with utf8 so we manually
-	* guarantee a field with of 45 chars for any bar. */
-	printf("%*s", PERCENTAGE_BAR_END - cur_line_len, "");
+	oldlocale = setlocale(LC_ALL, "C");
+	v = strtod(buf, NULL);
+	setlocale(LC_ALL, oldlocale);
+
+	return v;
 }
 
-struct ring {
-	const char *name;
-	uint32_t mmio;
-	int head, tail, size;
-	uint64_t full;
-	int idle;
-};
+static double rapl_gpu_power_scale(void)
+{
+	return filename_to_double("/sys/devices/power/events/energy-gpu.scale");
+}
 
-static uint32_t ring_read(struct ring *ring, uint32_t reg)
+#define __open_pmu(engines, pmu, idx) \
+({ \
+	int fd__; \
+\
+	fd__ = perf_i915_open_group((pmu)->config, (engines)->fd); \
+	if (fd__ >= 0) { \
+		if ((engines)->fd == -1) \
+			(engines)->fd = fd__; \
+		(pmu)->idx = (idx)++; \
+		(engines)->num_counters++; \
+	} \
+\
+	fd__; \
+})
+
+static int pmu_init(struct engines *engines)
 {
-	return INREG(ring->mmio + reg);
+	unsigned int idx = 0;
+	unsigned int i;
+	int fd;
+
+	engines->fd = -1;
+	engines->num_counters = 0;
+
+	engines->freq_req.config = I915_PMU_REQUESTED_FREQUENCY;
+	fd = __open_pmu(engines, &engines->freq_req, idx);
+	if (fd < 0)
+		return -1;
+
+	engines->freq_act.config = I915_PMU_ACTUAL_FREQUENCY;
+	fd = __open_pmu(engines, &engines->freq_act, idx);
+	if (fd < 0)
+		return -1;
+
+	engines->irq.config = I915_PMU_INTERRUPTS;
+	fd = __open_pmu(engines, &engines->irq, idx);
+	if (fd < 0)
+		return -1;
+
+	engines->rc6.config = I915_PMU_RC6_RESIDENCY;
+	fd = __open_pmu(engines, &engines->rc6, idx);
+	if (fd < 0)
+		return -1;
+
+	for (i = 0; i < engines->num_engines; i++) {
+		struct engine *engine = engine_ptr(engines, i);
+		struct {
+			struct pmu_counter *pmu;
+			const char *counter;
+		} *cnt, counters[] = {
+			{ .pmu = &engine->busy, .counter = "busy" },
+			{ .pmu = &engine->wait, .counter = "wait" },
+			{ .pmu = &engine->sema, .counter = "sema" },
+			{ .pmu = NULL, .counter = NULL },
+		};
+
+		for (cnt = counters; cnt->pmu; cnt++) {
+			if (!cnt->pmu->config)
+				cnt->pmu->config =
+					get_pmu_config(dirfd(engines->root),
+						       engine->name,
+						       cnt->counter);
+			fd = __open_pmu(engines, cnt->pmu, idx);
+			if (fd < 0)
+				return -1;
+		}
+	}
+
+	engines->rapl_scale = rapl_gpu_power_scale();
+	if (engines->rapl_scale != NAN)
+		engines->rapl_scale *= 1e3; /* from nano to micro */
+	engines->rapl.config = rapl_gpu_power();
+	engines->rapl_fd = igt_perf_open(rapl_type_id(), engines->rapl.config);
+	if (engines->rapl_fd < 0)
+		return -1;
+
+	return 0;
 }
 
-static void ring_init(struct ring *ring)
+static uint64_t pmu_read_multi(int fd, unsigned int num, uint64_t *val)
 {
-	ring->size = (((ring_read(ring, RING_LEN) & RING_NR_PAGES) >> 12) + 1) * 4096;
+	uint64_t buf[2 + num];
+	unsigned int i;
+
+	assert(read(fd, buf, sizeof(buf)) == sizeof(buf));
+
+	for (i = 0; i < num; i++)
+		val[i] = buf[2 + i];
+
+	return buf[1];
 }
 
-static void ring_reset(struct ring *ring)
+static double pmu_calc(struct pmu_pair *p, double d, double t, double s)
 {
-	ring->idle = ring->full = 0;
+	double pct;
+
+	pct = p->cur - p->prev;
+	pct /= d;
+	pct /= t;
+	pct *= s;
+
+	if (s == 100.0 && pct > 100.0)
+		pct = 100.0;
+
+	return pct;
 }
 
-static void ring_sample(struct ring *ring)
+static uint64_t __pmu_read_single(int fd, uint64_t *ts)
 {
-	int full;
+	uint64_t data[2];
 
-	if (!ring->size)
-		return;
+	assert(read(fd, data, sizeof(data)) == sizeof(data));
 
-	ring->head = ring_read(ring, RING_HEAD) & HEAD_ADDR;
-	ring->tail = ring_read(ring, RING_TAIL) & TAIL_ADDR;
+	if (ts)
+		*ts = data[1];
 
-	if (ring->tail == ring->head)
-		ring->idle++;
+	return data[0];
+}
+
+static uint64_t pmu_read_single(int fd)
+{
+	return __pmu_read_single(fd, NULL);
+}
 
-	full = ring->tail - ring->head;
-	if (full < 0)
-		full += ring->size;
-	ring->full += full;
+static void __update_sample(struct pmu_counter *counter, uint64_t val)
+{
+	counter->val.prev = counter->val.cur;
+	counter->val.cur = val;
 }
 
-static void ring_print_header(FILE *out, struct ring *ring)
+static void update_sample(struct pmu_counter *counter, uint64_t *val)
 {
-    fprintf(out, "%.6s%%\tops\t",
-            ring->name
-          );
+	__update_sample(counter, val[counter->idx]);
 }
 
-static void ring_print(struct ring *ring, unsigned long samples_per_sec)
+static void pmu_sample(struct engines *engines)
 {
-	int percent_busy, len;
+	const int num_val = engines->num_counters;
+	uint64_t val[num_val];
+	unsigned int i;
+
+	engines->ts.prev = engines->ts.cur;
+	engines->ts.cur = pmu_read_multi(engines->fd, num_val, val);
+
+	__update_sample(&engines->rapl, pmu_read_single(engines->rapl_fd));
 
-	if (!ring->size)
-		return;
+	update_sample(&engines->freq_req, val);
+	update_sample(&engines->freq_act, val);
+	update_sample(&engines->irq, val);
+	update_sample(&engines->rc6, val);
 
-	percent_busy = 100 - 100 * ring->idle / samples_per_sec;
+	for (i = 0; i < engines->num_engines; i++) {
+		struct engine *engine = engine_ptr(engines, i);
 
-	len = printf("%25s busy: %3d%%: ", ring->name, percent_busy);
-	print_percentage_bar (percent_busy, len);
-	printf("%24s space: %d/%d\n",
-		   ring->name,
-		   (int)(ring->full / samples_per_sec),
-		   ring->size);
+		update_sample(&engine->busy, val);
+		update_sample(&engine->sema, val);
+		update_sample(&engine->wait, val);
+	}
 }
 
-static void ring_log(struct ring *ring, unsigned long samples_per_sec,
-		FILE *output)
+static const char *bars[] = { " ", "▏", "▎", "▍", "▌", "▋", "▊", "▉", "█" };
+
+static void
+print_percentage_bar(double percent, int max_len)
 {
-	if (ring->size)
-		fprintf(output, "%3d\t%d\t",
-			(int)(100 - 100 * ring->idle / samples_per_sec),
-			(int)(ring->full / samples_per_sec));
-	else
-		fprintf(output, "-1\t-1\t");
+	int bar_len = percent * (8 * (max_len - 2)) / 100.0;
+	int i;
+
+	putchar('|');
+
+	for (i = bar_len; i >= 8; i -= 8)
+		printf("%s", bars[8]);
+	if (i)
+		printf("%s", bars[i]);
+
+	for (i = 0; i < (max_len - 2 - (bar_len + 7) / 8); i++)
+		putchar(' ');
+
+	putchar('|');
 }
 
+#define DEFAULT_PERIOD_MS (1000)
+
 static void
 usage(const char *appname)
 {
 	printf("intel_gpu_top - Display a top-like summary of Intel GPU usage\n"
-			"\n"
-			"usage: %s [parameters]\n"
-			"\n"
-			"The following parameters apply:\n"
-			"[-s <samples>]       samples per seconds (default %d)\n"
-			"[-e <command>]       command to profile\n"
-			"[-o <file>]          output statistics to file. If file is '-',"
-			"                     run in batch mode and output statistics to stdio only \n"
-			"[-h]                 show this help screen\n"
-			"\n",
-			appname,
-			SAMPLES_PER_SEC
-		  );
-	return;
+		"\n"
+		"Usage: %s [parameters]\n"
+		"\n"
+		"\tThe following parameters are optional:\n"
+		"\t[-s <samples>]       refresh period in ms (default %ums)\n"
+		"\t[-h]                 show this help text\n"
+		"\n",
+		appname, DEFAULT_PERIOD_MS);
 }
 
 int main(int argc, char **argv)
 {
-	uint32_t devid;
-	struct pci_device *pci_dev;
-	struct ring render_ring = {
-		.name = "render",
-		.mmio = 0x2030,
-	}, bsd_ring = {
-		.name = "bitstream",
-		.mmio = 0x4030,
-	}, bsd6_ring = {
-		.name = "bitstream",
-		.mmio = 0x12030,
-	}, blt_ring = {
-		.name = "blitter",
-		.mmio = 0x22030,
-	};
-	int i, ch;
-	int samples_per_sec = SAMPLES_PER_SEC;
-	FILE *output = NULL;
-	double elapsed_time=0;
-	int print_headers=1;
-	pid_t child_pid=-1;
-	int child_stat;
-	char *cmd=NULL;
-	int interactive=1;
-
-	/* Parse options? */
-	while ((ch = getopt(argc, argv, "s:o:e:h")) != -1) {
+	unsigned int period_us = DEFAULT_PERIOD_MS * 1000;
+	int con_w = -1, con_h = -1;
+	struct engines *engines;
+	struct winsize ws;
+	unsigned int i;
+	int ret, ch;
+
+	/* Parse options */
+	while ((ch = getopt(argc, argv, "s:h")) != -1) {
 		switch (ch) {
-		case 'e': cmd = strdup(optarg);
-			break;
-		case 's': samples_per_sec = atoi(optarg);
-			if (samples_per_sec < 100) {
-				fprintf(stderr, "Error: samples per second must be >= 100\n");
-				exit(1);
-			}
-			break;
-		case 'o':
-			if (!strcmp(optarg, "-")) {
-				/* Running in non-interactive mode */
-				interactive = 0;
-				output = stdout;
-			}
-			else
-				output = fopen(optarg, "w");
-			if (!output)
-			{
-				perror("fopen");
-				exit(1);
-			}
+		case 's':
+			period_us = atoi(optarg) * 1000;
 			break;
 		case 'h':
 			usage(argv[0]);
 			exit(0);
-			break;
 		default:
-			fprintf(stderr, "Invalid flag %c!\n", (char)optopt);
+			fprintf(stderr, "Invalid option %c!\n", (char)optopt);
 			usage(argv[0]);
 			exit(1);
-			break;
 		}
 	}
 
-	pci_dev = intel_get_pci_device();
-	devid = pci_dev->device_id;
-	intel_mmio_use_pci_bar(pci_dev);
-	init_instdone_definitions(devid);
+	/* Get terminal size. */
+	if (ioctl(0, TIOCGWINSZ, &ws) != -1) {
+		con_w = ws.ws_col;
+		con_h = ws.ws_row;
+	}
 
-	/* Do we have a command to run? */
-	if (cmd != NULL) {
-		if (output) {
-			fprintf(output, "# Profiling: %s\n", cmd);
-			fflush(output);
-		}
-		child_pid = fork();
-		if (child_pid < 0) {
-			perror("fork");
-			exit(1);
-		}
-		else if (child_pid == 0) {
-			int res;
-			res = system(cmd);
-			if (res < 0)
-				perror("running command");
-			if (output) {
-				fflush(output);
-				fprintf(output, "# %s exited with status %d\n", cmd, res);
-				fflush(output);
-			}
-			free(cmd);
-			exit(0);
-		} else {
-			free(cmd);
-		}
+	engines = discover_engines();
+	if (!engines) {
+		fprintf(stderr, "Failed to detect engines!\n");
+		return 1;
 	}
 
-	for (i = 0; i < num_instdone_bits; i++) {
-		top_bits[i].bit = &instdone_bits[i];
-		top_bits[i].count = 0;
-		top_bits_sorted[i] = &top_bits[i];
+	ret = pmu_init(engines);
+	if (ret) {
+		fprintf(stderr, "Failed to initialize PMU!\n");
+		return 1;
 	}
 
-	/* Grab access to the registers */
-	intel_register_access_init(pci_dev, 0, -1);
+	pmu_sample(engines);
 
-	ring_init(&render_ring);
-	if (IS_GEN4(devid) || IS_GEN5(devid))
-		ring_init(&bsd_ring);
-	if (IS_GEN6(devid) || IS_GEN7(devid)) {
-		ring_init(&bsd6_ring);
-		ring_init(&blt_ring);
-	}
+	for (;;) {
+		double t, freq[2], irq, rc6, power;
+		int lines = 0;
 
-	/* Initialize GPU stats */
-	if (HAS_STATS_REGS(devid)) {
-		for (i = 0; i < STATS_COUNT; i++) {
-			uint32_t stats_high, stats_low, stats_high_2;
+		usleep(period_us);
 
-			do {
-				stats_high = INREG(stats_regs[i] + 4);
-				stats_low = INREG(stats_regs[i]);
-				stats_high_2 = INREG(stats_regs[i] + 4);
-			} while (stats_high != stats_high_2);
+		pmu_sample(engines);
+		t = (double)(engines->ts.cur - engines->ts.prev) / 1e9;
 
-			last_stats[i] = (uint64_t)stats_high << 32 |
-				stats_low;
-		}
-	}
+		printf("\033[H\033[J");
 
-	for (;;) {
-		int j;
-		unsigned long long t1, ti, tf, t2;
-		unsigned long long def_sleep = 1000000 / samples_per_sec;
-		unsigned long long last_samples_per_sec = samples_per_sec;
-		unsigned short int max_lines;
-		struct winsize ws;
-		char clear_screen[] = {0x1b, '[', 'H',
-				       0x1b, '[', 'J',
-				       0x0};
-		int percent;
-		int len;
-
-		t1 = gettime();
-
-		ring_reset(&render_ring);
-		ring_reset(&bsd_ring);
-		ring_reset(&bsd6_ring);
-		ring_reset(&blt_ring);
-
-		for (i = 0; i < samples_per_sec; i++) {
-			long long interval;
-			ti = gettime();
-			if (IS_965(devid)) {
-				instdone = INREG(INSTDONE_I965);
-				instdone1 = INREG(INSTDONE_1);
-			} else
-				instdone = INREG(INSTDONE);
-
-			for (j = 0; j < num_instdone_bits; j++)
-				update_idle_bit(&top_bits[j]);
-
-			ring_sample(&render_ring);
-			ring_sample(&bsd_ring);
-			ring_sample(&bsd6_ring);
-			ring_sample(&blt_ring);
-
-			tf = gettime();
-			if (tf - t1 >= 1000000) {
-				/* We are out of sync, bail out */
-				last_samples_per_sec = i+1;
-				break;
-			}
-			interval = def_sleep - (tf - ti);
-			if (interval > 0)
-				usleep(interval);
-		}
+		freq[0] = pmu_calc(&engines->freq_req.val, 1.0, t, 1);
+		freq[1] = pmu_calc(&engines->freq_act.val, 1.0, t, 1);
+		irq = pmu_calc(&engines->irq.val, 1.0, t, 1);
+		rc6 = pmu_calc(&engines->rc6.val, 1e9, t, 100);
+		power = pmu_calc(&engines->rapl.val, 1.0, t,
+				 engines->rapl_scale);
 
-		if (HAS_STATS_REGS(devid)) {
-			for (i = 0; i < STATS_COUNT; i++) {
-				uint32_t stats_high, stats_low, stats_high_2;
+		printf("intel-gpu-top - %4.0f/%4.0f MHz;  %3.0f%% RC6; %6.0fmW; %8.0f irqs/s\n",
+		       freq[0], freq[1], rc6, power, irq);
+		lines++;
 
-				do {
-					stats_high = INREG(stats_regs[i] + 4);
-					stats_low = INREG(stats_regs[i]);
-					stats_high_2 = INREG(stats_regs[i] + 4);
-				} while (stats_high != stats_high_2);
+		printf("\n");
+		lines++;
 
-				stats[i] = (uint64_t)stats_high << 32 |
-					stats_low;
-			}
-		}
+		for (i = 0; i < engines->num_engines && lines < con_h; i++) {
+			struct engine *engine = engine_ptr(engines, i);
+			unsigned int max_w = con_w - 1;
+			unsigned int len;
+			double val[2];
+			char buf[128];
 
-		qsort(top_bits_sorted, num_instdone_bits,
-		      sizeof(struct top_bit *), top_bits_sort);
-
-		/* Limit the number of lines printed to the terminal height so the
-		 * most important info (at the top) will stay on screen. */
-		max_lines = -1;
-		if (ioctl(0, TIOCGWINSZ, &ws) != -1)
-			max_lines = ws.ws_row - 6; /* exclude header lines */
-		if (max_lines >= num_instdone_bits)
-			max_lines = num_instdone_bits;
-
-		t2 = gettime();
-		elapsed_time += (t2 - t1) / 1000000.0;
-
-		if (interactive) {
-			printf("%s", clear_screen);
-			print_clock_info(pci_dev);
-
-			ring_print(&render_ring, last_samples_per_sec);
-			ring_print(&bsd_ring, last_samples_per_sec);
-			ring_print(&bsd6_ring, last_samples_per_sec);
-			ring_print(&blt_ring, last_samples_per_sec);
-
-			printf("\n%30s  %s\n", "task", "percent busy");
-			for (i = 0; i < max_lines; i++) {
-				if (top_bits_sorted[i]->count > 0) {
-					percent = (top_bits_sorted[i]->count * 100) /
-						last_samples_per_sec;
-					len = printf("%30s: %3d%%: ",
-							 top_bits_sorted[i]->bit->name,
-							 percent);
-					print_percentage_bar (percent, len);
-				} else {
-					printf("%*s", PERCENTAGE_BAR_END, "");
-				}
-
-				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-					printf("%13s: %llu (%lld/sec)",
-						   stats_reg_names[i],
-						   (long long)stats[i],
-						   (long long)(stats[i] - last_stats[i]));
-					last_stats[i] = stats[i];
-				} else {
-					if (!top_bits_sorted[i]->count)
-						break;
-				}
-				printf("\n");
-			}
-		}
-		if (output) {
-			/* Print headers for columns at first run */
-			if (print_headers) {
-				fprintf(output, "# time\t");
-				ring_print_header(output, &render_ring);
-				ring_print_header(output, &bsd_ring);
-				ring_print_header(output, &bsd6_ring);
-				ring_print_header(output, &blt_ring);
-				for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
-					if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-						fprintf(output, "%.6s\t",
-							   stats_reg_names[i]
-							   );
-					}
-					if (!top_bits[i].count)
-						continue;
-				}
-				fprintf(output, "\n");
-				print_headers = 0;
-			}
-
-			/* Print statistics */
-			fprintf(output, "%.2f\t", elapsed_time);
-			ring_log(&render_ring, last_samples_per_sec, output);
-			ring_log(&bsd_ring, last_samples_per_sec, output);
-			ring_log(&bsd6_ring, last_samples_per_sec, output);
-			ring_log(&blt_ring, last_samples_per_sec, output);
-
-			for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
-				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-					fprintf(output, "%"PRIu64"\t",
-						   stats[i] - last_stats[i]);
-					last_stats[i] = stats[i];
-				}
-					if (!top_bits[i].count)
-						continue;
-			}
-			fprintf(output, "\n");
-			fflush(output);
-		}
+			val[0] = pmu_calc(&engine->wait.val, 1e9, t, 100);
+			val[1] = pmu_calc(&engine->sema.val, 1e9, t, 100);
+			len = snprintf(buf, sizeof(buf),
+				       "%6.2f%% wait, %6.2f%% sema",
+				       val[0], val[1]);
 
-		for (i = 0; i < num_instdone_bits; i++) {
-			top_bits_sorted[i]->count = 0;
+			val[0] = pmu_calc(&engine->busy.val, 1e9, t, 100);
+			len += printf("%8s %6.2f%% ",
+				      engine->name, val[0]);
+			print_percentage_bar(val[0], max_w - len);
 
-			if (i < STATS_COUNT)
-				last_stats[i] = stats[i];
-		}
+			printf("%s\n", buf);
 
-		/* Check if child has gone */
-		if (child_pid > 0) {
-			int res;
-			if ((res = waitpid(child_pid, &child_stat, WNOHANG)) == -1) {
-				perror("waitpid");
-				exit(1);
-			}
-			if (res == 0)
-				continue;
-			if (WIFEXITED(child_stat))
-				break;
+			lines++;
 		}
-	}
 
-	fclose(output);
+		printf("\n");
+	}
 
-	intel_register_access_fini();
 	return 0;
 }
diff --git a/tools/meson.build b/tools/meson.build
index bd2d313d5156..a918eeb0bef1 100644
--- a/tools/meson.build
+++ b/tools/meson.build
@@ -23,7 +23,6 @@ tools_progs = [
 	'intel_gpu_frequency',
 	'intel_firmware_decode',
 	'intel_gpu_time',
-	'intel_gpu_top',
 	'intel_gtt',
 	'intel_guc_logger',
 	'intel_infoframes',
@@ -117,6 +116,11 @@ shared_library('intel_aubdump', 'aubdump.c',
 	       name_prefix : '',
 	       install : true)
 
+executable('intel_gpu_top', 'intel_gpu_top.c',
+	   install : true,
+	   install_rpath : rpathdir,
+	   dependencies : tool_deps + [ lib_igt_perf ])
+
 conf_data = configuration_data()
 conf_data.set('prefix', prefix)
 conf_data.set('exec_prefix', '${prefix}')
-- 
2.14.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [Intel-gfx] [RFC i-g-t] intel-gpu-top: Rewrite the tool to be safe to use
@ 2018-03-28 18:29 ` Tvrtko Ursulin
  0 siblings, 0 replies; 57+ messages in thread
From: Tvrtko Ursulin @ 2018-03-28 18:29 UTC (permalink / raw)
  To: igt-dev; +Cc: Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

intel-gpu-top is a dangerous tool which can hang machines due to unsafe mmio
register access. This patch rewrites it to use only PMU.

Only overall command streamer busyness and GPU global data such as power
and frequencies are included in this new version.

For access to more GPU functional unit level data, an OA metric based tool
like gpu-top should be used instead.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: Petri Latvala <petri.latvala@intel.com>
---
 tools/Makefile.am     |   2 +
 tools/intel_gpu_top.c | 982 +++++++++++++++++++++-----------------------------
 tools/meson.build     |   6 +-
 3 files changed, 413 insertions(+), 577 deletions(-)

diff --git a/tools/Makefile.am b/tools/Makefile.am
index 09b6dbcc3ece..a0b016ddd7ff 100644
--- a/tools/Makefile.am
+++ b/tools/Makefile.am
@@ -28,6 +28,8 @@ intel_aubdump_la_LDFLAGS = -module -avoid-version -no-undefined
 intel_aubdump_la_SOURCES = aubdump.c
 intel_aubdump_la_LIBADD = $(top_builddir)/lib/libintel_tools.la -ldl
 
+intel_gpu_top_LDADD = $(top_builddir)/lib/libigt_perf.la
+
 bin_SCRIPTS = intel_aubdump
 CLEANFILES = $(bin_SCRIPTS)
 
diff --git a/tools/intel_gpu_top.c b/tools/intel_gpu_top.c
index 098e6ce3ff86..4eef634eb436 100644
--- a/tools/intel_gpu_top.c
+++ b/tools/intel_gpu_top.c
@@ -1,6 +1,5 @@
 /*
- * Copyright © 2007 Intel Corporation
- * Copyright © 2011 Intel Corporation
+ * Copyright © 2018 Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -18,701 +17,532 @@
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *    Eric Anholt <eric@anholt.net>
- *    Eugeni Dodonov <eugeni.dodonov@intel.com>
- *
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
  */
 
-#include "config.h"
-
-#include <inttypes.h>
-#include <unistd.h>
-#include <stdlib.h>
 #include <stdio.h>
-#include <err.h>
-#include <sys/ioctl.h>
-#include <sys/time.h>
-#include <sys/wait.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <stdint.h>
+#include <assert.h>
 #include <string.h>
-#ifdef HAVE_TERMIOS_H
-#include <termios.h>
-#endif
-#include "intel_io.h"
-#include "instdone.h"
-#include "intel_reg.h"
-#include "intel_chipset.h"
-#include "drmtest.h"
-
-#define  FORCEWAKE	    0xA18C
-#define  FORCEWAKE_ACK	    0x130090
-
-#define SAMPLES_PER_SEC             10000
-#define SAMPLES_TO_PERCENT_RATIO    (SAMPLES_PER_SEC / 100)
-
-#define MAX_NUM_TOP_BITS            100
-
-#define HAS_STATS_REGS(devid)		IS_965(devid)
-
-struct top_bit {
-	struct instdone_bit *bit;
-	int count;
-} top_bits[MAX_NUM_TOP_BITS];
-struct top_bit *top_bits_sorted[MAX_NUM_TOP_BITS];
-
-static uint32_t instdone, instdone1;
-
-static const char *bars[] = {
-	" ",
-	"▏",
-	"▎",
-	"▍",
-	"▌",
-	"▋",
-	"▊",
-	"▉",
-	"█"
-};
+#include <ctype.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <sys/ioctl.h>
+#include <errno.h>
+#include <math.h>
+#include <locale.h>
+
+#include "igt_perf.h"
 
-enum stats_counts {
-	IA_VERTICES,
-	IA_PRIMITIVES,
-	VS_INVOCATION,
-	GS_INVOCATION,
-	GS_PRIMITIVES,
-	CL_INVOCATION,
-	CL_PRIMITIVES,
-	PS_INVOCATION,
-	PS_DEPTH,
-	STATS_COUNT
+struct pmu_pair {
+	uint64_t cur;
+	uint64_t prev;
 };
 
-const uint32_t stats_regs[STATS_COUNT] = {
-	IA_VERTICES_COUNT_QW,
-	IA_PRIMITIVES_COUNT_QW,
-	VS_INVOCATION_COUNT_QW,
-	GS_INVOCATION_COUNT_QW,
-	GS_PRIMITIVES_COUNT_QW,
-	CL_INVOCATION_COUNT_QW,
-	CL_PRIMITIVES_COUNT_QW,
-	PS_INVOCATION_COUNT_QW,
-	PS_DEPTH_COUNT_QW,
+struct pmu_counter {
+	uint64_t config;
+	unsigned int idx;
+	struct pmu_pair val;
 };
 
-const char *stats_reg_names[STATS_COUNT] = {
-	"vert fetch",
-	"prim fetch",
-	"VS invocations",
-	"GS invocations",
-	"GS prims",
-	"CL invocations",
-	"CL prims",
-	"PS invocations",
-	"PS depth pass",
+struct engine {
+	const char *name;
+	struct pmu_counter busy;
+	struct pmu_counter wait;
+	struct pmu_counter sema;
 };
 
-uint64_t stats[STATS_COUNT];
-uint64_t last_stats[STATS_COUNT];
+struct engines {
+	unsigned int num_engines;
+	unsigned int num_counters;
+	DIR *root;
+	int fd;
+	struct pmu_pair ts;
 
-static unsigned long
-gettime(void)
-{
-    struct timeval t;
-    gettimeofday(&t, NULL);
-    return (t.tv_usec + (t.tv_sec * 1000000));
-}
+	int rapl_fd;
+	double rapl_scale;
 
-static int
-top_bits_sort(const void *a, const void *b)
+	struct pmu_counter freq_req;
+	struct pmu_counter freq_act;
+	struct pmu_counter irq;
+	struct pmu_counter rc6;
+	struct pmu_counter rapl;
+
+	struct engine engine;
+};
+
+static uint64_t
+get_pmu_config(int dirfd, const char *name, const char *counter)
 {
-	struct top_bit * const *bit_a = a;
-	struct top_bit * const *bit_b = b;
-	int a_count = (*bit_a)->count;
-	int b_count = (*bit_b)->count;
+	char buf[128], *p;
+	int fd, ret;
 
-	if (a_count < b_count)
-		return 1;
-	else if (a_count == b_count)
-		return 0;
-	else
+	ret = snprintf(buf, sizeof(buf), "%s-%s", name, counter);
+	if (ret < 0 || ret == sizeof(buf))
 		return -1;
-}
 
-static void
-update_idle_bit(struct top_bit *top_bit)
-{
-	uint32_t reg_val;
+	fd = openat(dirfd, buf, O_RDONLY);
+	if (fd < 0)
+		return -1;
 
-	if (top_bit->bit->reg == INSTDONE_1)
-		reg_val = instdone1;
-	else
-		reg_val = instdone;
+	ret = read(fd, buf, sizeof(buf));
+	close(fd);
+	if (ret <= 0)
+		return -1;
 
-	if ((reg_val & top_bit->bit->bit) == 0)
-		top_bit->count++;
-}
+	p = index(buf, '0');
+	if (!p)
+		return -1;
 
-static void
-print_clock(const char *name, int clock) {
-	if (clock == -1)
-		printf("%s clock: unknown", name);
-	else
-		printf("%s clock: %d Mhz", name, clock);
+	return strtoul(p, NULL, 0);
 }
 
-static int
-print_clock_info(struct pci_device *pci_dev)
+#define engine_ptr(engines, n) \
+	((struct engine *)((unsigned char *)(&engines->engine) + (n) * sizeof(struct engine)))
+
+static struct engines *discover_engines(void)
 {
-	uint32_t devid = pci_dev->device_id;
-	uint16_t gcfgc;
+	const char *sysfs_root = "/sys/devices/i915/events";
+	struct engines *engines;
+	struct dirent *dent;
+	int ret = 0;
+	DIR *d;
 
-	if (IS_GM45(devid)) {
-		int core_clock = -1;
+	engines = malloc(sizeof(struct engines));
+	if (!engines)
+		return NULL;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+	memset(engines, 0, sizeof(*engines));
 
-		switch (gcfgc & 0xf) {
-		case 8:
-			core_clock = 266;
-			break;
-		case 9:
-			core_clock = 320;
-			break;
-		case 11:
-			core_clock = 400;
-			break;
-		case 13:
-			core_clock = 533;
-			break;
-		}
-		print_clock("core", core_clock);
-	} else if (IS_965(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, sampler_clock = -1;
+	engines->num_engines = 0;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+	d = opendir(sysfs_root);
+	if (!d)
+		return NULL;
 
-		switch (gcfgc & 0xf) {
-		case 2:
-			render_clock = 250; sampler_clock = 267;
-			break;
-		case 3:
-			render_clock = 320; sampler_clock = 333;
-			break;
-		case 4:
-			render_clock = 400; sampler_clock = 444;
-			break;
-		case 5:
-			render_clock = 500; sampler_clock = 533;
+	while ((dent = readdir(d)) != NULL) {
+		const char *endswith = "-busy";
+		const unsigned int endlen = strlen(endswith);
+		struct engine *engine =
+				engine_ptr(engines, engines->num_engines);
+		char buf[256];
+
+		if (dent->d_type != DT_REG)
+			continue;
+
+		if (strlen(dent->d_name) >= sizeof(buf)) {
+			ret = -1;
 			break;
 		}
 
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("sampler", sampler_clock);
-	} else if (IS_945(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, display_clock = -1;
+		strcpy(buf, dent->d_name);
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+		/* xxxN-busy */
+		if (strlen(buf) < (endlen + 4))
+			continue;
+		if (strcmp(&buf[strlen(buf) - endlen], endswith))
+			continue;
 
-		switch (gcfgc & 0x7) {
-		case 0:
-			render_clock = 166;
-			break;
-		case 1:
-			render_clock = 200;
-			break;
-		case 3:
-			render_clock = 250;
-			break;
-		case 5:
-			render_clock = 400;
+		memset(engine, 0, sizeof(*engine));
+
+		buf[strlen(buf) - endlen] = 0;
+		engine->name = strdup(buf);
+		if (!engine->name) {
+			ret = -1;
 			break;
 		}
 
-		switch (gcfgc & 0x70) {
-		case 0:
-			display_clock = 200;
+		engine->busy.config = get_pmu_config(dirfd(d), engine->name,
+						     "busy");
+		if (engine->busy.config == -1) {
+			ret = -1;
 			break;
-		case 4:
-			display_clock = 320;
+		}
+
+		engines->num_engines++;
+		engines = realloc(engines, sizeof(struct engines) +
+				  engines->num_engines * sizeof(struct engine));
+		if (!engines) {
+			ret = -ENOMEM;
 			break;
 		}
-		if (gcfgc & (1 << 7))
-		    display_clock = 133;
+	}
 
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("display", display_clock);
-	} else if (IS_915(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, display_clock = -1;
+	if (ret)
+		free(engines);
+	else
+		engines->root = d;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+	return ret == 0 ? engines : NULL;
+}
 
-		switch (gcfgc & 0x7) {
-		case 0:
-			render_clock = 160;
-			break;
-		case 1:
-			render_clock = 190;
-			break;
-		case 4:
-			render_clock = 333;
-			break;
-		}
-		if (gcfgc & (1 << 13))
-		    render_clock = 133;
+static int
+filename_to_buf(const char *filename, char *buf, unsigned int bufsize)
+{
+	int fd;
+	ssize_t ret;
 
-		switch (gcfgc & 0x70) {
-		case 0:
-			display_clock = 190;
-			break;
-		case 4:
-			display_clock = 333;
-			break;
-		}
-		if (gcfgc & (1 << 7))
-		    display_clock = 133;
+	fd = open(filename, O_RDONLY);
+	if (fd < 0)
+		return -1;
 
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("display", display_clock);
-	}
+	ret = read(fd, buf, bufsize - 1);
+	close(fd);
+	if (ret < 1)
+		return -1;
 
+	buf[ret] = '\0';
 
-	printf("\n");
-	return -1;
+	return 0;
 }
 
-#define STATS_LEN (20)
-#define PERCENTAGE_BAR_END	(79 - STATS_LEN)
+static uint64_t filename_to_u64(const char *filename, int base)
+{
+	char buf[64], *b;
 
-static void
-print_percentage_bar(float percent, int cur_line_len)
+	if (filename_to_buf(filename, buf, sizeof(buf)))
+		return 0;
+
+	/*
+	 * Handle both single integer and key=value formats by skipping
+	 * leading non-digits.
+	 */
+	b = buf;
+	while (*b && !isdigit(*b))
+		b++;
+
+	return strtoull(b, NULL, base);
+}
+
+static uint64_t rapl_type_id(void)
 {
-	int bar_avail_len = (PERCENTAGE_BAR_END - cur_line_len - 1) * 8;
-	int bar_len = bar_avail_len * (percent + .5) / 100.0;
-	int i;
+	return filename_to_u64("/sys/devices/power/type", 10);
+}
 
-	for (i = bar_len; i >= 8; i -= 8) {
-		printf("%s", bars[8]);
-		cur_line_len++;
-	}
-	if (i) {
-		printf("%s", bars[i]);
-		cur_line_len++;
-	}
+static uint64_t rapl_gpu_power(void)
+{
+	return filename_to_u64("/sys/devices/power/events/energy-gpu", 0);
+}
+
+static double filename_to_double(const char *filename)
+{
+	char *oldlocale;
+	char buf[80];
+	double v;
+
+	if (filename_to_buf(filename, buf, sizeof(buf)))
+		return 0;
 
-	/* NB: We can't use a field width with utf8 so we manually
-	* guarantee a field with of 45 chars for any bar. */
-	printf("%*s", PERCENTAGE_BAR_END - cur_line_len, "");
+	oldlocale = setlocale(LC_ALL, "C");
+	v = strtod(buf, NULL);
+	setlocale(LC_ALL, oldlocale);
+
+	return v;
 }
 
-struct ring {
-	const char *name;
-	uint32_t mmio;
-	int head, tail, size;
-	uint64_t full;
-	int idle;
-};
+static double rapl_gpu_power_scale(void)
+{
+	return filename_to_double("/sys/devices/power/events/energy-gpu.scale");
+}
 
-static uint32_t ring_read(struct ring *ring, uint32_t reg)
+#define __open_pmu(engines, pmu, idx) \
+({ \
+	int fd__; \
+\
+	fd__ = perf_i915_open_group((pmu)->config, (engines)->fd); \
+	if (fd__ >= 0) { \
+		if ((engines)->fd == -1) \
+			(engines)->fd = fd__; \
+		(pmu)->idx = (idx)++; \
+		(engines)->num_counters++; \
+	} \
+\
+	fd__; \
+})
+
+static int pmu_init(struct engines *engines)
 {
-	return INREG(ring->mmio + reg);
+	unsigned int idx = 0;
+	unsigned int i;
+	int fd;
+
+	engines->fd = -1;
+	engines->num_counters = 0;
+
+	engines->freq_req.config = I915_PMU_REQUESTED_FREQUENCY;
+	fd = __open_pmu(engines, &engines->freq_req, idx);
+	if (fd < 0)
+		return -1;
+
+	engines->freq_act.config = I915_PMU_ACTUAL_FREQUENCY;
+	fd = __open_pmu(engines, &engines->freq_act, idx);
+	if (fd < 0)
+		return -1;
+
+	engines->irq.config = I915_PMU_INTERRUPTS;
+	fd = __open_pmu(engines, &engines->irq, idx);
+	if (fd < 0)
+		return -1;
+
+	engines->rc6.config = I915_PMU_RC6_RESIDENCY;
+	fd = __open_pmu(engines, &engines->rc6, idx);
+	if (fd < 0)
+		return -1;
+
+	for (i = 0; i < engines->num_engines; i++) {
+		struct engine *engine = engine_ptr(engines, i);
+		struct {
+			struct pmu_counter *pmu;
+			const char *counter;
+		} *cnt, counters[] = {
+			{ .pmu = &engine->busy, .counter = "busy" },
+			{ .pmu = &engine->wait, .counter = "wait" },
+			{ .pmu = &engine->sema, .counter = "sema" },
+			{ .pmu = NULL, .counter = NULL },
+		};
+
+		for (cnt = counters; cnt->pmu; cnt++) {
+			if (!cnt->pmu->config)
+				cnt->pmu->config =
+					get_pmu_config(dirfd(engines->root),
+						       engine->name,
+						       cnt->counter);
+			fd = __open_pmu(engines, cnt->pmu, idx);
+			if (fd < 0)
+				return -1;
+		}
+	}
+
+	engines->rapl_scale = rapl_gpu_power_scale();
+	if (engines->rapl_scale != NAN)
+		engines->rapl_scale *= 1e3; /* from nano to micro */
+	engines->rapl.config = rapl_gpu_power();
+	engines->rapl_fd = igt_perf_open(rapl_type_id(), engines->rapl.config);
+	if (engines->rapl_fd < 0)
+		return -1;
+
+	return 0;
 }
 
-static void ring_init(struct ring *ring)
+static uint64_t pmu_read_multi(int fd, unsigned int num, uint64_t *val)
 {
-	ring->size = (((ring_read(ring, RING_LEN) & RING_NR_PAGES) >> 12) + 1) * 4096;
+	uint64_t buf[2 + num];
+	unsigned int i;
+
+	assert(read(fd, buf, sizeof(buf)) == sizeof(buf));
+
+	for (i = 0; i < num; i++)
+		val[i] = buf[2 + i];
+
+	return buf[1];
 }
 
-static void ring_reset(struct ring *ring)
+static double pmu_calc(struct pmu_pair *p, double d, double t, double s)
 {
-	ring->idle = ring->full = 0;
+	double pct;
+
+	pct = p->cur - p->prev;
+	pct /= d;
+	pct /= t;
+	pct *= s;
+
+	if (s == 100.0 && pct > 100.0)
+		pct = 100.0;
+
+	return pct;
 }
 
-static void ring_sample(struct ring *ring)
+static uint64_t __pmu_read_single(int fd, uint64_t *ts)
 {
-	int full;
+	uint64_t data[2];
 
-	if (!ring->size)
-		return;
+	assert(read(fd, data, sizeof(data)) == sizeof(data));
 
-	ring->head = ring_read(ring, RING_HEAD) & HEAD_ADDR;
-	ring->tail = ring_read(ring, RING_TAIL) & TAIL_ADDR;
+	if (ts)
+		*ts = data[1];
 
-	if (ring->tail == ring->head)
-		ring->idle++;
+	return data[0];
+}
+
+static uint64_t pmu_read_single(int fd)
+{
+	return __pmu_read_single(fd, NULL);
+}
 
-	full = ring->tail - ring->head;
-	if (full < 0)
-		full += ring->size;
-	ring->full += full;
+static void __update_sample(struct pmu_counter *counter, uint64_t val)
+{
+	counter->val.prev = counter->val.cur;
+	counter->val.cur = val;
 }
 
-static void ring_print_header(FILE *out, struct ring *ring)
+static void update_sample(struct pmu_counter *counter, uint64_t *val)
 {
-    fprintf(out, "%.6s%%\tops\t",
-            ring->name
-          );
+	__update_sample(counter, val[counter->idx]);
 }
 
-static void ring_print(struct ring *ring, unsigned long samples_per_sec)
+static void pmu_sample(struct engines *engines)
 {
-	int percent_busy, len;
+	const int num_val = engines->num_counters;
+	uint64_t val[num_val];
+	unsigned int i;
+
+	engines->ts.prev = engines->ts.cur;
+	engines->ts.cur = pmu_read_multi(engines->fd, num_val, val);
+
+	__update_sample(&engines->rapl, pmu_read_single(engines->rapl_fd));
 
-	if (!ring->size)
-		return;
+	update_sample(&engines->freq_req, val);
+	update_sample(&engines->freq_act, val);
+	update_sample(&engines->irq, val);
+	update_sample(&engines->rc6, val);
 
-	percent_busy = 100 - 100 * ring->idle / samples_per_sec;
+	for (i = 0; i < engines->num_engines; i++) {
+		struct engine *engine = engine_ptr(engines, i);
 
-	len = printf("%25s busy: %3d%%: ", ring->name, percent_busy);
-	print_percentage_bar (percent_busy, len);
-	printf("%24s space: %d/%d\n",
-		   ring->name,
-		   (int)(ring->full / samples_per_sec),
-		   ring->size);
+		update_sample(&engine->busy, val);
+		update_sample(&engine->sema, val);
+		update_sample(&engine->wait, val);
+	}
 }
 
-static void ring_log(struct ring *ring, unsigned long samples_per_sec,
-		FILE *output)
+static const char *bars[] = { " ", "▏", "▎", "▍", "▌", "▋", "▊", "▉", "█" };
+
+static void
+print_percentage_bar(double percent, int max_len)
 {
-	if (ring->size)
-		fprintf(output, "%3d\t%d\t",
-			(int)(100 - 100 * ring->idle / samples_per_sec),
-			(int)(ring->full / samples_per_sec));
-	else
-		fprintf(output, "-1\t-1\t");
+	int bar_len = percent * (8 * (max_len - 2)) / 100.0;
+	int i;
+
+	putchar('|');
+
+	for (i = bar_len; i >= 8; i -= 8)
+		printf("%s", bars[8]);
+	if (i)
+		printf("%s", bars[i]);
+
+	for (i = 0; i < (max_len - 2 - (bar_len + 7) / 8); i++)
+		putchar(' ');
+
+	putchar('|');
 }
 
+#define DEFAULT_PERIOD_MS (1000)
+
 static void
 usage(const char *appname)
 {
 	printf("intel_gpu_top - Display a top-like summary of Intel GPU usage\n"
-			"\n"
-			"usage: %s [parameters]\n"
-			"\n"
-			"The following parameters apply:\n"
-			"[-s <samples>]       samples per seconds (default %d)\n"
-			"[-e <command>]       command to profile\n"
-			"[-o <file>]          output statistics to file. If file is '-',"
-			"                     run in batch mode and output statistics to stdio only \n"
-			"[-h]                 show this help screen\n"
-			"\n",
-			appname,
-			SAMPLES_PER_SEC
-		  );
-	return;
+		"\n"
+		"Usage: %s [parameters]\n"
+		"\n"
+		"\tThe following parameters are optional:\n"
+		"\t[-s <samples>]       refresh period in ms (default %ums)\n"
+		"\t[-h]                 show this help text\n"
+		"\n",
+		appname, DEFAULT_PERIOD_MS);
 }
 
 int main(int argc, char **argv)
 {
-	uint32_t devid;
-	struct pci_device *pci_dev;
-	struct ring render_ring = {
-		.name = "render",
-		.mmio = 0x2030,
-	}, bsd_ring = {
-		.name = "bitstream",
-		.mmio = 0x4030,
-	}, bsd6_ring = {
-		.name = "bitstream",
-		.mmio = 0x12030,
-	}, blt_ring = {
-		.name = "blitter",
-		.mmio = 0x22030,
-	};
-	int i, ch;
-	int samples_per_sec = SAMPLES_PER_SEC;
-	FILE *output = NULL;
-	double elapsed_time=0;
-	int print_headers=1;
-	pid_t child_pid=-1;
-	int child_stat;
-	char *cmd=NULL;
-	int interactive=1;
-
-	/* Parse options? */
-	while ((ch = getopt(argc, argv, "s:o:e:h")) != -1) {
+	unsigned int period_us = DEFAULT_PERIOD_MS * 1000;
+	int con_w = -1, con_h = -1;
+	struct engines *engines;
+	struct winsize ws;
+	unsigned int i;
+	int ret, ch;
+
+	/* Parse options */
+	while ((ch = getopt(argc, argv, "s:h")) != -1) {
 		switch (ch) {
-		case 'e': cmd = strdup(optarg);
-			break;
-		case 's': samples_per_sec = atoi(optarg);
-			if (samples_per_sec < 100) {
-				fprintf(stderr, "Error: samples per second must be >= 100\n");
-				exit(1);
-			}
-			break;
-		case 'o':
-			if (!strcmp(optarg, "-")) {
-				/* Running in non-interactive mode */
-				interactive = 0;
-				output = stdout;
-			}
-			else
-				output = fopen(optarg, "w");
-			if (!output)
-			{
-				perror("fopen");
-				exit(1);
-			}
+		case 's':
+			period_us = atoi(optarg) * 1000;
 			break;
 		case 'h':
 			usage(argv[0]);
 			exit(0);
-			break;
 		default:
-			fprintf(stderr, "Invalid flag %c!\n", (char)optopt);
+			fprintf(stderr, "Invalid option %c!\n", (char)optopt);
 			usage(argv[0]);
 			exit(1);
-			break;
 		}
 	}
 
-	pci_dev = intel_get_pci_device();
-	devid = pci_dev->device_id;
-	intel_mmio_use_pci_bar(pci_dev);
-	init_instdone_definitions(devid);
+	/* Get terminal size. */
+	if (ioctl(0, TIOCGWINSZ, &ws) != -1) {
+		con_w = ws.ws_col;
+		con_h = ws.ws_row;
+	}
 
-	/* Do we have a command to run? */
-	if (cmd != NULL) {
-		if (output) {
-			fprintf(output, "# Profiling: %s\n", cmd);
-			fflush(output);
-		}
-		child_pid = fork();
-		if (child_pid < 0) {
-			perror("fork");
-			exit(1);
-		}
-		else if (child_pid == 0) {
-			int res;
-			res = system(cmd);
-			if (res < 0)
-				perror("running command");
-			if (output) {
-				fflush(output);
-				fprintf(output, "# %s exited with status %d\n", cmd, res);
-				fflush(output);
-			}
-			free(cmd);
-			exit(0);
-		} else {
-			free(cmd);
-		}
+	engines = discover_engines();
+	if (!engines) {
+		fprintf(stderr, "Failed to detect engines!\n");
+		return 1;
 	}
 
-	for (i = 0; i < num_instdone_bits; i++) {
-		top_bits[i].bit = &instdone_bits[i];
-		top_bits[i].count = 0;
-		top_bits_sorted[i] = &top_bits[i];
+	ret = pmu_init(engines);
+	if (ret) {
+		fprintf(stderr, "Failed to initialize PMU!\n");
+		return 1;
 	}
 
-	/* Grab access to the registers */
-	intel_register_access_init(pci_dev, 0, -1);
+	pmu_sample(engines);
 
-	ring_init(&render_ring);
-	if (IS_GEN4(devid) || IS_GEN5(devid))
-		ring_init(&bsd_ring);
-	if (IS_GEN6(devid) || IS_GEN7(devid)) {
-		ring_init(&bsd6_ring);
-		ring_init(&blt_ring);
-	}
+	for (;;) {
+		double t, freq[2], irq, rc6, power;
+		int lines = 0;
 
-	/* Initialize GPU stats */
-	if (HAS_STATS_REGS(devid)) {
-		for (i = 0; i < STATS_COUNT; i++) {
-			uint32_t stats_high, stats_low, stats_high_2;
+		usleep(period_us);
 
-			do {
-				stats_high = INREG(stats_regs[i] + 4);
-				stats_low = INREG(stats_regs[i]);
-				stats_high_2 = INREG(stats_regs[i] + 4);
-			} while (stats_high != stats_high_2);
+		pmu_sample(engines);
+		t = (double)(engines->ts.cur - engines->ts.prev) / 1e9;
 
-			last_stats[i] = (uint64_t)stats_high << 32 |
-				stats_low;
-		}
-	}
+		printf("\033[H\033[J");
 
-	for (;;) {
-		int j;
-		unsigned long long t1, ti, tf, t2;
-		unsigned long long def_sleep = 1000000 / samples_per_sec;
-		unsigned long long last_samples_per_sec = samples_per_sec;
-		unsigned short int max_lines;
-		struct winsize ws;
-		char clear_screen[] = {0x1b, '[', 'H',
-				       0x1b, '[', 'J',
-				       0x0};
-		int percent;
-		int len;
-
-		t1 = gettime();
-
-		ring_reset(&render_ring);
-		ring_reset(&bsd_ring);
-		ring_reset(&bsd6_ring);
-		ring_reset(&blt_ring);
-
-		for (i = 0; i < samples_per_sec; i++) {
-			long long interval;
-			ti = gettime();
-			if (IS_965(devid)) {
-				instdone = INREG(INSTDONE_I965);
-				instdone1 = INREG(INSTDONE_1);
-			} else
-				instdone = INREG(INSTDONE);
-
-			for (j = 0; j < num_instdone_bits; j++)
-				update_idle_bit(&top_bits[j]);
-
-			ring_sample(&render_ring);
-			ring_sample(&bsd_ring);
-			ring_sample(&bsd6_ring);
-			ring_sample(&blt_ring);
-
-			tf = gettime();
-			if (tf - t1 >= 1000000) {
-				/* We are out of sync, bail out */
-				last_samples_per_sec = i+1;
-				break;
-			}
-			interval = def_sleep - (tf - ti);
-			if (interval > 0)
-				usleep(interval);
-		}
+		freq[0] = pmu_calc(&engines->freq_req.val, 1.0, t, 1);
+		freq[1] = pmu_calc(&engines->freq_act.val, 1.0, t, 1);
+		irq = pmu_calc(&engines->irq.val, 1.0, t, 1);
+		rc6 = pmu_calc(&engines->rc6.val, 1e9, t, 100);
+		power = pmu_calc(&engines->rapl.val, 1.0, t,
+				 engines->rapl_scale);
 
-		if (HAS_STATS_REGS(devid)) {
-			for (i = 0; i < STATS_COUNT; i++) {
-				uint32_t stats_high, stats_low, stats_high_2;
+		printf("intel-gpu-top - %4.0f/%4.0f MHz;  %3.0f%% RC6; %6.0fmW; %8.0f irqs/s\n",
+		       freq[0], freq[1], rc6, power, irq);
+		lines++;
 
-				do {
-					stats_high = INREG(stats_regs[i] + 4);
-					stats_low = INREG(stats_regs[i]);
-					stats_high_2 = INREG(stats_regs[i] + 4);
-				} while (stats_high != stats_high_2);
+		printf("\n");
+		lines++;
 
-				stats[i] = (uint64_t)stats_high << 32 |
-					stats_low;
-			}
-		}
+		for (i = 0; i < engines->num_engines && lines < con_h; i++) {
+			struct engine *engine = engine_ptr(engines, i);
+			unsigned int max_w = con_w - 1;
+			unsigned int len;
+			double val[2];
+			char buf[128];
 
-		qsort(top_bits_sorted, num_instdone_bits,
-		      sizeof(struct top_bit *), top_bits_sort);
-
-		/* Limit the number of lines printed to the terminal height so the
-		 * most important info (at the top) will stay on screen. */
-		max_lines = -1;
-		if (ioctl(0, TIOCGWINSZ, &ws) != -1)
-			max_lines = ws.ws_row - 6; /* exclude header lines */
-		if (max_lines >= num_instdone_bits)
-			max_lines = num_instdone_bits;
-
-		t2 = gettime();
-		elapsed_time += (t2 - t1) / 1000000.0;
-
-		if (interactive) {
-			printf("%s", clear_screen);
-			print_clock_info(pci_dev);
-
-			ring_print(&render_ring, last_samples_per_sec);
-			ring_print(&bsd_ring, last_samples_per_sec);
-			ring_print(&bsd6_ring, last_samples_per_sec);
-			ring_print(&blt_ring, last_samples_per_sec);
-
-			printf("\n%30s  %s\n", "task", "percent busy");
-			for (i = 0; i < max_lines; i++) {
-				if (top_bits_sorted[i]->count > 0) {
-					percent = (top_bits_sorted[i]->count * 100) /
-						last_samples_per_sec;
-					len = printf("%30s: %3d%%: ",
-							 top_bits_sorted[i]->bit->name,
-							 percent);
-					print_percentage_bar (percent, len);
-				} else {
-					printf("%*s", PERCENTAGE_BAR_END, "");
-				}
-
-				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-					printf("%13s: %llu (%lld/sec)",
-						   stats_reg_names[i],
-						   (long long)stats[i],
-						   (long long)(stats[i] - last_stats[i]));
-					last_stats[i] = stats[i];
-				} else {
-					if (!top_bits_sorted[i]->count)
-						break;
-				}
-				printf("\n");
-			}
-		}
-		if (output) {
-			/* Print headers for columns at first run */
-			if (print_headers) {
-				fprintf(output, "# time\t");
-				ring_print_header(output, &render_ring);
-				ring_print_header(output, &bsd_ring);
-				ring_print_header(output, &bsd6_ring);
-				ring_print_header(output, &blt_ring);
-				for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
-					if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-						fprintf(output, "%.6s\t",
-							   stats_reg_names[i]
-							   );
-					}
-					if (!top_bits[i].count)
-						continue;
-				}
-				fprintf(output, "\n");
-				print_headers = 0;
-			}
-
-			/* Print statistics */
-			fprintf(output, "%.2f\t", elapsed_time);
-			ring_log(&render_ring, last_samples_per_sec, output);
-			ring_log(&bsd_ring, last_samples_per_sec, output);
-			ring_log(&bsd6_ring, last_samples_per_sec, output);
-			ring_log(&blt_ring, last_samples_per_sec, output);
-
-			for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
-				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-					fprintf(output, "%"PRIu64"\t",
-						   stats[i] - last_stats[i]);
-					last_stats[i] = stats[i];
-				}
-					if (!top_bits[i].count)
-						continue;
-			}
-			fprintf(output, "\n");
-			fflush(output);
-		}
+			val[0] = pmu_calc(&engine->wait.val, 1e9, t, 100);
+			val[1] = pmu_calc(&engine->sema.val, 1e9, t, 100);
+			len = snprintf(buf, sizeof(buf),
+				       "%6.2f%% wait, %6.2f%% sema",
+				       val[0], val[1]);
 
-		for (i = 0; i < num_instdone_bits; i++) {
-			top_bits_sorted[i]->count = 0;
+			val[0] = pmu_calc(&engine->busy.val, 1e9, t, 100);
+			len += printf("%8s %6.2f%% ",
+				      engine->name, val[0]);
+			print_percentage_bar(val[0], max_w - len);
 
-			if (i < STATS_COUNT)
-				last_stats[i] = stats[i];
-		}
+			printf("%s\n", buf);
 
-		/* Check if child has gone */
-		if (child_pid > 0) {
-			int res;
-			if ((res = waitpid(child_pid, &child_stat, WNOHANG)) == -1) {
-				perror("waitpid");
-				exit(1);
-			}
-			if (res == 0)
-				continue;
-			if (WIFEXITED(child_stat))
-				break;
+			lines++;
 		}
-	}
 
-	fclose(output);
+		printf("\n");
+	}
 
-	intel_register_access_fini();
 	return 0;
 }
diff --git a/tools/meson.build b/tools/meson.build
index bd2d313d5156..a918eeb0bef1 100644
--- a/tools/meson.build
+++ b/tools/meson.build
@@ -23,7 +23,6 @@ tools_progs = [
 	'intel_gpu_frequency',
 	'intel_firmware_decode',
 	'intel_gpu_time',
-	'intel_gpu_top',
 	'intel_gtt',
 	'intel_guc_logger',
 	'intel_infoframes',
@@ -117,6 +116,11 @@ shared_library('intel_aubdump', 'aubdump.c',
 	       name_prefix : '',
 	       install : true)
 
+executable('intel_gpu_top', 'intel_gpu_top.c',
+	   install : true,
+	   install_rpath : rpathdir,
+	   dependencies : tool_deps + [ lib_igt_perf ])
+
 conf_data = configuration_data()
 conf_data.set('prefix', prefix)
 conf_data.set('exec_prefix', '${prefix}')
-- 
2.14.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [RFC i-g-t] intel-gpu-top: Rewrite the tool to be safe to use
  2018-03-28 18:29 ` [Intel-gfx] " Tvrtko Ursulin
@ 2018-03-28 18:35   ` Chris Wilson
  -1 siblings, 0 replies; 57+ messages in thread
From: Chris Wilson @ 2018-03-28 18:35 UTC (permalink / raw)
  To: Tvrtko Ursulin, igt-dev; +Cc: Intel-gfx

Quoting Tvrtko Ursulin (2018-03-28 19:29:48)
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> intel-gpu-top is a dangerous tool which can hang machines due unsafe mmio
> register access. This patch rewrites it to use only PMU.
> 
> Only overall command streamer busyness and GPU global data such as power
> and frequencies are included in this new version.
> 
> For access to more GPU functional unit level data, an OA metric based tool
> like gpu-top should be used instead.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
> Cc: Petri Latvala <petri.latvala@intel.com>

Looked good to me.

Someone might complain about the loss of "stats" mode, but we can point
them towards "perf record" now.

Half-reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [igt-dev] [RFC i-g-t] intel-gpu-top: Rewrite the tool to be safe to use
@ 2018-03-28 18:35   ` Chris Wilson
  0 siblings, 0 replies; 57+ messages in thread
From: Chris Wilson @ 2018-03-28 18:35 UTC (permalink / raw)
  To: Tvrtko Ursulin, igt-dev; +Cc: Intel-gfx, Tvrtko Ursulin

Quoting Tvrtko Ursulin (2018-03-28 19:29:48)
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> intel-gpu-top is a dangerous tool which can hang machines due unsafe mmio
> register access. This patch rewrites it to use only PMU.
> 
> Only overall command streamer busyness and GPU global data such as power
> and frequencies are included in this new version.
> 
> For access to more GPU functional unit level data, an OA metric based tool
> like gpu-top should be used instead.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
> Cc: Petri Latvala <petri.latvala@intel.com>

Looked good to me.

Someone might complain about the loss of "stats" mode, but we can point
them towards "perf record" now.

Half-reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
-Chris
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC i-g-t] intel-gpu-top: Rewrite the tool to be safe to use
  2018-03-28 18:29 ` [Intel-gfx] " Tvrtko Ursulin
@ 2018-03-28 18:56   ` Lionel Landwerlin
  -1 siblings, 0 replies; 57+ messages in thread
From: Lionel Landwerlin @ 2018-03-28 18:56 UTC (permalink / raw)
  To: Tvrtko Ursulin, igt-dev; +Cc: Intel-gfx


[-- Attachment #1.1: Type: text/plain, Size: 833 bytes --]

On 28/03/18 19:29, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin<tvrtko.ursulin@intel.com>
>
> intel-gpu-top is a dangerous tool which can hang machines due unsafe mmio
> register access. This patch rewrites it to use only PMU.
>
> Only overall command streamer busyness and GPU global data such as power
> and frequencies are included in this new version.
>
> For access to more GPU functional unit level data, an OA metric based tool
> like gpu-top should be used instead.
>
> Signed-off-by: Tvrtko Ursulin<tvrtko.ursulin@intel.com>
> Cc: Chris Wilson<chris@chris-wilson.co.uk>
> Cc: Lionel Landwerlin<lionel.g.landwerlin@intel.com>
> Cc: Petri Latvala<petri.latvala@intel.com>

Looks good to me too. Sorry, this isn't a detailed review, but since 
it's a lot safer :

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>

[-- Attachment #1.2: Type: text/html, Size: 1819 bytes --]

[-- Attachment #2: Type: text/plain, Size: 160 bytes --]

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [igt-dev] [RFC i-g-t] intel-gpu-top: Rewrite the tool to be safe to use
@ 2018-03-28 18:56   ` Lionel Landwerlin
  0 siblings, 0 replies; 57+ messages in thread
From: Lionel Landwerlin @ 2018-03-28 18:56 UTC (permalink / raw)
  To: Tvrtko Ursulin, igt-dev; +Cc: Intel-gfx, Tvrtko Ursulin


[-- Attachment #1.1: Type: text/plain, Size: 833 bytes --]

On 28/03/18 19:29, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin<tvrtko.ursulin@intel.com>
>
> intel-gpu-top is a dangerous tool which can hang machines due to unsafe mmio
> register access. This patch rewrites it to use only PMU.
>
> Only overall command streamer busyness and GPU global data such as power
> and frequencies are included in this new version.
>
> For access to more GPU functional unit level data, an OA metric based tool
> like gpu-top should be used instead.
>
> Signed-off-by: Tvrtko Ursulin<tvrtko.ursulin@intel.com>
> Cc: Chris Wilson<chris@chris-wilson.co.uk>
> Cc: Lionel Landwerlin<lionel.g.landwerlin@intel.com>
> Cc: Petri Latvala<petri.latvala@intel.com>

Looks good to me too. Sorry, this isn't a detailed review, but since
it's a lot safer:

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>

[-- Attachment #1.2: Type: text/html, Size: 1819 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [igt-dev] [RFC i-g-t] intel-gpu-top: Rewrite the tool to be safe to use
  2018-03-28 18:29 ` [Intel-gfx] " Tvrtko Ursulin
                   ` (2 preceding siblings ...)
  (?)
@ 2018-03-28 20:11 ` Rinat Ibragimov
  2018-03-29 10:49     ` [Intel-gfx] " Tvrtko Ursulin
  -1 siblings, 1 reply; 57+ messages in thread
From: Rinat Ibragimov @ 2018-03-28 20:11 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev, Intel-gfx


>Среда, 28 марта 2018, 21:30 +03:00 от Tvrtko Ursulin <tursulin@ursulin.net>:
>

>+static struct engines *discover_engines(void)
> {
>-uint32_t devid = pci_dev->device_id;
>-uint16_t gcfgc;
>+const char *sysfs_root = "/sys/devices/i915/events";

Just a question.
I think I have Linux 4.15.11 (from Debian testing) now, and there are no such files.
Are there any estimates about when this feature is expected to be available?

>-static void ring_init(struct ring *ring)
>+static uint64_t pmu_read_multi(int fd, unsigned int num, uint64_t *val)
> {
>-ring->size = (((ring_read(ring, RING_LEN) & RING_NR_PAGES) >> 12) + 1) * 4096;
>+uint64_t buf[2 + num];
>+unsigned int i;
>+
>+assert(read(fd, buf, sizeof(buf)) == sizeof(buf));

This will have undesired effects with NDEBUG: assert() then expands to nothing,
so the read() inside it is never executed.

>-int full;
>+uint64_t data[2];
> 
>-if (!ring->size)
>-return;
>+assert(read(fd, data, sizeof(data)) == sizeof(data));

Same here.

>+/* Get terminal size. */
>+if (ioctl(0, TIOCGWINSZ, &ws) != -1) {
>+con_w = ws.ws_col;
>+con_h = ws.ws_row;
>+}

If you move this into the loop itself, the tool will adapt to changes in
terminal width and height dynamically.

>-- 
>2.14.1
>
>_______________________________________________
>igt-dev mailing list
>igt-dev@lists.freedesktop.org
>https://lists.freedesktop.org/mailman/listinfo/igt-dev

---
Rinat
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [igt-dev] ✓ Fi.CI.BAT: success for intel-gpu-top: Rewrite the tool to be safe to use
  2018-03-28 18:29 ` [Intel-gfx] " Tvrtko Ursulin
                   ` (3 preceding siblings ...)
  (?)
@ 2018-03-29  0:40 ` Patchwork
  -1 siblings, 0 replies; 57+ messages in thread
From: Patchwork @ 2018-03-29  0:40 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev

== Series Details ==

Series: intel-gpu-top: Rewrite the tool to be safe to use
URL   : https://patchwork.freedesktop.org/series/40826/
State : success

== Summary ==

IGT patchset tested on top of latest successful build
2cbd4ddf11b3eaf01f33d8bc2ad46411ec6c299a lib/igt_kms: Improve connector probing in igt_display_init(), v2.

with latest DRM-Tip kernel build CI_DRM_4005
4668e88d6607 drm-tip: 2018y-03m-28d-20h-45m-29s UTC integration manifest

No testlist changes.

---- Possible new issues:

Test kms_pipe_crc_basic:
        Subgroup nonblocking-crc-pipe-a-frame-sequence:
                fail       -> PASS       (fi-cfl-s3)
        Subgroup read-crc-pipe-c-frame-sequence:
                fail       -> PASS       (fi-cfl-s3)

---- Known issues:

Test kms_flip:
        Subgroup basic-flip-vs-wf_vblank:
                fail       -> PASS       (fi-cfl-s3) fdo#100368

fdo#100368 https://bugs.freedesktop.org/show_bug.cgi?id=100368

fi-bdw-5557u     total:285  pass:264  dwarn:0   dfail:0   fail:0   skip:21  time:431s
fi-bdw-gvtdvm    total:285  pass:261  dwarn:0   dfail:0   fail:0   skip:24  time:442s
fi-blb-e6850     total:285  pass:220  dwarn:1   dfail:0   fail:0   skip:64  time:381s
fi-bsw-n3050     total:285  pass:239  dwarn:0   dfail:0   fail:0   skip:46  time:538s
fi-bwr-2160      total:285  pass:180  dwarn:0   dfail:0   fail:0   skip:105 time:298s
fi-bxt-dsi       total:285  pass:255  dwarn:0   dfail:0   fail:0   skip:30  time:514s
fi-bxt-j4205     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:514s
fi-byt-j1900     total:285  pass:250  dwarn:0   dfail:0   fail:0   skip:35  time:522s
fi-byt-n2820     total:285  pass:246  dwarn:0   dfail:0   fail:0   skip:39  time:509s
fi-cfl-8700k     total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:409s
fi-cfl-s3        total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:563s
fi-cfl-u         total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:510s
fi-cnl-y3        total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:586s
fi-elk-e7500     total:285  pass:225  dwarn:1   dfail:0   fail:0   skip:59  time:425s
fi-gdg-551       total:285  pass:176  dwarn:0   dfail:0   fail:1   skip:108 time:323s
fi-glk-1         total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:541s
fi-hsw-4770      total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:410s
fi-ilk-650       total:285  pass:225  dwarn:0   dfail:0   fail:0   skip:60  time:422s
fi-ivb-3520m     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:469s
fi-ivb-3770      total:285  pass:252  dwarn:0   dfail:0   fail:0   skip:33  time:436s
fi-kbl-7500u     total:285  pass:260  dwarn:1   dfail:0   fail:0   skip:24  time:478s
fi-kbl-7567u     total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:464s
fi-kbl-r         total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:511s
fi-pnv-d510      total:285  pass:219  dwarn:1   dfail:0   fail:0   skip:65  time:666s
fi-skl-6260u     total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:441s
fi-skl-6600u     total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:532s
fi-skl-6700k2    total:285  pass:261  dwarn:0   dfail:0   fail:0   skip:24  time:504s
fi-skl-6770hq    total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:510s
fi-skl-guc       total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:430s
fi-skl-gvtdvm    total:285  pass:262  dwarn:0   dfail:0   fail:0   skip:23  time:449s
fi-snb-2520m     total:285  pass:245  dwarn:0   dfail:0   fail:0   skip:40  time:584s
fi-snb-2600      total:285  pass:245  dwarn:0   dfail:0   fail:0   skip:40  time:406s
Blacklisted hosts:
fi-cnl-psr       total:285  pass:256  dwarn:3   dfail:0   fail:0   skip:26  time:533s
fi-glk-j4005     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:482s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1204/issues.html
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [RFC i-g-t] intel-gpu-top: Rewrite the tool to be safe to use
  2018-03-28 18:29 ` [Intel-gfx] " Tvrtko Ursulin
@ 2018-03-29  8:20   ` Petri Latvala
  -1 siblings, 0 replies; 57+ messages in thread
From: Petri Latvala @ 2018-03-29  8:20 UTC (permalink / raw)
  To: Eero Tamminen; +Cc: Intel-gfx, igt-dev

Eero, can you give this a try and provide some comments?



-- 
Petri Latvala



On 03/28/2018 09:29 PM, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>
> intel-gpu-top is a dangerous tool which can hang machines due to unsafe mmio
> register access. This patch rewrites it to use only PMU.
>
> Only overall command streamer busyness and GPU global data such as power
> and frequencies are included in this new version.
>
> For access to more GPU functional unit level data, an OA metric based tool
> like gpu-top should be used instead.
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
> Cc: Petri Latvala <petri.latvala@intel.com>
> ---
>   tools/Makefile.am     |   2 +
>   tools/intel_gpu_top.c | 982 +++++++++++++++++++++-----------------------------
>   tools/meson.build     |   6 +-
>   3 files changed, 413 insertions(+), 577 deletions(-)
>
> diff --git a/tools/Makefile.am b/tools/Makefile.am
> index 09b6dbcc3ece..a0b016ddd7ff 100644
> --- a/tools/Makefile.am
> +++ b/tools/Makefile.am
> @@ -28,6 +28,8 @@ intel_aubdump_la_LDFLAGS = -module -avoid-version -no-undefined
>   intel_aubdump_la_SOURCES = aubdump.c
>   intel_aubdump_la_LIBADD = $(top_builddir)/lib/libintel_tools.la -ldl
>   
> +intel_gpu_top_LDADD = $(top_builddir)/lib/libigt_perf.la
> +
>   bin_SCRIPTS = intel_aubdump
>   CLEANFILES = $(bin_SCRIPTS)
>   
> diff --git a/tools/intel_gpu_top.c b/tools/intel_gpu_top.c
> index 098e6ce3ff86..4eef634eb436 100644
> --- a/tools/intel_gpu_top.c
> +++ b/tools/intel_gpu_top.c
> @@ -1,6 +1,5 @@
>   /*
> - * Copyright © 2007 Intel Corporation
> - * Copyright © 2011 Intel Corporation
> + * Copyright © 2018 Intel Corporation
>    *
>    * Permission is hereby granted, free of charge, to any person obtaining a
>    * copy of this software and associated documentation files (the "Software"),
> @@ -18,701 +17,532 @@
>    * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
>    * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
>    * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> - * DEALINGS IN THE SOFTWARE.
> - *
> - * Authors:
> - *    Eric Anholt <eric@anholt.net>
> - *    Eugeni Dodonov <eugeni.dodonov@intel.com>
> - *
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
>    */
>   
> -#include "config.h"
> -
> -#include <inttypes.h>
> -#include <unistd.h>
> -#include <stdlib.h>
>   #include <stdio.h>
> -#include <err.h>
> -#include <sys/ioctl.h>
> -#include <sys/time.h>
> -#include <sys/wait.h>
> +#include <sys/types.h>
> +#include <dirent.h>
> +#include <stdint.h>
> +#include <assert.h>
>   #include <string.h>
> -#ifdef HAVE_TERMIOS_H
> -#include <termios.h>
> -#endif
> -#include "intel_io.h"
> -#include "instdone.h"
> -#include "intel_reg.h"
> -#include "intel_chipset.h"
> -#include "drmtest.h"
> -
> -#define  FORCEWAKE	    0xA18C
> -#define  FORCEWAKE_ACK	    0x130090
> -
> -#define SAMPLES_PER_SEC             10000
> -#define SAMPLES_TO_PERCENT_RATIO    (SAMPLES_PER_SEC / 100)
> -
> -#define MAX_NUM_TOP_BITS            100
> -
> -#define HAS_STATS_REGS(devid)		IS_965(devid)
> -
> -struct top_bit {
> -	struct instdone_bit *bit;
> -	int count;
> -} top_bits[MAX_NUM_TOP_BITS];
> -struct top_bit *top_bits_sorted[MAX_NUM_TOP_BITS];
> -
> -static uint32_t instdone, instdone1;
> -
> -static const char *bars[] = {
> -	" ",
> -	"▏",
> -	"▎",
> -	"▍",
> -	"▌",
> -	"▋",
> -	"▊",
> -	"▉",
> -	"█"
> -};
> +#include <ctype.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <sys/stat.h>
> +#include <fcntl.h>
> +#include <inttypes.h>
> +#include <sys/ioctl.h>
> +#include <errno.h>
> +#include <math.h>
> +#include <locale.h>
> +
> +#include "igt_perf.h"
>   
> -enum stats_counts {
> -	IA_VERTICES,
> -	IA_PRIMITIVES,
> -	VS_INVOCATION,
> -	GS_INVOCATION,
> -	GS_PRIMITIVES,
> -	CL_INVOCATION,
> -	CL_PRIMITIVES,
> -	PS_INVOCATION,
> -	PS_DEPTH,
> -	STATS_COUNT
> +struct pmu_pair {
> +	uint64_t cur;
> +	uint64_t prev;
>   };
>   
> -const uint32_t stats_regs[STATS_COUNT] = {
> -	IA_VERTICES_COUNT_QW,
> -	IA_PRIMITIVES_COUNT_QW,
> -	VS_INVOCATION_COUNT_QW,
> -	GS_INVOCATION_COUNT_QW,
> -	GS_PRIMITIVES_COUNT_QW,
> -	CL_INVOCATION_COUNT_QW,
> -	CL_PRIMITIVES_COUNT_QW,
> -	PS_INVOCATION_COUNT_QW,
> -	PS_DEPTH_COUNT_QW,
> +struct pmu_counter {
> +	uint64_t config;
> +	unsigned int idx;
> +	struct pmu_pair val;
>   };
>   
> -const char *stats_reg_names[STATS_COUNT] = {
> -	"vert fetch",
> -	"prim fetch",
> -	"VS invocations",
> -	"GS invocations",
> -	"GS prims",
> -	"CL invocations",
> -	"CL prims",
> -	"PS invocations",
> -	"PS depth pass",
> +struct engine {
> +	const char *name;
> +	struct pmu_counter busy;
> +	struct pmu_counter wait;
> +	struct pmu_counter sema;
>   };
>   
> -uint64_t stats[STATS_COUNT];
> -uint64_t last_stats[STATS_COUNT];
> +struct engines {
> +	unsigned int num_engines;
> +	unsigned int num_counters;
> +	DIR *root;
> +	int fd;
> +	struct pmu_pair ts;
>   
> -static unsigned long
> -gettime(void)
> -{
> -    struct timeval t;
> -    gettimeofday(&t, NULL);
> -    return (t.tv_usec + (t.tv_sec * 1000000));
> -}
> +	int rapl_fd;
> +	double rapl_scale;
>   
> -static int
> -top_bits_sort(const void *a, const void *b)
> +	struct pmu_counter freq_req;
> +	struct pmu_counter freq_act;
> +	struct pmu_counter irq;
> +	struct pmu_counter rc6;
> +	struct pmu_counter rapl;
> +
> +	struct engine engine;
> +};
> +
> +static uint64_t
> +get_pmu_config(int dirfd, const char *name, const char *counter)
>   {
> -	struct top_bit * const *bit_a = a;
> -	struct top_bit * const *bit_b = b;
> -	int a_count = (*bit_a)->count;
> -	int b_count = (*bit_b)->count;
> +	char buf[128], *p;
> +	int fd, ret;
>   
> -	if (a_count < b_count)
> -		return 1;
> -	else if (a_count == b_count)
> -		return 0;
> -	else
> +	ret = snprintf(buf, sizeof(buf), "%s-%s", name, counter);
> +	if (ret < 0 || ret == sizeof(buf))
>   		return -1;
> -}
>   
> -static void
> -update_idle_bit(struct top_bit *top_bit)
> -{
> -	uint32_t reg_val;
> +	fd = openat(dirfd, buf, O_RDONLY);
> +	if (fd < 0)
> +		return -1;
>   
> -	if (top_bit->bit->reg == INSTDONE_1)
> -		reg_val = instdone1;
> -	else
> -		reg_val = instdone;
> +	ret = read(fd, buf, sizeof(buf));
> +	close(fd);
> +	if (ret <= 0)
> +		return -1;
>   
> -	if ((reg_val & top_bit->bit->bit) == 0)
> -		top_bit->count++;
> -}
> +	p = index(buf, '0');
> +	if (!p)
> +		return -1;
>   
> -static void
> -print_clock(const char *name, int clock) {
> -	if (clock == -1)
> -		printf("%s clock: unknown", name);
> -	else
> -		printf("%s clock: %d Mhz", name, clock);
> +	return strtoul(p, NULL, 0);
>   }
>   
> -static int
> -print_clock_info(struct pci_device *pci_dev)
> +#define engine_ptr(engines, n) \
> +	((struct engine *)((unsigned char *)(&engines->engine) + (n) * sizeof(struct engine)))
> +
> +static struct engines *discover_engines(void)
>   {
> -	uint32_t devid = pci_dev->device_id;
> -	uint16_t gcfgc;
> +	const char *sysfs_root = "/sys/devices/i915/events";
> +	struct engines *engines;
> +	struct dirent *dent;
> +	int ret = 0;
> +	DIR *d;
>   
> -	if (IS_GM45(devid)) {
> -		int core_clock = -1;
> +	engines = malloc(sizeof(struct engines));
> +	if (!engines)
> +		return NULL;
>   
> -		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +	memset(engines, 0, sizeof(*engines));
>   
> -		switch (gcfgc & 0xf) {
> -		case 8:
> -			core_clock = 266;
> -			break;
> -		case 9:
> -			core_clock = 320;
> -			break;
> -		case 11:
> -			core_clock = 400;
> -			break;
> -		case 13:
> -			core_clock = 533;
> -			break;
> -		}
> -		print_clock("core", core_clock);
> -	} else if (IS_965(devid) && IS_MOBILE(devid)) {
> -		int render_clock = -1, sampler_clock = -1;
> +	engines->num_engines = 0;
>   
> -		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +	d = opendir(sysfs_root);
> +	if (!d)
> +		return NULL;
>   
> -		switch (gcfgc & 0xf) {
> -		case 2:
> -			render_clock = 250; sampler_clock = 267;
> -			break;
> -		case 3:
> -			render_clock = 320; sampler_clock = 333;
> -			break;
> -		case 4:
> -			render_clock = 400; sampler_clock = 444;
> -			break;
> -		case 5:
> -			render_clock = 500; sampler_clock = 533;
> +	while ((dent = readdir(d)) != NULL) {
> +		const char *endswith = "-busy";
> +		const unsigned int endlen = strlen(endswith);
> +		struct engine *engine =
> +				engine_ptr(engines, engines->num_engines);
> +		char buf[256];
> +
> +		if (dent->d_type != DT_REG)
> +			continue;
> +
> +		if (strlen(dent->d_name) >= sizeof(buf)) {
> +			ret = -1;
>   			break;
>   		}
>   
> -		print_clock("render", render_clock);
> -		printf("  ");
> -		print_clock("sampler", sampler_clock);
> -	} else if (IS_945(devid) && IS_MOBILE(devid)) {
> -		int render_clock = -1, display_clock = -1;
> +		strcpy(buf, dent->d_name);
>   
> -		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +		/* xxxN-busy */
> +		if (strlen(buf) < (endlen + 4))
> +			continue;
> +		if (strcmp(&buf[strlen(buf) - endlen], endswith))
> +			continue;
>   
> -		switch (gcfgc & 0x7) {
> -		case 0:
> -			render_clock = 166;
> -			break;
> -		case 1:
> -			render_clock = 200;
> -			break;
> -		case 3:
> -			render_clock = 250;
> -			break;
> -		case 5:
> -			render_clock = 400;
> +		memset(engine, 0, sizeof(*engine));
> +
> +		buf[strlen(buf) - endlen] = 0;
> +		engine->name = strdup(buf);
> +		if (!engine->name) {
> +			ret = -1;
>   			break;
>   		}
>   
> -		switch (gcfgc & 0x70) {
> -		case 0:
> -			display_clock = 200;
> +		engine->busy.config = get_pmu_config(dirfd(d), engine->name,
> +						     "busy");
> +		if (engine->busy.config == -1) {
> +			ret = -1;
>   			break;
> -		case 4:
> -			display_clock = 320;
> +		}
> +
> +		engines->num_engines++;
> +		engines = realloc(engines, sizeof(struct engines) +
> +				  engines->num_engines * sizeof(struct engine));
> +		if (!engines) {
> +			ret = -ENOMEM;
>   			break;
>   		}
> -		if (gcfgc & (1 << 7))
> -		    display_clock = 133;
> +	}
>   
> -		print_clock("render", render_clock);
> -		printf("  ");
> -		print_clock("display", display_clock);
> -	} else if (IS_915(devid) && IS_MOBILE(devid)) {
> -		int render_clock = -1, display_clock = -1;
> +	if (ret)
> +		free(engines);
> +	else
> +		engines->root = d;
>   
> -		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +	return ret == 0 ? engines : NULL;
> +}
>   
> -		switch (gcfgc & 0x7) {
> -		case 0:
> -			render_clock = 160;
> -			break;
> -		case 1:
> -			render_clock = 190;
> -			break;
> -		case 4:
> -			render_clock = 333;
> -			break;
> -		}
> -		if (gcfgc & (1 << 13))
> -		    render_clock = 133;
> +static int
> +filename_to_buf(const char *filename, char *buf, unsigned int bufsize)
> +{
> +	int fd;
> +	ssize_t ret;
>   
> -		switch (gcfgc & 0x70) {
> -		case 0:
> -			display_clock = 190;
> -			break;
> -		case 4:
> -			display_clock = 333;
> -			break;
> -		}
> -		if (gcfgc & (1 << 7))
> -		    display_clock = 133;
> +	fd = open(filename, O_RDONLY);
> +	if (fd < 0)
> +		return -1;
>   
> -		print_clock("render", render_clock);
> -		printf("  ");
> -		print_clock("display", display_clock);
> -	}
> +	ret = read(fd, buf, bufsize - 1);
> +	close(fd);
> +	if (ret < 1)
> +		return -1;
>   
> +	buf[ret] = '\0';
>   
> -	printf("\n");
> -	return -1;
> +	return 0;
>   }
>   
> -#define STATS_LEN (20)
> -#define PERCENTAGE_BAR_END	(79 - STATS_LEN)
> +static uint64_t filename_to_u64(const char *filename, int base)
> +{
> +	char buf[64], *b;
>   
> -static void
> -print_percentage_bar(float percent, int cur_line_len)
> +	if (filename_to_buf(filename, buf, sizeof(buf)))
> +		return 0;
> +
> +	/*
> +	 * Handle both single integer and key=value formats by skipping
> +	 * leading non-digits.
> +	 */
> +	b = buf;
> +	while (*b && !isdigit(*b))
> +		b++;
> +
> +	return strtoull(b, NULL, base);
> +}
> +
> +static uint64_t rapl_type_id(void)
>   {
> -	int bar_avail_len = (PERCENTAGE_BAR_END - cur_line_len - 1) * 8;
> -	int bar_len = bar_avail_len * (percent + .5) / 100.0;
> -	int i;
> +	return filename_to_u64("/sys/devices/power/type", 10);
> +}
>   
> -	for (i = bar_len; i >= 8; i -= 8) {
> -		printf("%s", bars[8]);
> -		cur_line_len++;
> -	}
> -	if (i) {
> -		printf("%s", bars[i]);
> -		cur_line_len++;
> -	}
> +static uint64_t rapl_gpu_power(void)
> +{
> +	return filename_to_u64("/sys/devices/power/events/energy-gpu", 0);
> +}
> +
> +static double filename_to_double(const char *filename)
> +{
> +	char *oldlocale;
> +	char buf[80];
> +	double v;
> +
> +	if (filename_to_buf(filename, buf, sizeof(buf)))
> +		return 0;
>   
> -	/* NB: We can't use a field width with utf8 so we manually
> -	* guarantee a field with of 45 chars for any bar. */
> -	printf("%*s", PERCENTAGE_BAR_END - cur_line_len, "");
> +	oldlocale = setlocale(LC_ALL, "C");
> +	v = strtod(buf, NULL);
> +	setlocale(LC_ALL, oldlocale);
> +
> +	return v;
>   }
>   
> -struct ring {
> -	const char *name;
> -	uint32_t mmio;
> -	int head, tail, size;
> -	uint64_t full;
> -	int idle;
> -};
> +static double rapl_gpu_power_scale(void)
> +{
> +	return filename_to_double("/sys/devices/power/events/energy-gpu.scale");
> +}
>   
> -static uint32_t ring_read(struct ring *ring, uint32_t reg)
> +#define __open_pmu(engines, pmu, idx) \
> +({ \
> +	int fd__; \
> +\
> +	fd__ = perf_i915_open_group((pmu)->config, (engines)->fd); \
> +	if (fd__ >= 0) { \
> +		if ((engines)->fd == -1) \
> +			(engines)->fd = fd__; \
> +		(pmu)->idx = (idx)++; \
> +		(engines)->num_counters++; \
> +	} \
> +\
> +	fd__; \
> +})
> +
> +static int pmu_init(struct engines *engines)
>   {
> -	return INREG(ring->mmio + reg);
> +	unsigned int idx = 0;
> +	unsigned int i;
> +	int fd;
> +
> +	engines->fd = -1;
> +	engines->num_counters = 0;
> +
> +	engines->freq_req.config = I915_PMU_REQUESTED_FREQUENCY;
> +	fd = __open_pmu(engines, &engines->freq_req, idx);
> +	if (fd < 0)
> +		return -1;
> +
> +	engines->freq_act.config = I915_PMU_ACTUAL_FREQUENCY;
> +	fd = __open_pmu(engines, &engines->freq_act, idx);
> +	if (fd < 0)
> +		return -1;
> +
> +	engines->irq.config = I915_PMU_INTERRUPTS;
> +	fd = __open_pmu(engines, &engines->irq, idx);
> +	if (fd < 0)
> +		return -1;
> +
> +	engines->rc6.config = I915_PMU_RC6_RESIDENCY;
> +	fd = __open_pmu(engines, &engines->rc6, idx);
> +	if (fd < 0)
> +		return -1;
> +
> +	for (i = 0; i < engines->num_engines; i++) {
> +		struct engine *engine = engine_ptr(engines, i);
> +		struct {
> +			struct pmu_counter *pmu;
> +			const char *counter;
> +		} *cnt, counters[] = {
> +			{ .pmu = &engine->busy, .counter = "busy" },
> +			{ .pmu = &engine->wait, .counter = "wait" },
> +			{ .pmu = &engine->sema, .counter = "sema" },
> +			{ .pmu = NULL, .counter = NULL },
> +		};
> +
> +		for (cnt = counters; cnt->pmu; cnt++) {
> +			if (!cnt->pmu->config)
> +				cnt->pmu->config =
> +					get_pmu_config(dirfd(engines->root),
> +						       engine->name,
> +						       cnt->counter);
> +			fd = __open_pmu(engines, cnt->pmu, idx);
> +			if (fd < 0)
> +				return -1;
> +		}
> +	}
> +
> +	engines->rapl_scale = rapl_gpu_power_scale();
> +	if (engines->rapl_scale != NAN)
> +		engines->rapl_scale *= 1e3; /* from nano to micro */
> +	engines->rapl.config = rapl_gpu_power();
> +	engines->rapl_fd = igt_perf_open(rapl_type_id(), engines->rapl.config);
> +	if (engines->rapl_fd < 0)
> +		return -1;
> +
> +	return 0;
>   }
>   
> -static void ring_init(struct ring *ring)
> +static uint64_t pmu_read_multi(int fd, unsigned int num, uint64_t *val)
>   {
> -	ring->size = (((ring_read(ring, RING_LEN) & RING_NR_PAGES) >> 12) + 1) * 4096;
> +	uint64_t buf[2 + num];
> +	unsigned int i;
> +
> +	assert(read(fd, buf, sizeof(buf)) == sizeof(buf));
> +
> +	for (i = 0; i < num; i++)
> +		val[i] = buf[2 + i];
> +
> +	return buf[1];
>   }
>   
> -static void ring_reset(struct ring *ring)
> +static double pmu_calc(struct pmu_pair *p, double d, double t, double s)
>   {
> -	ring->idle = ring->full = 0;
> +	double pct;
> +
> +	pct = p->cur - p->prev;
> +	pct /= d;
> +	pct /= t;
> +	pct *= s;
> +
> +	if (s == 100.0 && pct > 100.0)
> +		pct = 100.0;
> +
> +	return pct;
>   }
>   
> -static void ring_sample(struct ring *ring)
> +static uint64_t __pmu_read_single(int fd, uint64_t *ts)
>   {
> -	int full;
> +	uint64_t data[2];
>   
> -	if (!ring->size)
> -		return;
> +	assert(read(fd, data, sizeof(data)) == sizeof(data));
>   
> -	ring->head = ring_read(ring, RING_HEAD) & HEAD_ADDR;
> -	ring->tail = ring_read(ring, RING_TAIL) & TAIL_ADDR;
> +	if (ts)
> +		*ts = data[1];
>   
> -	if (ring->tail == ring->head)
> -		ring->idle++;
> +	return data[0];
> +}
> +
> +static uint64_t pmu_read_single(int fd)
> +{
> +	return __pmu_read_single(fd, NULL);
> +}
>   
> -	full = ring->tail - ring->head;
> -	if (full < 0)
> -		full += ring->size;
> -	ring->full += full;
> +static void __update_sample(struct pmu_counter *counter, uint64_t val)
> +{
> +	counter->val.prev = counter->val.cur;
> +	counter->val.cur = val;
>   }
>   
> -static void ring_print_header(FILE *out, struct ring *ring)
> +static void update_sample(struct pmu_counter *counter, uint64_t *val)
>   {
> -    fprintf(out, "%.6s%%\tops\t",
> -            ring->name
> -          );
> +	__update_sample(counter, val[counter->idx]);
>   }
>   
> -static void ring_print(struct ring *ring, unsigned long samples_per_sec)
> +static void pmu_sample(struct engines *engines)
>   {
> -	int percent_busy, len;
> +	const int num_val = engines->num_counters;
> +	uint64_t val[num_val];
> +	unsigned int i;
> +
> +	engines->ts.prev = engines->ts.cur;
> +	engines->ts.cur = pmu_read_multi(engines->fd, num_val, val);
> +
> +	__update_sample(&engines->rapl, pmu_read_single(engines->rapl_fd));
>   
> -	if (!ring->size)
> -		return;
> +	update_sample(&engines->freq_req, val);
> +	update_sample(&engines->freq_act, val);
> +	update_sample(&engines->irq, val);
> +	update_sample(&engines->rc6, val);
>   
> -	percent_busy = 100 - 100 * ring->idle / samples_per_sec;
> +	for (i = 0; i < engines->num_engines; i++) {
> +		struct engine *engine = engine_ptr(engines, i);
>   
> -	len = printf("%25s busy: %3d%%: ", ring->name, percent_busy);
> -	print_percentage_bar (percent_busy, len);
> -	printf("%24s space: %d/%d\n",
> -		   ring->name,
> -		   (int)(ring->full / samples_per_sec),
> -		   ring->size);
> +		update_sample(&engine->busy, val);
> +		update_sample(&engine->sema, val);
> +		update_sample(&engine->wait, val);
> +	}
>   }
>   
> -static void ring_log(struct ring *ring, unsigned long samples_per_sec,
> -		FILE *output)
> +static const char *bars[] = { " ", "▏", "▎", "▍", "▌", "▋", "▊", "▉", "█" };
> +
> +static void
> +print_percentage_bar(double percent, int max_len)
>   {
> -	if (ring->size)
> -		fprintf(output, "%3d\t%d\t",
> -			(int)(100 - 100 * ring->idle / samples_per_sec),
> -			(int)(ring->full / samples_per_sec));
> -	else
> -		fprintf(output, "-1\t-1\t");
> +	int bar_len = percent * (8 * (max_len - 2)) / 100.0;
> +	int i;
> +
> +	putchar('|');
> +
> +	for (i = bar_len; i >= 8; i -= 8)
> +		printf("%s", bars[8]);
> +	if (i)
> +		printf("%s", bars[i]);
> +
> +	for (i = 0; i < (max_len - 2 - (bar_len + 7) / 8); i++)
> +		putchar(' ');
> +
> +	putchar('|');
>   }
>   
> +#define DEFAULT_PERIOD_MS (1000)
> +
>   static void
>   usage(const char *appname)
>   {
>   	printf("intel_gpu_top - Display a top-like summary of Intel GPU usage\n"
> -			"\n"
> -			"usage: %s [parameters]\n"
> -			"\n"
> -			"The following parameters apply:\n"
> -			"[-s <samples>]       samples per seconds (default %d)\n"
> -			"[-e <command>]       command to profile\n"
> -			"[-o <file>]          output statistics to file. If file is '-',"
> -			"                     run in batch mode and output statistics to stdio only \n"
> -			"[-h]                 show this help screen\n"
> -			"\n",
> -			appname,
> -			SAMPLES_PER_SEC
> -		  );
> -	return;
> +		"\n"
> +		"Usage: %s [parameters]\n"
> +		"\n"
> +		"\tThe following parameters are optional:\n"
> +		"\t[-s <samples>]       refresh period in ms (default %ums)\n"
> +		"\t[-h]                 show this help text\n"
> +		"\n",
> +		appname, DEFAULT_PERIOD_MS);
>   }
>   
>   int main(int argc, char **argv)
>   {
> -	uint32_t devid;
> -	struct pci_device *pci_dev;
> -	struct ring render_ring = {
> -		.name = "render",
> -		.mmio = 0x2030,
> -	}, bsd_ring = {
> -		.name = "bitstream",
> -		.mmio = 0x4030,
> -	}, bsd6_ring = {
> -		.name = "bitstream",
> -		.mmio = 0x12030,
> -	}, blt_ring = {
> -		.name = "blitter",
> -		.mmio = 0x22030,
> -	};
> -	int i, ch;
> -	int samples_per_sec = SAMPLES_PER_SEC;
> -	FILE *output = NULL;
> -	double elapsed_time=0;
> -	int print_headers=1;
> -	pid_t child_pid=-1;
> -	int child_stat;
> -	char *cmd=NULL;
> -	int interactive=1;
> -
> -	/* Parse options? */
> -	while ((ch = getopt(argc, argv, "s:o:e:h")) != -1) {
> +	unsigned int period_us = DEFAULT_PERIOD_MS * 1000;
> +	int con_w = -1, con_h = -1;
> +	struct engines *engines;
> +	struct winsize ws;
> +	unsigned int i;
> +	int ret, ch;
> +
> +	/* Parse options */
> +	while ((ch = getopt(argc, argv, "s:h")) != -1) {
>   		switch (ch) {
> -		case 'e': cmd = strdup(optarg);
> -			break;
> -		case 's': samples_per_sec = atoi(optarg);
> -			if (samples_per_sec < 100) {
> -				fprintf(stderr, "Error: samples per second must be >= 100\n");
> -				exit(1);
> -			}
> -			break;
> -		case 'o':
> -			if (!strcmp(optarg, "-")) {
> -				/* Running in non-interactive mode */
> -				interactive = 0;
> -				output = stdout;
> -			}
> -			else
> -				output = fopen(optarg, "w");
> -			if (!output)
> -			{
> -				perror("fopen");
> -				exit(1);
> -			}
> +		case 's':
> +			period_us = atoi(optarg) * 1000;
>   			break;
>   		case 'h':
>   			usage(argv[0]);
>   			exit(0);
> -			break;
>   		default:
> -			fprintf(stderr, "Invalid flag %c!\n", (char)optopt);
> +			fprintf(stderr, "Invalid option %c!\n", (char)optopt);
>   			usage(argv[0]);
>   			exit(1);
> -			break;
>   		}
>   	}
>   
> -	pci_dev = intel_get_pci_device();
> -	devid = pci_dev->device_id;
> -	intel_mmio_use_pci_bar(pci_dev);
> -	init_instdone_definitions(devid);
> +	/* Get terminal size. */
> +	if (ioctl(0, TIOCGWINSZ, &ws) != -1) {
> +		con_w = ws.ws_col;
> +		con_h = ws.ws_row;
> +	}
>   
> -	/* Do we have a command to run? */
> -	if (cmd != NULL) {
> -		if (output) {
> -			fprintf(output, "# Profiling: %s\n", cmd);
> -			fflush(output);
> -		}
> -		child_pid = fork();
> -		if (child_pid < 0) {
> -			perror("fork");
> -			exit(1);
> -		}
> -		else if (child_pid == 0) {
> -			int res;
> -			res = system(cmd);
> -			if (res < 0)
> -				perror("running command");
> -			if (output) {
> -				fflush(output);
> -				fprintf(output, "# %s exited with status %d\n", cmd, res);
> -				fflush(output);
> -			}
> -			free(cmd);
> -			exit(0);
> -		} else {
> -			free(cmd);
> -		}
> +	engines = discover_engines();
> +	if (!engines) {
> +		fprintf(stderr, "Failed to detect engines!\n");
> +		return 1;
>   	}
>   
> -	for (i = 0; i < num_instdone_bits; i++) {
> -		top_bits[i].bit = &instdone_bits[i];
> -		top_bits[i].count = 0;
> -		top_bits_sorted[i] = &top_bits[i];
> +	ret = pmu_init(engines);
> +	if (ret) {
> +		fprintf(stderr, "Failed to initialize PMU!\n");
> +		return 1;
>   	}
>   
> -	/* Grab access to the registers */
> -	intel_register_access_init(pci_dev, 0, -1);
> +	pmu_sample(engines);
>   
> -	ring_init(&render_ring);
> -	if (IS_GEN4(devid) || IS_GEN5(devid))
> -		ring_init(&bsd_ring);
> -	if (IS_GEN6(devid) || IS_GEN7(devid)) {
> -		ring_init(&bsd6_ring);
> -		ring_init(&blt_ring);
> -	}
> +	for (;;) {
> +		double t, freq[2], irq, rc6, power;
> +		int lines = 0;
>   
> -	/* Initialize GPU stats */
> -	if (HAS_STATS_REGS(devid)) {
> -		for (i = 0; i < STATS_COUNT; i++) {
> -			uint32_t stats_high, stats_low, stats_high_2;
> +		usleep(period_us);
>   
> -			do {
> -				stats_high = INREG(stats_regs[i] + 4);
> -				stats_low = INREG(stats_regs[i]);
> -				stats_high_2 = INREG(stats_regs[i] + 4);
> -			} while (stats_high != stats_high_2);
> +		pmu_sample(engines);
> +		t = (double)(engines->ts.cur - engines->ts.prev) / 1e9;
>   
> -			last_stats[i] = (uint64_t)stats_high << 32 |
> -				stats_low;
> -		}
> -	}
> +		printf("\033[H\033[J");
>   
> -	for (;;) {
> -		int j;
> -		unsigned long long t1, ti, tf, t2;
> -		unsigned long long def_sleep = 1000000 / samples_per_sec;
> -		unsigned long long last_samples_per_sec = samples_per_sec;
> -		unsigned short int max_lines;
> -		struct winsize ws;
> -		char clear_screen[] = {0x1b, '[', 'H',
> -				       0x1b, '[', 'J',
> -				       0x0};
> -		int percent;
> -		int len;
> -
> -		t1 = gettime();
> -
> -		ring_reset(&render_ring);
> -		ring_reset(&bsd_ring);
> -		ring_reset(&bsd6_ring);
> -		ring_reset(&blt_ring);
> -
> -		for (i = 0; i < samples_per_sec; i++) {
> -			long long interval;
> -			ti = gettime();
> -			if (IS_965(devid)) {
> -				instdone = INREG(INSTDONE_I965);
> -				instdone1 = INREG(INSTDONE_1);
> -			} else
> -				instdone = INREG(INSTDONE);
> -
> -			for (j = 0; j < num_instdone_bits; j++)
> -				update_idle_bit(&top_bits[j]);
> -
> -			ring_sample(&render_ring);
> -			ring_sample(&bsd_ring);
> -			ring_sample(&bsd6_ring);
> -			ring_sample(&blt_ring);
> -
> -			tf = gettime();
> -			if (tf - t1 >= 1000000) {
> -				/* We are out of sync, bail out */
> -				last_samples_per_sec = i+1;
> -				break;
> -			}
> -			interval = def_sleep - (tf - ti);
> -			if (interval > 0)
> -				usleep(interval);
> -		}
> +		freq[0] = pmu_calc(&engines->freq_req.val, 1.0, t, 1);
> +		freq[1] = pmu_calc(&engines->freq_act.val, 1.0, t, 1);
> +		irq = pmu_calc(&engines->irq.val, 1.0, t, 1);
> +		rc6 = pmu_calc(&engines->rc6.val, 1e9, t, 100);
> +		power = pmu_calc(&engines->rapl.val, 1.0, t,
> +				 engines->rapl_scale);
>   
> -		if (HAS_STATS_REGS(devid)) {
> -			for (i = 0; i < STATS_COUNT; i++) {
> -				uint32_t stats_high, stats_low, stats_high_2;
> +		printf("intel-gpu-top - %4.0f/%4.0f MHz;  %3.0f%% RC6; %6.0fmW; %8.0f irqs/s\n",
> +		       freq[0], freq[1], rc6, power, irq);
> +		lines++;
>   
> -				do {
> -					stats_high = INREG(stats_regs[i] + 4);
> -					stats_low = INREG(stats_regs[i]);
> -					stats_high_2 = INREG(stats_regs[i] + 4);
> -				} while (stats_high != stats_high_2);
> +		printf("\n");
> +		lines++;
>   
> -				stats[i] = (uint64_t)stats_high << 32 |
> -					stats_low;
> -			}
> -		}
> +		for (i = 0; i < engines->num_engines && lines < con_h; i++) {
> +			struct engine *engine = engine_ptr(engines, i);
> +			unsigned int max_w = con_w - 1;
> +			unsigned int len;
> +			double val[2];
> +			char buf[128];
>   
> -		qsort(top_bits_sorted, num_instdone_bits,
> -		      sizeof(struct top_bit *), top_bits_sort);
> -
> -		/* Limit the number of lines printed to the terminal height so the
> -		 * most important info (at the top) will stay on screen. */
> -		max_lines = -1;
> -		if (ioctl(0, TIOCGWINSZ, &ws) != -1)
> -			max_lines = ws.ws_row - 6; /* exclude header lines */
> -		if (max_lines >= num_instdone_bits)
> -			max_lines = num_instdone_bits;
> -
> -		t2 = gettime();
> -		elapsed_time += (t2 - t1) / 1000000.0;
> -
> -		if (interactive) {
> -			printf("%s", clear_screen);
> -			print_clock_info(pci_dev);
> -
> -			ring_print(&render_ring, last_samples_per_sec);
> -			ring_print(&bsd_ring, last_samples_per_sec);
> -			ring_print(&bsd6_ring, last_samples_per_sec);
> -			ring_print(&blt_ring, last_samples_per_sec);
> -
> -			printf("\n%30s  %s\n", "task", "percent busy");
> -			for (i = 0; i < max_lines; i++) {
> -				if (top_bits_sorted[i]->count > 0) {
> -					percent = (top_bits_sorted[i]->count * 100) /
> -						last_samples_per_sec;
> -					len = printf("%30s: %3d%%: ",
> -							 top_bits_sorted[i]->bit->name,
> -							 percent);
> -					print_percentage_bar (percent, len);
> -				} else {
> -					printf("%*s", PERCENTAGE_BAR_END, "");
> -				}
> -
> -				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
> -					printf("%13s: %llu (%lld/sec)",
> -						   stats_reg_names[i],
> -						   (long long)stats[i],
> -						   (long long)(stats[i] - last_stats[i]));
> -					last_stats[i] = stats[i];
> -				} else {
> -					if (!top_bits_sorted[i]->count)
> -						break;
> -				}
> -				printf("\n");
> -			}
> -		}
> -		if (output) {
> -			/* Print headers for columns at first run */
> -			if (print_headers) {
> -				fprintf(output, "# time\t");
> -				ring_print_header(output, &render_ring);
> -				ring_print_header(output, &bsd_ring);
> -				ring_print_header(output, &bsd6_ring);
> -				ring_print_header(output, &blt_ring);
> -				for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
> -					if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
> -						fprintf(output, "%.6s\t",
> -							   stats_reg_names[i]
> -							   );
> -					}
> -					if (!top_bits[i].count)
> -						continue;
> -				}
> -				fprintf(output, "\n");
> -				print_headers = 0;
> -			}
> -
> -			/* Print statistics */
> -			fprintf(output, "%.2f\t", elapsed_time);
> -			ring_log(&render_ring, last_samples_per_sec, output);
> -			ring_log(&bsd_ring, last_samples_per_sec, output);
> -			ring_log(&bsd6_ring, last_samples_per_sec, output);
> -			ring_log(&blt_ring, last_samples_per_sec, output);
> -
> -			for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
> -				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
> -					fprintf(output, "%"PRIu64"\t",
> -						   stats[i] - last_stats[i]);
> -					last_stats[i] = stats[i];
> -				}
> -					if (!top_bits[i].count)
> -						continue;
> -			}
> -			fprintf(output, "\n");
> -			fflush(output);
> -		}
> +			val[0] = pmu_calc(&engine->wait.val, 1e9, t, 100);
> +			val[1] = pmu_calc(&engine->sema.val, 1e9, t, 100);
> +			len = snprintf(buf, sizeof(buf),
> +				       "%6.2f%% wait, %6.2f%% sema",
> +				       val[0], val[1]);
>   
> -		for (i = 0; i < num_instdone_bits; i++) {
> -			top_bits_sorted[i]->count = 0;
> +			val[0] = pmu_calc(&engine->busy.val, 1e9, t, 100);
> +			len += printf("%8s %6.2f%% ",
> +				      engine->name, val[0]);
> +			print_percentage_bar(val[0], max_w - len);
>   
> -			if (i < STATS_COUNT)
> -				last_stats[i] = stats[i];
> -		}
> +			printf("%s\n", buf);
>   
> -		/* Check if child has gone */
> -		if (child_pid > 0) {
> -			int res;
> -			if ((res = waitpid(child_pid, &child_stat, WNOHANG)) == -1) {
> -				perror("waitpid");
> -				exit(1);
> -			}
> -			if (res == 0)
> -				continue;
> -			if (WIFEXITED(child_stat))
> -				break;
> +			lines++;
>   		}
> -	}
>   
> -	fclose(output);
> +		printf("\n");
> +	}
>   
> -	intel_register_access_fini();
>   	return 0;
>   }
> diff --git a/tools/meson.build b/tools/meson.build
> index bd2d313d5156..a918eeb0bef1 100644
> --- a/tools/meson.build
> +++ b/tools/meson.build
> @@ -23,7 +23,6 @@ tools_progs = [
>   	'intel_gpu_frequency',
>   	'intel_firmware_decode',
>   	'intel_gpu_time',
> -	'intel_gpu_top',
>   	'intel_gtt',
>   	'intel_guc_logger',
>   	'intel_infoframes',
> @@ -117,6 +116,11 @@ shared_library('intel_aubdump', 'aubdump.c',
>   	       name_prefix : '',
>   	       install : true)
>   
> +executable('intel_gpu_top', 'intel_gpu_top.c',
> +	   install : true,
> +	   install_rpath : rpathdir,
> +	   dependencies : tool_deps + [ lib_igt_perf ])
> +
>   conf_data = configuration_data()
>   conf_data.set('prefix', prefix)
>   conf_data.set('exec_prefix', '${prefix}')

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Intel-gfx] [RFC i-g-t] intel-gpu-top: Rewrite the tool to be safe to use
@ 2018-03-29  8:20   ` Petri Latvala
  0 siblings, 0 replies; 57+ messages in thread
From: Petri Latvala @ 2018-03-29  8:20 UTC (permalink / raw)
  To: Eero Tamminen; +Cc: Intel-gfx, igt-dev

Eero, can you give this a try and provide some comments?



-- 
Petri Latvala



On 03/28/2018 09:29 PM, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>
> intel-gpu-top is a dangerous tool which can hang machines due to unsafe mmio
> register access. This patch rewrites it to use only PMU.
>
> Only overall command streamer busyness and GPU global data such as power
> and frequencies are included in this new version.
>
> For access to more GPU functional unit level data, an OA metric based tool
> like gpu-top should be used instead.
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
> Cc: Petri Latvala <petri.latvala@intel.com>
> ---
>   tools/Makefile.am     |   2 +
>   tools/intel_gpu_top.c | 982 +++++++++++++++++++++-----------------------------
>   tools/meson.build     |   6 +-
>   3 files changed, 413 insertions(+), 577 deletions(-)
>
> diff --git a/tools/Makefile.am b/tools/Makefile.am
> index 09b6dbcc3ece..a0b016ddd7ff 100644
> --- a/tools/Makefile.am
> +++ b/tools/Makefile.am
> @@ -28,6 +28,8 @@ intel_aubdump_la_LDFLAGS = -module -avoid-version -no-undefined
>   intel_aubdump_la_SOURCES = aubdump.c
>   intel_aubdump_la_LIBADD = $(top_builddir)/lib/libintel_tools.la -ldl
>   
> +intel_gpu_top_LDADD = $(top_builddir)/lib/libigt_perf.la
> +
>   bin_SCRIPTS = intel_aubdump
>   CLEANFILES = $(bin_SCRIPTS)
>   
> diff --git a/tools/intel_gpu_top.c b/tools/intel_gpu_top.c
> index 098e6ce3ff86..4eef634eb436 100644
> --- a/tools/intel_gpu_top.c
> +++ b/tools/intel_gpu_top.c
> @@ -1,6 +1,5 @@
>   /*
> - * Copyright © 2007 Intel Corporation
> - * Copyright © 2011 Intel Corporation
> + * Copyright © 2018 Intel Corporation
>    *
>    * Permission is hereby granted, free of charge, to any person obtaining a
>    * copy of this software and associated documentation files (the "Software"),
> @@ -18,701 +17,532 @@
>    * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
>    * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
>    * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> - * DEALINGS IN THE SOFTWARE.
> - *
> - * Authors:
> - *    Eric Anholt <eric@anholt.net>
> - *    Eugeni Dodonov <eugeni.dodonov@intel.com>
> - *
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
>    */
>   
> -#include "config.h"
> -
> -#include <inttypes.h>
> -#include <unistd.h>
> -#include <stdlib.h>
>   #include <stdio.h>
> -#include <err.h>
> -#include <sys/ioctl.h>
> -#include <sys/time.h>
> -#include <sys/wait.h>
> +#include <sys/types.h>
> +#include <dirent.h>
> +#include <stdint.h>
> +#include <assert.h>
>   #include <string.h>
> -#ifdef HAVE_TERMIOS_H
> -#include <termios.h>
> -#endif
> -#include "intel_io.h"
> -#include "instdone.h"
> -#include "intel_reg.h"
> -#include "intel_chipset.h"
> -#include "drmtest.h"
> -
> -#define  FORCEWAKE	    0xA18C
> -#define  FORCEWAKE_ACK	    0x130090
> -
> -#define SAMPLES_PER_SEC             10000
> -#define SAMPLES_TO_PERCENT_RATIO    (SAMPLES_PER_SEC / 100)
> -
> -#define MAX_NUM_TOP_BITS            100
> -
> -#define HAS_STATS_REGS(devid)		IS_965(devid)
> -
> -struct top_bit {
> -	struct instdone_bit *bit;
> -	int count;
> -} top_bits[MAX_NUM_TOP_BITS];
> -struct top_bit *top_bits_sorted[MAX_NUM_TOP_BITS];
> -
> -static uint32_t instdone, instdone1;
> -
> -static const char *bars[] = {
> -	" ",
> -	"▏",
> -	"▎",
> -	"▍",
> -	"▌",
> -	"▋",
> -	"▊",
> -	"▉",
> -	"█"
> -};
> +#include <ctype.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <sys/stat.h>
> +#include <fcntl.h>
> +#include <inttypes.h>
> +#include <sys/ioctl.h>
> +#include <errno.h>
> +#include <math.h>
> +#include <locale.h>
> +
> +#include "igt_perf.h"
>   
> -enum stats_counts {
> -	IA_VERTICES,
> -	IA_PRIMITIVES,
> -	VS_INVOCATION,
> -	GS_INVOCATION,
> -	GS_PRIMITIVES,
> -	CL_INVOCATION,
> -	CL_PRIMITIVES,
> -	PS_INVOCATION,
> -	PS_DEPTH,
> -	STATS_COUNT
> +struct pmu_pair {
> +	uint64_t cur;
> +	uint64_t prev;
>   };
>   
> -const uint32_t stats_regs[STATS_COUNT] = {
> -	IA_VERTICES_COUNT_QW,
> -	IA_PRIMITIVES_COUNT_QW,
> -	VS_INVOCATION_COUNT_QW,
> -	GS_INVOCATION_COUNT_QW,
> -	GS_PRIMITIVES_COUNT_QW,
> -	CL_INVOCATION_COUNT_QW,
> -	CL_PRIMITIVES_COUNT_QW,
> -	PS_INVOCATION_COUNT_QW,
> -	PS_DEPTH_COUNT_QW,
> +struct pmu_counter {
> +	uint64_t config;
> +	unsigned int idx;
> +	struct pmu_pair val;
>   };
>   
> -const char *stats_reg_names[STATS_COUNT] = {
> -	"vert fetch",
> -	"prim fetch",
> -	"VS invocations",
> -	"GS invocations",
> -	"GS prims",
> -	"CL invocations",
> -	"CL prims",
> -	"PS invocations",
> -	"PS depth pass",
> +struct engine {
> +	const char *name;
> +	struct pmu_counter busy;
> +	struct pmu_counter wait;
> +	struct pmu_counter sema;
>   };
>   
> -uint64_t stats[STATS_COUNT];
> -uint64_t last_stats[STATS_COUNT];
> +struct engines {
> +	unsigned int num_engines;
> +	unsigned int num_counters;
> +	DIR *root;
> +	int fd;
> +	struct pmu_pair ts;
>   
> -static unsigned long
> -gettime(void)
> -{
> -    struct timeval t;
> -    gettimeofday(&t, NULL);
> -    return (t.tv_usec + (t.tv_sec * 1000000));
> -}
> +	int rapl_fd;
> +	double rapl_scale;
>   
> -static int
> -top_bits_sort(const void *a, const void *b)
> +	struct pmu_counter freq_req;
> +	struct pmu_counter freq_act;
> +	struct pmu_counter irq;
> +	struct pmu_counter rc6;
> +	struct pmu_counter rapl;
> +
> +	struct engine engine;
> +};
> +
> +static uint64_t
> +get_pmu_config(int dirfd, const char *name, const char *counter)
>   {
> -	struct top_bit * const *bit_a = a;
> -	struct top_bit * const *bit_b = b;
> -	int a_count = (*bit_a)->count;
> -	int b_count = (*bit_b)->count;
> +	char buf[128], *p;
> +	int fd, ret;
>   
> -	if (a_count < b_count)
> -		return 1;
> -	else if (a_count == b_count)
> -		return 0;
> -	else
> +	ret = snprintf(buf, sizeof(buf), "%s-%s", name, counter);
> +	if (ret < 0 || ret == sizeof(buf))
>   		return -1;
> -}
>   
> -static void
> -update_idle_bit(struct top_bit *top_bit)
> -{
> -	uint32_t reg_val;
> +	fd = openat(dirfd, buf, O_RDONLY);
> +	if (fd < 0)
> +		return -1;
>   
> -	if (top_bit->bit->reg == INSTDONE_1)
> -		reg_val = instdone1;
> -	else
> -		reg_val = instdone;
> +	ret = read(fd, buf, sizeof(buf));
> +	close(fd);
> +	if (ret <= 0)
> +		return -1;
>   
> -	if ((reg_val & top_bit->bit->bit) == 0)
> -		top_bit->count++;
> -}
> +	p = index(buf, '0');
> +	if (!p)
> +		return -1;
>   
> -static void
> -print_clock(const char *name, int clock) {
> -	if (clock == -1)
> -		printf("%s clock: unknown", name);
> -	else
> -		printf("%s clock: %d Mhz", name, clock);
> +	return strtoul(p, NULL, 0);
>   }
>   
> -static int
> -print_clock_info(struct pci_device *pci_dev)
> +#define engine_ptr(engines, n) \
> +	((struct engine *)((unsigned char *)(&engines->engine) + (n) * sizeof(struct engine)))
> +
> +static struct engines *discover_engines(void)
>   {
> -	uint32_t devid = pci_dev->device_id;
> -	uint16_t gcfgc;
> +	const char *sysfs_root = "/sys/devices/i915/events";
> +	struct engines *engines;
> +	struct dirent *dent;
> +	int ret = 0;
> +	DIR *d;
>   
> -	if (IS_GM45(devid)) {
> -		int core_clock = -1;
> +	engines = malloc(sizeof(struct engines));
> +	if (!engines)
> +		return NULL;
>   
> -		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +	memset(engines, 0, sizeof(*engines));
>   
> -		switch (gcfgc & 0xf) {
> -		case 8:
> -			core_clock = 266;
> -			break;
> -		case 9:
> -			core_clock = 320;
> -			break;
> -		case 11:
> -			core_clock = 400;
> -			break;
> -		case 13:
> -			core_clock = 533;
> -			break;
> -		}
> -		print_clock("core", core_clock);
> -	} else if (IS_965(devid) && IS_MOBILE(devid)) {
> -		int render_clock = -1, sampler_clock = -1;
> +	engines->num_engines = 0;
>   
> -		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +	d = opendir(sysfs_root);
> +	if (!d)
> +		return NULL;
>   
> -		switch (gcfgc & 0xf) {
> -		case 2:
> -			render_clock = 250; sampler_clock = 267;
> -			break;
> -		case 3:
> -			render_clock = 320; sampler_clock = 333;
> -			break;
> -		case 4:
> -			render_clock = 400; sampler_clock = 444;
> -			break;
> -		case 5:
> -			render_clock = 500; sampler_clock = 533;
> +	while ((dent = readdir(d)) != NULL) {
> +		const char *endswith = "-busy";
> +		const unsigned int endlen = strlen(endswith);
> +		struct engine *engine =
> +				engine_ptr(engines, engines->num_engines);
> +		char buf[256];
> +
> +		if (dent->d_type != DT_REG)
> +			continue;
> +
> +		if (strlen(dent->d_name) >= sizeof(buf)) {
> +			ret = -1;
>   			break;
>   		}
>   
> -		print_clock("render", render_clock);
> -		printf("  ");
> -		print_clock("sampler", sampler_clock);
> -	} else if (IS_945(devid) && IS_MOBILE(devid)) {
> -		int render_clock = -1, display_clock = -1;
> +		strcpy(buf, dent->d_name);
>   
> -		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +		/* xxxN-busy */
> +		if (strlen(buf) < (endlen + 4))
> +			continue;
> +		if (strcmp(&buf[strlen(buf) - endlen], endswith))
> +			continue;
>   
> -		switch (gcfgc & 0x7) {
> -		case 0:
> -			render_clock = 166;
> -			break;
> -		case 1:
> -			render_clock = 200;
> -			break;
> -		case 3:
> -			render_clock = 250;
> -			break;
> -		case 5:
> -			render_clock = 400;
> +		memset(engine, 0, sizeof(*engine));
> +
> +		buf[strlen(buf) - endlen] = 0;
> +		engine->name = strdup(buf);
> +		if (!engine->name) {
> +			ret = -1;
>   			break;
>   		}
>   
> -		switch (gcfgc & 0x70) {
> -		case 0:
> -			display_clock = 200;
> +		engine->busy.config = get_pmu_config(dirfd(d), engine->name,
> +						     "busy");
> +		if (engine->busy.config == -1) {
> +			ret = -1;
>   			break;
> -		case 4:
> -			display_clock = 320;
> +		}
> +
> +		engines->num_engines++;
> +		engines = realloc(engines, sizeof(struct engines) +
> +				  engines->num_engines * sizeof(struct engine));
> +		if (!engines) {
> +			ret = -ENOMEM;
>   			break;
>   		}
> -		if (gcfgc & (1 << 7))
> -		    display_clock = 133;
> +	}
>   
> -		print_clock("render", render_clock);
> -		printf("  ");
> -		print_clock("display", display_clock);
> -	} else if (IS_915(devid) && IS_MOBILE(devid)) {
> -		int render_clock = -1, display_clock = -1;
> +	if (ret)
> +		free(engines);
> +	else
> +		engines->root = d;
>   
> -		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +	return ret == 0 ? engines : NULL;
> +}
>   
> -		switch (gcfgc & 0x7) {
> -		case 0:
> -			render_clock = 160;
> -			break;
> -		case 1:
> -			render_clock = 190;
> -			break;
> -		case 4:
> -			render_clock = 333;
> -			break;
> -		}
> -		if (gcfgc & (1 << 13))
> -		    render_clock = 133;
> +static int
> +filename_to_buf(const char *filename, char *buf, unsigned int bufsize)
> +{
> +	int fd;
> +	ssize_t ret;
>   
> -		switch (gcfgc & 0x70) {
> -		case 0:
> -			display_clock = 190;
> -			break;
> -		case 4:
> -			display_clock = 333;
> -			break;
> -		}
> -		if (gcfgc & (1 << 7))
> -		    display_clock = 133;
> +	fd = open(filename, O_RDONLY);
> +	if (fd < 0)
> +		return -1;
>   
> -		print_clock("render", render_clock);
> -		printf("  ");
> -		print_clock("display", display_clock);
> -	}
> +	ret = read(fd, buf, bufsize - 1);
> +	close(fd);
> +	if (ret < 1)
> +		return -1;
>   
> +	buf[ret] = '\0';
>   
> -	printf("\n");
> -	return -1;
> +	return 0;
>   }
>   
> -#define STATS_LEN (20)
> -#define PERCENTAGE_BAR_END	(79 - STATS_LEN)
> +static uint64_t filename_to_u64(const char *filename, int base)
> +{
> +	char buf[64], *b;
>   
> -static void
> -print_percentage_bar(float percent, int cur_line_len)
> +	if (filename_to_buf(filename, buf, sizeof(buf)))
> +		return 0;
> +
> +	/*
> +	 * Handle both single integer and key=value formats by skipping
> +	 * leading non-digits.
> +	 */
> +	b = buf;
> +	while (*b && !isdigit(*b))
> +		b++;
> +
> +	return strtoull(b, NULL, base);
> +}
> +
> +static uint64_t rapl_type_id(void)
>   {
> -	int bar_avail_len = (PERCENTAGE_BAR_END - cur_line_len - 1) * 8;
> -	int bar_len = bar_avail_len * (percent + .5) / 100.0;
> -	int i;
> +	return filename_to_u64("/sys/devices/power/type", 10);
> +}
>   
> -	for (i = bar_len; i >= 8; i -= 8) {
> -		printf("%s", bars[8]);
> -		cur_line_len++;
> -	}
> -	if (i) {
> -		printf("%s", bars[i]);
> -		cur_line_len++;
> -	}
> +static uint64_t rapl_gpu_power(void)
> +{
> +	return filename_to_u64("/sys/devices/power/events/energy-gpu", 0);
> +}
> +
> +static double filename_to_double(const char *filename)
> +{
> +	char *oldlocale;
> +	char buf[80];
> +	double v;
> +
> +	if (filename_to_buf(filename, buf, sizeof(buf)))
> +		return 0;
>   
> -	/* NB: We can't use a field width with utf8 so we manually
> -	* guarantee a field with of 45 chars for any bar. */
> -	printf("%*s", PERCENTAGE_BAR_END - cur_line_len, "");
> +	oldlocale = setlocale(LC_ALL, "C");
> +	v = strtod(buf, NULL);
> +	setlocale(LC_ALL, oldlocale);
> +
> +	return v;
>   }
>   
> -struct ring {
> -	const char *name;
> -	uint32_t mmio;
> -	int head, tail, size;
> -	uint64_t full;
> -	int idle;
> -};
> +static double rapl_gpu_power_scale(void)
> +{
> +	return filename_to_double("/sys/devices/power/events/energy-gpu.scale");
> +}
>   
> -static uint32_t ring_read(struct ring *ring, uint32_t reg)
> +#define __open_pmu(engines, pmu, idx) \
> +({ \
> +	int fd__; \
> +\
> +	fd__ = perf_i915_open_group((pmu)->config, (engines)->fd); \
> +	if (fd__ >= 0) { \
> +		if ((engines)->fd == -1) \
> +			(engines)->fd = fd__; \
> +		(pmu)->idx = (idx)++; \
> +		(engines)->num_counters++; \
> +	} \
> +\
> +	fd__; \
> +})
> +
> +static int pmu_init(struct engines *engines)
>   {
> -	return INREG(ring->mmio + reg);
> +	unsigned int idx = 0;
> +	unsigned int i;
> +	int fd;
> +
> +	engines->fd = -1;
> +	engines->num_counters = 0;
> +
> +	engines->freq_req.config = I915_PMU_REQUESTED_FREQUENCY;
> +	fd = __open_pmu(engines, &engines->freq_req, idx);
> +	if (fd < 0)
> +		return -1;
> +
> +	engines->freq_act.config = I915_PMU_ACTUAL_FREQUENCY;
> +	fd = __open_pmu(engines, &engines->freq_act, idx);
> +	if (fd < 0)
> +		return -1;
> +
> +	engines->irq.config = I915_PMU_INTERRUPTS;
> +	fd = __open_pmu(engines, &engines->irq, idx);
> +	if (fd < 0)
> +		return -1;
> +
> +	engines->rc6.config = I915_PMU_RC6_RESIDENCY;
> +	fd = __open_pmu(engines, &engines->rc6, idx);
> +	if (fd < 0)
> +		return -1;
> +
> +	for (i = 0; i < engines->num_engines; i++) {
> +		struct engine *engine = engine_ptr(engines, i);
> +		struct {
> +			struct pmu_counter *pmu;
> +			const char *counter;
> +		} *cnt, counters[] = {
> +			{ .pmu = &engine->busy, .counter = "busy" },
> +			{ .pmu = &engine->wait, .counter = "wait" },
> +			{ .pmu = &engine->sema, .counter = "sema" },
> +			{ .pmu = NULL, .counter = NULL },
> +		};
> +
> +		for (cnt = counters; cnt->pmu; cnt++) {
> +			if (!cnt->pmu->config)
> +				cnt->pmu->config =
> +					get_pmu_config(dirfd(engines->root),
> +						       engine->name,
> +						       cnt->counter);
> +			fd = __open_pmu(engines, cnt->pmu, idx);
> +			if (fd < 0)
> +				return -1;
> +		}
> +	}
> +
> +	engines->rapl_scale = rapl_gpu_power_scale();
> +	if (engines->rapl_scale != NAN)
> +		engines->rapl_scale *= 1e3; /* from nano to micro */
> +	engines->rapl.config = rapl_gpu_power();
> +	engines->rapl_fd = igt_perf_open(rapl_type_id(), engines->rapl.config);
> +	if (engines->rapl_fd < 0)
> +		return -1;
> +
> +	return 0;
>   }
>   
> -static void ring_init(struct ring *ring)
> +static uint64_t pmu_read_multi(int fd, unsigned int num, uint64_t *val)
>   {
> -	ring->size = (((ring_read(ring, RING_LEN) & RING_NR_PAGES) >> 12) + 1) * 4096;
> +	uint64_t buf[2 + num];
> +	unsigned int i;
> +
> +	assert(read(fd, buf, sizeof(buf)) == sizeof(buf));
> +
> +	for (i = 0; i < num; i++)
> +		val[i] = buf[2 + i];
> +
> +	return buf[1];
>   }
>   
> -static void ring_reset(struct ring *ring)
> +static double pmu_calc(struct pmu_pair *p, double d, double t, double s)
>   {
> -	ring->idle = ring->full = 0;
> +	double pct;
> +
> +	pct = p->cur - p->prev;
> +	pct /= d;
> +	pct /= t;
> +	pct *= s;
> +
> +	if (s == 100.0 && pct > 100.0)
> +		pct = 100.0;
> +
> +	return pct;
>   }
>   
> -static void ring_sample(struct ring *ring)
> +static uint64_t __pmu_read_single(int fd, uint64_t *ts)
>   {
> -	int full;
> +	uint64_t data[2];
>   
> -	if (!ring->size)
> -		return;
> +	assert(read(fd, data, sizeof(data)) == sizeof(data));
>   
> -	ring->head = ring_read(ring, RING_HEAD) & HEAD_ADDR;
> -	ring->tail = ring_read(ring, RING_TAIL) & TAIL_ADDR;
> +	if (ts)
> +		*ts = data[1];
>   
> -	if (ring->tail == ring->head)
> -		ring->idle++;
> +	return data[0];
> +}
> +
> +static uint64_t pmu_read_single(int fd)
> +{
> +	return __pmu_read_single(fd, NULL);
> +}
>   
> -	full = ring->tail - ring->head;
> -	if (full < 0)
> -		full += ring->size;
> -	ring->full += full;
> +static void __update_sample(struct pmu_counter *counter, uint64_t val)
> +{
> +	counter->val.prev = counter->val.cur;
> +	counter->val.cur = val;
>   }
>   
> -static void ring_print_header(FILE *out, struct ring *ring)
> +static void update_sample(struct pmu_counter *counter, uint64_t *val)
>   {
> -    fprintf(out, "%.6s%%\tops\t",
> -            ring->name
> -          );
> +	__update_sample(counter, val[counter->idx]);
>   }
>   
> -static void ring_print(struct ring *ring, unsigned long samples_per_sec)
> +static void pmu_sample(struct engines *engines)
>   {
> -	int percent_busy, len;
> +	const int num_val = engines->num_counters;
> +	uint64_t val[num_val];
> +	unsigned int i;
> +
> +	engines->ts.prev = engines->ts.cur;
> +	engines->ts.cur = pmu_read_multi(engines->fd, num_val, val);
> +
> +	__update_sample(&engines->rapl, pmu_read_single(engines->rapl_fd));
>   
> -	if (!ring->size)
> -		return;
> +	update_sample(&engines->freq_req, val);
> +	update_sample(&engines->freq_act, val);
> +	update_sample(&engines->irq, val);
> +	update_sample(&engines->rc6, val);
>   
> -	percent_busy = 100 - 100 * ring->idle / samples_per_sec;
> +	for (i = 0; i < engines->num_engines; i++) {
> +		struct engine *engine = engine_ptr(engines, i);
>   
> -	len = printf("%25s busy: %3d%%: ", ring->name, percent_busy);
> -	print_percentage_bar (percent_busy, len);
> -	printf("%24s space: %d/%d\n",
> -		   ring->name,
> -		   (int)(ring->full / samples_per_sec),
> -		   ring->size);
> +		update_sample(&engine->busy, val);
> +		update_sample(&engine->sema, val);
> +		update_sample(&engine->wait, val);
> +	}
>   }
>   
> -static void ring_log(struct ring *ring, unsigned long samples_per_sec,
> -		FILE *output)
> +static const char *bars[] = { " ", "▏", "▎", "▍", "▌", "▋", "▊", "▉", "█" };
> +
> +static void
> +print_percentage_bar(double percent, int max_len)
>   {
> -	if (ring->size)
> -		fprintf(output, "%3d\t%d\t",
> -			(int)(100 - 100 * ring->idle / samples_per_sec),
> -			(int)(ring->full / samples_per_sec));
> -	else
> -		fprintf(output, "-1\t-1\t");
> +	int bar_len = percent * (8 * (max_len - 2)) / 100.0;
> +	int i;
> +
> +	putchar('|');
> +
> +	for (i = bar_len; i >= 8; i -= 8)
> +		printf("%s", bars[8]);
> +	if (i)
> +		printf("%s", bars[i]);
> +
> +	for (i = 0; i < (max_len - 2 - (bar_len + 7) / 8); i++)
> +		putchar(' ');
> +
> +	putchar('|');
>   }
>   
> +#define DEFAULT_PERIOD_MS (1000)
> +
>   static void
>   usage(const char *appname)
>   {
>   	printf("intel_gpu_top - Display a top-like summary of Intel GPU usage\n"
> -			"\n"
> -			"usage: %s [parameters]\n"
> -			"\n"
> -			"The following parameters apply:\n"
> -			"[-s <samples>]       samples per seconds (default %d)\n"
> -			"[-e <command>]       command to profile\n"
> -			"[-o <file>]          output statistics to file. If file is '-',"
> -			"                     run in batch mode and output statistics to stdio only \n"
> -			"[-h]                 show this help screen\n"
> -			"\n",
> -			appname,
> -			SAMPLES_PER_SEC
> -		  );
> -	return;
> +		"\n"
> +		"Usage: %s [parameters]\n"
> +		"\n"
> +		"\tThe following parameters are optional:\n"
> +		"\t[-s <samples>]       refresh period in ms (default %ums)\n"
> +		"\t[-h]                 show this help text\n"
> +		"\n",
> +		appname, DEFAULT_PERIOD_MS);
>   }
>   
>   int main(int argc, char **argv)
>   {
> -	uint32_t devid;
> -	struct pci_device *pci_dev;
> -	struct ring render_ring = {
> -		.name = "render",
> -		.mmio = 0x2030,
> -	}, bsd_ring = {
> -		.name = "bitstream",
> -		.mmio = 0x4030,
> -	}, bsd6_ring = {
> -		.name = "bitstream",
> -		.mmio = 0x12030,
> -	}, blt_ring = {
> -		.name = "blitter",
> -		.mmio = 0x22030,
> -	};
> -	int i, ch;
> -	int samples_per_sec = SAMPLES_PER_SEC;
> -	FILE *output = NULL;
> -	double elapsed_time=0;
> -	int print_headers=1;
> -	pid_t child_pid=-1;
> -	int child_stat;
> -	char *cmd=NULL;
> -	int interactive=1;
> -
> -	/* Parse options? */
> -	while ((ch = getopt(argc, argv, "s:o:e:h")) != -1) {
> +	unsigned int period_us = DEFAULT_PERIOD_MS * 1000;
> +	int con_w = -1, con_h = -1;
> +	struct engines *engines;
> +	struct winsize ws;
> +	unsigned int i;
> +	int ret, ch;
> +
> +	/* Parse options */
> +	while ((ch = getopt(argc, argv, "s:h")) != -1) {
>   		switch (ch) {
> -		case 'e': cmd = strdup(optarg);
> -			break;
> -		case 's': samples_per_sec = atoi(optarg);
> -			if (samples_per_sec < 100) {
> -				fprintf(stderr, "Error: samples per second must be >= 100\n");
> -				exit(1);
> -			}
> -			break;
> -		case 'o':
> -			if (!strcmp(optarg, "-")) {
> -				/* Running in non-interactive mode */
> -				interactive = 0;
> -				output = stdout;
> -			}
> -			else
> -				output = fopen(optarg, "w");
> -			if (!output)
> -			{
> -				perror("fopen");
> -				exit(1);
> -			}
> +		case 's':
> +			period_us = atoi(optarg) * 1000;
>   			break;
>   		case 'h':
>   			usage(argv[0]);
>   			exit(0);
> -			break;
>   		default:
> -			fprintf(stderr, "Invalid flag %c!\n", (char)optopt);
> +			fprintf(stderr, "Invalid option %c!\n", (char)optopt);
>   			usage(argv[0]);
>   			exit(1);
> -			break;
>   		}
>   	}
>   
> -	pci_dev = intel_get_pci_device();
> -	devid = pci_dev->device_id;
> -	intel_mmio_use_pci_bar(pci_dev);
> -	init_instdone_definitions(devid);
> +	/* Get terminal size. */
> +	if (ioctl(0, TIOCGWINSZ, &ws) != -1) {
> +		con_w = ws.ws_col;
> +		con_h = ws.ws_row;
> +	}
>   
> -	/* Do we have a command to run? */
> -	if (cmd != NULL) {
> -		if (output) {
> -			fprintf(output, "# Profiling: %s\n", cmd);
> -			fflush(output);
> -		}
> -		child_pid = fork();
> -		if (child_pid < 0) {
> -			perror("fork");
> -			exit(1);
> -		}
> -		else if (child_pid == 0) {
> -			int res;
> -			res = system(cmd);
> -			if (res < 0)
> -				perror("running command");
> -			if (output) {
> -				fflush(output);
> -				fprintf(output, "# %s exited with status %d\n", cmd, res);
> -				fflush(output);
> -			}
> -			free(cmd);
> -			exit(0);
> -		} else {
> -			free(cmd);
> -		}
> +	engines = discover_engines();
> +	if (!engines) {
> +		fprintf(stderr, "Failed to detect engines!\n");
> +		return 1;
>   	}
>   
> -	for (i = 0; i < num_instdone_bits; i++) {
> -		top_bits[i].bit = &instdone_bits[i];
> -		top_bits[i].count = 0;
> -		top_bits_sorted[i] = &top_bits[i];
> +	ret = pmu_init(engines);
> +	if (ret) {
> +		fprintf(stderr, "Failed to initialize PMU!\n");
> +		return 1;
>   	}
>   
> -	/* Grab access to the registers */
> -	intel_register_access_init(pci_dev, 0, -1);
> +	pmu_sample(engines);
>   
> -	ring_init(&render_ring);
> -	if (IS_GEN4(devid) || IS_GEN5(devid))
> -		ring_init(&bsd_ring);
> -	if (IS_GEN6(devid) || IS_GEN7(devid)) {
> -		ring_init(&bsd6_ring);
> -		ring_init(&blt_ring);
> -	}
> +	for (;;) {
> +		double t, freq[2], irq, rc6, power;
> +		int lines = 0;
>   
> -	/* Initialize GPU stats */
> -	if (HAS_STATS_REGS(devid)) {
> -		for (i = 0; i < STATS_COUNT; i++) {
> -			uint32_t stats_high, stats_low, stats_high_2;
> +		usleep(period_us);
>   
> -			do {
> -				stats_high = INREG(stats_regs[i] + 4);
> -				stats_low = INREG(stats_regs[i]);
> -				stats_high_2 = INREG(stats_regs[i] + 4);
> -			} while (stats_high != stats_high_2);
> +		pmu_sample(engines);
> +		t = (double)(engines->ts.cur - engines->ts.prev) / 1e9;
>   
> -			last_stats[i] = (uint64_t)stats_high << 32 |
> -				stats_low;
> -		}
> -	}
> +		printf("\033[H\033[J");
>   
> -	for (;;) {
> -		int j;
> -		unsigned long long t1, ti, tf, t2;
> -		unsigned long long def_sleep = 1000000 / samples_per_sec;
> -		unsigned long long last_samples_per_sec = samples_per_sec;
> -		unsigned short int max_lines;
> -		struct winsize ws;
> -		char clear_screen[] = {0x1b, '[', 'H',
> -				       0x1b, '[', 'J',
> -				       0x0};
> -		int percent;
> -		int len;
> -
> -		t1 = gettime();
> -
> -		ring_reset(&render_ring);
> -		ring_reset(&bsd_ring);
> -		ring_reset(&bsd6_ring);
> -		ring_reset(&blt_ring);
> -
> -		for (i = 0; i < samples_per_sec; i++) {
> -			long long interval;
> -			ti = gettime();
> -			if (IS_965(devid)) {
> -				instdone = INREG(INSTDONE_I965);
> -				instdone1 = INREG(INSTDONE_1);
> -			} else
> -				instdone = INREG(INSTDONE);
> -
> -			for (j = 0; j < num_instdone_bits; j++)
> -				update_idle_bit(&top_bits[j]);
> -
> -			ring_sample(&render_ring);
> -			ring_sample(&bsd_ring);
> -			ring_sample(&bsd6_ring);
> -			ring_sample(&blt_ring);
> -
> -			tf = gettime();
> -			if (tf - t1 >= 1000000) {
> -				/* We are out of sync, bail out */
> -				last_samples_per_sec = i+1;
> -				break;
> -			}
> -			interval = def_sleep - (tf - ti);
> -			if (interval > 0)
> -				usleep(interval);
> -		}
> +		freq[0] = pmu_calc(&engines->freq_req.val, 1.0, t, 1);
> +		freq[1] = pmu_calc(&engines->freq_act.val, 1.0, t, 1);
> +		irq = pmu_calc(&engines->irq.val, 1.0, t, 1);
> +		rc6 = pmu_calc(&engines->rc6.val, 1e9, t, 100);
> +		power = pmu_calc(&engines->rapl.val, 1.0, t,
> +				 engines->rapl_scale);
>   
> -		if (HAS_STATS_REGS(devid)) {
> -			for (i = 0; i < STATS_COUNT; i++) {
> -				uint32_t stats_high, stats_low, stats_high_2;
> +		printf("intel-gpu-top - %4.0f/%4.0f MHz;  %3.0f%% RC6; %6.0fmW; %8.0f irqs/s\n",
> +		       freq[0], freq[1], rc6, power, irq);
> +		lines++;
>   
> -				do {
> -					stats_high = INREG(stats_regs[i] + 4);
> -					stats_low = INREG(stats_regs[i]);
> -					stats_high_2 = INREG(stats_regs[i] + 4);
> -				} while (stats_high != stats_high_2);
> +		printf("\n");
> +		lines++;
>   
> -				stats[i] = (uint64_t)stats_high << 32 |
> -					stats_low;
> -			}
> -		}
> +		for (i = 0; i < engines->num_engines && lines < con_h; i++) {
> +			struct engine *engine = engine_ptr(engines, i);
> +			unsigned int max_w = con_w - 1;
> +			unsigned int len;
> +			double val[2];
> +			char buf[128];
>   
> -		qsort(top_bits_sorted, num_instdone_bits,
> -		      sizeof(struct top_bit *), top_bits_sort);
> -
> -		/* Limit the number of lines printed to the terminal height so the
> -		 * most important info (at the top) will stay on screen. */
> -		max_lines = -1;
> -		if (ioctl(0, TIOCGWINSZ, &ws) != -1)
> -			max_lines = ws.ws_row - 6; /* exclude header lines */
> -		if (max_lines >= num_instdone_bits)
> -			max_lines = num_instdone_bits;
> -
> -		t2 = gettime();
> -		elapsed_time += (t2 - t1) / 1000000.0;
> -
> -		if (interactive) {
> -			printf("%s", clear_screen);
> -			print_clock_info(pci_dev);
> -
> -			ring_print(&render_ring, last_samples_per_sec);
> -			ring_print(&bsd_ring, last_samples_per_sec);
> -			ring_print(&bsd6_ring, last_samples_per_sec);
> -			ring_print(&blt_ring, last_samples_per_sec);
> -
> -			printf("\n%30s  %s\n", "task", "percent busy");
> -			for (i = 0; i < max_lines; i++) {
> -				if (top_bits_sorted[i]->count > 0) {
> -					percent = (top_bits_sorted[i]->count * 100) /
> -						last_samples_per_sec;
> -					len = printf("%30s: %3d%%: ",
> -							 top_bits_sorted[i]->bit->name,
> -							 percent);
> -					print_percentage_bar (percent, len);
> -				} else {
> -					printf("%*s", PERCENTAGE_BAR_END, "");
> -				}
> -
> -				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
> -					printf("%13s: %llu (%lld/sec)",
> -						   stats_reg_names[i],
> -						   (long long)stats[i],
> -						   (long long)(stats[i] - last_stats[i]));
> -					last_stats[i] = stats[i];
> -				} else {
> -					if (!top_bits_sorted[i]->count)
> -						break;
> -				}
> -				printf("\n");
> -			}
> -		}
> -		if (output) {
> -			/* Print headers for columns at first run */
> -			if (print_headers) {
> -				fprintf(output, "# time\t");
> -				ring_print_header(output, &render_ring);
> -				ring_print_header(output, &bsd_ring);
> -				ring_print_header(output, &bsd6_ring);
> -				ring_print_header(output, &blt_ring);
> -				for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
> -					if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
> -						fprintf(output, "%.6s\t",
> -							   stats_reg_names[i]
> -							   );
> -					}
> -					if (!top_bits[i].count)
> -						continue;
> -				}
> -				fprintf(output, "\n");
> -				print_headers = 0;
> -			}
> -
> -			/* Print statistics */
> -			fprintf(output, "%.2f\t", elapsed_time);
> -			ring_log(&render_ring, last_samples_per_sec, output);
> -			ring_log(&bsd_ring, last_samples_per_sec, output);
> -			ring_log(&bsd6_ring, last_samples_per_sec, output);
> -			ring_log(&blt_ring, last_samples_per_sec, output);
> -
> -			for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
> -				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
> -					fprintf(output, "%"PRIu64"\t",
> -						   stats[i] - last_stats[i]);
> -					last_stats[i] = stats[i];
> -				}
> -					if (!top_bits[i].count)
> -						continue;
> -			}
> -			fprintf(output, "\n");
> -			fflush(output);
> -		}
> +			val[0] = pmu_calc(&engine->wait.val, 1e9, t, 100);
> +			val[1] = pmu_calc(&engine->sema.val, 1e9, t, 100);
> +			len = snprintf(buf, sizeof(buf),
> +				       "%6.2f%% wait, %6.2f%% sema",
> +				       val[0], val[1]);
>   
> -		for (i = 0; i < num_instdone_bits; i++) {
> -			top_bits_sorted[i]->count = 0;
> +			val[0] = pmu_calc(&engine->busy.val, 1e9, t, 100);
> +			len += printf("%8s %6.2f%% ",
> +				      engine->name, val[0]);
> +			print_percentage_bar(val[0], max_w - len);
>   
> -			if (i < STATS_COUNT)
> -				last_stats[i] = stats[i];
> -		}
> +			printf("%s\n", buf);
>   
> -		/* Check if child has gone */
> -		if (child_pid > 0) {
> -			int res;
> -			if ((res = waitpid(child_pid, &child_stat, WNOHANG)) == -1) {
> -				perror("waitpid");
> -				exit(1);
> -			}
> -			if (res == 0)
> -				continue;
> -			if (WIFEXITED(child_stat))
> -				break;
> +			lines++;
>   		}
> -	}
>   
> -	fclose(output);
> +		printf("\n");
> +	}
>   
> -	intel_register_access_fini();
>   	return 0;
>   }
> diff --git a/tools/meson.build b/tools/meson.build
> index bd2d313d5156..a918eeb0bef1 100644
> --- a/tools/meson.build
> +++ b/tools/meson.build
> @@ -23,7 +23,6 @@ tools_progs = [
>   	'intel_gpu_frequency',
>   	'intel_firmware_decode',
>   	'intel_gpu_time',
> -	'intel_gpu_top',
>   	'intel_gtt',
>   	'intel_guc_logger',
>   	'intel_infoframes',
> @@ -117,6 +116,11 @@ shared_library('intel_aubdump', 'aubdump.c',
>   	       name_prefix : '',
>   	       install : true)
>   
> +executable('intel_gpu_top', 'intel_gpu_top.c',
> +	   install : true,
> +	   install_rpath : rpathdir,
> +	   dependencies : tool_deps + [ lib_igt_perf ])
> +
>   conf_data = configuration_data()
>   conf_data.set('prefix', prefix)
>   conf_data.set('exec_prefix', '${prefix}')

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [igt-dev] ✓ Fi.CI.IGT: success for intel-gpu-top: Rewrite the tool to be safe to use
  2018-03-28 18:29 ` [Intel-gfx] " Tvrtko Ursulin
                   ` (5 preceding siblings ...)
  (?)
@ 2018-03-29  9:46 ` Patchwork
  -1 siblings, 0 replies; 57+ messages in thread
From: Patchwork @ 2018-03-29  9:46 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev

== Series Details ==

Series: intel-gpu-top: Rewrite the tool to be safe to use
URL   : https://patchwork.freedesktop.org/series/40826/
State : success

== Summary ==

---- Known issues:

Test gem_eio:
        Subgroup in-flight-suspend:
                pass       -> INCOMPLETE (shard-snb) fdo#103375
Test kms_cursor_legacy:
        Subgroup flip-vs-cursor-atomic:
                pass       -> FAIL       (shard-hsw) fdo#102670
Test kms_flip:
        Subgroup 2x-flip-vs-expired-vblank:
                fail       -> PASS       (shard-hsw) fdo#102887
        Subgroup dpms-vs-vblank-race:
                fail       -> PASS       (shard-hsw) fdo#103060
        Subgroup plain-flip-ts-check:
                fail       -> PASS       (shard-hsw) fdo#100368
Test kms_plane:
        Subgroup plane-panning-bottom-right-suspend-pipe-a-planes:
                pass       -> INCOMPLETE (shard-hsw) fdo#103540 +1
Test kms_rotation_crc:
        Subgroup sprite-rotation-180:
                pass       -> FAIL       (shard-snb) fdo#103925
Test kms_vblank:
        Subgroup pipe-a-accuracy-idle:
                fail       -> PASS       (shard-hsw) fdo#102583
Test pm_rps:
        Subgroup reset:
                pass       -> FAIL       (shard-hsw) fdo#102250

fdo#103375 https://bugs.freedesktop.org/show_bug.cgi?id=103375
fdo#102670 https://bugs.freedesktop.org/show_bug.cgi?id=102670
fdo#102887 https://bugs.freedesktop.org/show_bug.cgi?id=102887
fdo#103060 https://bugs.freedesktop.org/show_bug.cgi?id=103060
fdo#100368 https://bugs.freedesktop.org/show_bug.cgi?id=100368
fdo#103540 https://bugs.freedesktop.org/show_bug.cgi?id=103540
fdo#103925 https://bugs.freedesktop.org/show_bug.cgi?id=103925
fdo#102583 https://bugs.freedesktop.org/show_bug.cgi?id=102583
fdo#102250 https://bugs.freedesktop.org/show_bug.cgi?id=102250

shard-apl        total:3495 pass:1831 dwarn:1   dfail:0   fail:7   skip:1655 time:13007s
shard-hsw        total:3405 pass:1744 dwarn:1   dfail:0   fail:3   skip:1655 time:11323s
shard-snb        total:3476 pass:1368 dwarn:1   dfail:0   fail:3   skip:2103 time:6670s
Blacklisted hosts:
shard-kbl        total:3495 pass:1954 dwarn:1   dfail:0   fail:7   skip:1533 time:9465s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1204/shards.html
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [PATCH i-g-t v2] intel-gpu-top: Rewrite the tool to be safe to use
  2018-03-28 18:29 ` [Intel-gfx] " Tvrtko Ursulin
@ 2018-03-29 10:33   ` Tvrtko Ursulin
  -1 siblings, 0 replies; 57+ messages in thread
From: Tvrtko Ursulin @ 2018-03-29 10:33 UTC (permalink / raw)
  To: igt-dev; +Cc: Rinat Ibragimov, Eero Tamminen, Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

intel-gpu-top is a dangerous tool which can hang machines due to unsafe mmio
register access. This patch rewrites it to use only PMU.

Only overall command streamer busyness and GPU global data such as power
and frequencies are included in this new version.

For access to more GPU functional unit level data, an OA metric based tool
like gpu-top should be used instead.

v2:
 * Sort engines by class and instance.
 * Do not wait for one sampling period to display something on screen.
 * Move code out of the asserts. (Rinat Ibragimov)
 * Continuously adapt to terminal size. (Rinat Ibragimov)

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: Petri Latvala <petri.latvala@intel.com>
Cc: Eero Tamminen <eero.t.tamminen@intel.com>
Cc: Rinat Ibragimov <ibragimovrinat@mail.ru>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> # v1
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> # v0.5
---
 tools/Makefile.am     |    2 +
 tools/intel_gpu_top.c | 1009 +++++++++++++++++++++----------------------------
 tools/meson.build     |    6 +-
 3 files changed, 441 insertions(+), 576 deletions(-)

diff --git a/tools/Makefile.am b/tools/Makefile.am
index 09b6dbcc3ece..a0b016ddd7ff 100644
--- a/tools/Makefile.am
+++ b/tools/Makefile.am
@@ -28,6 +28,8 @@ intel_aubdump_la_LDFLAGS = -module -avoid-version -no-undefined
 intel_aubdump_la_SOURCES = aubdump.c
 intel_aubdump_la_LIBADD = $(top_builddir)/lib/libintel_tools.la -ldl
 
+intel_gpu_top_LDADD = $(top_builddir)/lib/libigt_perf.la
+
 bin_SCRIPTS = intel_aubdump
 CLEANFILES = $(bin_SCRIPTS)
 
diff --git a/tools/intel_gpu_top.c b/tools/intel_gpu_top.c
index 098e6ce3ff86..94091d97c4a3 100644
--- a/tools/intel_gpu_top.c
+++ b/tools/intel_gpu_top.c
@@ -1,6 +1,5 @@
 /*
- * Copyright © 2007 Intel Corporation
- * Copyright © 2011 Intel Corporation
+ * Copyright © 2018 Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -18,701 +17,561 @@
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *    Eric Anholt <eric@anholt.net>
- *    Eugeni Dodonov <eugeni.dodonov@intel.com>
- *
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
  */
 
-#include "config.h"
-
-#include <inttypes.h>
-#include <unistd.h>
-#include <stdlib.h>
 #include <stdio.h>
-#include <err.h>
-#include <sys/ioctl.h>
-#include <sys/time.h>
-#include <sys/wait.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <stdint.h>
+#include <assert.h>
 #include <string.h>
-#ifdef HAVE_TERMIOS_H
-#include <termios.h>
-#endif
-#include "intel_io.h"
-#include "instdone.h"
-#include "intel_reg.h"
-#include "intel_chipset.h"
-#include "drmtest.h"
-
-#define  FORCEWAKE	    0xA18C
-#define  FORCEWAKE_ACK	    0x130090
-
-#define SAMPLES_PER_SEC             10000
-#define SAMPLES_TO_PERCENT_RATIO    (SAMPLES_PER_SEC / 100)
-
-#define MAX_NUM_TOP_BITS            100
-
-#define HAS_STATS_REGS(devid)		IS_965(devid)
-
-struct top_bit {
-	struct instdone_bit *bit;
-	int count;
-} top_bits[MAX_NUM_TOP_BITS];
-struct top_bit *top_bits_sorted[MAX_NUM_TOP_BITS];
-
-static uint32_t instdone, instdone1;
-
-static const char *bars[] = {
-	" ",
-	"▏",
-	"▎",
-	"▍",
-	"▌",
-	"▋",
-	"▊",
-	"▉",
-	"█"
-};
+#include <ctype.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <sys/ioctl.h>
+#include <errno.h>
+#include <math.h>
+#include <locale.h>
+
+#include "igt_perf.h"
 
-enum stats_counts {
-	IA_VERTICES,
-	IA_PRIMITIVES,
-	VS_INVOCATION,
-	GS_INVOCATION,
-	GS_PRIMITIVES,
-	CL_INVOCATION,
-	CL_PRIMITIVES,
-	PS_INVOCATION,
-	PS_DEPTH,
-	STATS_COUNT
+struct pmu_pair {
+	uint64_t cur;
+	uint64_t prev;
 };
 
-const uint32_t stats_regs[STATS_COUNT] = {
-	IA_VERTICES_COUNT_QW,
-	IA_PRIMITIVES_COUNT_QW,
-	VS_INVOCATION_COUNT_QW,
-	GS_INVOCATION_COUNT_QW,
-	GS_PRIMITIVES_COUNT_QW,
-	CL_INVOCATION_COUNT_QW,
-	CL_PRIMITIVES_COUNT_QW,
-	PS_INVOCATION_COUNT_QW,
-	PS_DEPTH_COUNT_QW,
+struct pmu_counter {
+	uint64_t config;
+	unsigned int idx;
+	struct pmu_pair val;
 };
 
-const char *stats_reg_names[STATS_COUNT] = {
-	"vert fetch",
-	"prim fetch",
-	"VS invocations",
-	"GS invocations",
-	"GS prims",
-	"CL invocations",
-	"CL prims",
-	"PS invocations",
-	"PS depth pass",
+struct engine {
+	const char *name;
+	struct pmu_counter busy;
+	struct pmu_counter wait;
+	struct pmu_counter sema;
 };
 
-uint64_t stats[STATS_COUNT];
-uint64_t last_stats[STATS_COUNT];
+struct engines {
+	unsigned int num_engines;
+	unsigned int num_counters;
+	DIR *root;
+	int fd;
+	struct pmu_pair ts;
 
-static unsigned long
-gettime(void)
-{
-    struct timeval t;
-    gettimeofday(&t, NULL);
-    return (t.tv_usec + (t.tv_sec * 1000000));
-}
+	int rapl_fd;
+	double rapl_scale;
 
-static int
-top_bits_sort(const void *a, const void *b)
+	struct pmu_counter freq_req;
+	struct pmu_counter freq_act;
+	struct pmu_counter irq;
+	struct pmu_counter rc6;
+	struct pmu_counter rapl;
+
+	struct engine engine;
+};
+
+static uint64_t
+get_pmu_config(int dirfd, const char *name, const char *counter)
 {
-	struct top_bit * const *bit_a = a;
-	struct top_bit * const *bit_b = b;
-	int a_count = (*bit_a)->count;
-	int b_count = (*bit_b)->count;
+	char buf[128], *p;
+	int fd, ret;
 
-	if (a_count < b_count)
-		return 1;
-	else if (a_count == b_count)
-		return 0;
-	else
+	ret = snprintf(buf, sizeof(buf), "%s-%s", name, counter);
+	if (ret < 0 || ret == sizeof(buf))
 		return -1;
-}
 
-static void
-update_idle_bit(struct top_bit *top_bit)
-{
-	uint32_t reg_val;
+	fd = openat(dirfd, buf, O_RDONLY);
+	if (fd < 0)
+		return -1;
 
-	if (top_bit->bit->reg == INSTDONE_1)
-		reg_val = instdone1;
-	else
-		reg_val = instdone;
+	ret = read(fd, buf, sizeof(buf));
+	close(fd);
+	if (ret <= 0)
+		return -1;
+
+	p = index(buf, '0');
+	if (!p)
+		return -1;
 
-	if ((reg_val & top_bit->bit->bit) == 0)
-		top_bit->count++;
+	return strtoul(p, NULL, 0);
 }
 
-static void
-print_clock(const char *name, int clock) {
-	if (clock == -1)
-		printf("%s clock: unknown", name);
+#define engine_ptr(engines, n) \
+	((struct engine *)((unsigned char *)(&engines->engine) + (n) * sizeof(struct engine)))
+
+static int engine_cmp(const void *__a, const void *__b)
+{
+	const struct engine *a = (struct engine *)__a;
+	const struct engine *b = (struct engine *)__b;
+	int class_a = (a->busy.config & (__I915_PMU_OTHER(0) - 1)) >>
+		      I915_PMU_CLASS_SHIFT;
+	int class_b = (b->busy.config & (__I915_PMU_OTHER(0) - 1)) >>
+		      I915_PMU_CLASS_SHIFT;
+	int instance_a = (a->busy.config >> I915_PMU_SAMPLE_BITS) &
+			 ((1 << I915_PMU_SAMPLE_INSTANCE_BITS) - 1);
+	int instance_b = (b->busy.config >> I915_PMU_SAMPLE_BITS) &
+			 ((1 << I915_PMU_SAMPLE_INSTANCE_BITS) - 1);
+
+	if (class_a != class_b)
+		return class_a - class_b;
 	else
-		printf("%s clock: %d Mhz", name, clock);
+		return instance_a - instance_b;
 }
 
-static int
-print_clock_info(struct pci_device *pci_dev)
+static struct engines *discover_engines(void)
 {
-	uint32_t devid = pci_dev->device_id;
-	uint16_t gcfgc;
+	const char *sysfs_root = "/sys/devices/i915/events";
+	struct engines *engines;
+	struct dirent *dent;
+	int ret = 0;
+	DIR *d;
 
-	if (IS_GM45(devid)) {
-		int core_clock = -1;
+	engines = malloc(sizeof(struct engines));
+	if (!engines)
+		return NULL;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+	memset(engines, 0, sizeof(*engines));
 
-		switch (gcfgc & 0xf) {
-		case 8:
-			core_clock = 266;
-			break;
-		case 9:
-			core_clock = 320;
-			break;
-		case 11:
-			core_clock = 400;
-			break;
-		case 13:
-			core_clock = 533;
-			break;
-		}
-		print_clock("core", core_clock);
-	} else if (IS_965(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, sampler_clock = -1;
+	engines->num_engines = 0;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+	d = opendir(sysfs_root);
+	if (!d)
+		return NULL;
 
-		switch (gcfgc & 0xf) {
-		case 2:
-			render_clock = 250; sampler_clock = 267;
-			break;
-		case 3:
-			render_clock = 320; sampler_clock = 333;
-			break;
-		case 4:
-			render_clock = 400; sampler_clock = 444;
-			break;
-		case 5:
-			render_clock = 500; sampler_clock = 533;
-			break;
-		}
-
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("sampler", sampler_clock);
-	} else if (IS_945(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, display_clock = -1;
+	while ((dent = readdir(d)) != NULL) {
+		const char *endswith = "-busy";
+		const unsigned int endlen = strlen(endswith);
+		struct engine *engine =
+				engine_ptr(engines, engines->num_engines);
+		char buf[256];
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+		if (dent->d_type != DT_REG)
+			continue;
 
-		switch (gcfgc & 0x7) {
-		case 0:
-			render_clock = 166;
-			break;
-		case 1:
-			render_clock = 200;
-			break;
-		case 3:
-			render_clock = 250;
-			break;
-		case 5:
-			render_clock = 400;
+		if (strlen(dent->d_name) >= sizeof(buf)) {
+			ret = -1;
 			break;
 		}
 
-		switch (gcfgc & 0x70) {
-		case 0:
-			display_clock = 200;
-			break;
-		case 4:
-			display_clock = 320;
-			break;
-		}
-		if (gcfgc & (1 << 7))
-		    display_clock = 133;
+		strcpy(buf, dent->d_name);
 
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("display", display_clock);
-	} else if (IS_915(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, display_clock = -1;
+		/* xxxN-busy */
+		if (strlen(buf) < (endlen + 4))
+			continue;
+		if (strcmp(&buf[strlen(buf) - endlen], endswith))
+			continue;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+		memset(engine, 0, sizeof(*engine));
 
-		switch (gcfgc & 0x7) {
-		case 0:
-			render_clock = 160;
-			break;
-		case 1:
-			render_clock = 190;
-			break;
-		case 4:
-			render_clock = 333;
+		buf[strlen(buf) - endlen] = 0;
+		engine->name = strdup(buf);
+		if (!engine->name) {
+			ret = -1;
 			break;
 		}
-		if (gcfgc & (1 << 13))
-		    render_clock = 133;
 
-		switch (gcfgc & 0x70) {
-		case 0:
-			display_clock = 190;
+		engine->busy.config = get_pmu_config(dirfd(d), engine->name,
+						     "busy");
+		if (engine->busy.config == -1) {
+			ret = -1;
 			break;
-		case 4:
-			display_clock = 333;
+		}
+
+		engines->num_engines++;
+		engines = realloc(engines, sizeof(struct engines) +
+				  engines->num_engines * sizeof(struct engine));
+		if (!engines) {
+			ret = -ENOMEM;
 			break;
 		}
-		if (gcfgc & (1 << 7))
-		    display_clock = 133;
+	}
+
+	if (ret)
+		free(engines);
+	else {
+		qsort(engine_ptr(engines, 0), engines->num_engines,
+		      sizeof(struct engine), engine_cmp);
 
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("display", display_clock);
+		engines->root = d;
 	}
 
+	return ret == 0 ? engines : NULL;
+}
+
+static int
+filename_to_buf(const char *filename, char *buf, unsigned int bufsize)
+{
+	int fd;
+	ssize_t ret;
+
+	fd = open(filename, O_RDONLY);
+	if (fd < 0)
+		return -1;
+
+	ret = read(fd, buf, bufsize - 1);
+	close(fd);
+	if (ret < 1)
+		return -1;
+
+	buf[ret] = '\0';
 
-	printf("\n");
-	return -1;
+	return 0;
 }
 
-#define STATS_LEN (20)
-#define PERCENTAGE_BAR_END	(79 - STATS_LEN)
+static uint64_t filename_to_u64(const char *filename, int base)
+{
+	char buf[64], *b;
 
-static void
-print_percentage_bar(float percent, int cur_line_len)
+	if (filename_to_buf(filename, buf, sizeof(buf)))
+		return 0;
+
+	/*
+	 * Handle both single integer and key=value formats by skipping
+	 * leading non-digits.
+	 */
+	b = buf;
+	while (*b && !isdigit(*b))
+		b++;
+
+	return strtoull(b, NULL, base);
+}
+
+static uint64_t rapl_type_id(void)
 {
-	int bar_avail_len = (PERCENTAGE_BAR_END - cur_line_len - 1) * 8;
-	int bar_len = bar_avail_len * (percent + .5) / 100.0;
-	int i;
+	return filename_to_u64("/sys/devices/power/type", 10);
+}
 
-	for (i = bar_len; i >= 8; i -= 8) {
-		printf("%s", bars[8]);
-		cur_line_len++;
-	}
-	if (i) {
-		printf("%s", bars[i]);
-		cur_line_len++;
-	}
+static uint64_t rapl_gpu_power(void)
+{
+	return filename_to_u64("/sys/devices/power/events/energy-gpu", 0);
+}
+
+static double filename_to_double(const char *filename)
+{
+	char *oldlocale;
+	char buf[80];
+	double v;
+
+	if (filename_to_buf(filename, buf, sizeof(buf)))
+		return 0;
 
-	/* NB: We can't use a field width with utf8 so we manually
-	* guarantee a field with of 45 chars for any bar. */
-	printf("%*s", PERCENTAGE_BAR_END - cur_line_len, "");
+	oldlocale = setlocale(LC_ALL, "C");
+	v = strtod(buf, NULL);
+	setlocale(LC_ALL, oldlocale);
+
+	return v;
 }
 
-struct ring {
-	const char *name;
-	uint32_t mmio;
-	int head, tail, size;
-	uint64_t full;
-	int idle;
-};
+static double rapl_gpu_power_scale(void)
+{
+	return filename_to_double("/sys/devices/power/events/energy-gpu.scale");
+}
 
-static uint32_t ring_read(struct ring *ring, uint32_t reg)
+#define __open_pmu(engines, pmu, idx) \
+({ \
+	int fd__; \
+\
+	fd__ = perf_i915_open_group((pmu)->config, (engines)->fd); \
+	if (fd__ >= 0) { \
+		if ((engines)->fd == -1) \
+			(engines)->fd = fd__; \
+		(pmu)->idx = (idx)++; \
+		(engines)->num_counters++; \
+	} \
+\
+	fd__; \
+})
+
+static int pmu_init(struct engines *engines)
 {
-	return INREG(ring->mmio + reg);
+	unsigned int idx = 0;
+	unsigned int i;
+	int fd;
+
+	engines->fd = -1;
+	engines->num_counters = 0;
+
+	engines->freq_req.config = I915_PMU_REQUESTED_FREQUENCY;
+	fd = __open_pmu(engines, &engines->freq_req, idx);
+	if (fd < 0)
+		return -1;
+
+	engines->freq_act.config = I915_PMU_ACTUAL_FREQUENCY;
+	fd = __open_pmu(engines, &engines->freq_act, idx);
+	if (fd < 0)
+		return -1;
+
+	engines->irq.config = I915_PMU_INTERRUPTS;
+	fd = __open_pmu(engines, &engines->irq, idx);
+	if (fd < 0)
+		return -1;
+
+	engines->rc6.config = I915_PMU_RC6_RESIDENCY;
+	fd = __open_pmu(engines, &engines->rc6, idx);
+	if (fd < 0)
+		return -1;
+
+	for (i = 0; i < engines->num_engines; i++) {
+		struct engine *engine = engine_ptr(engines, i);
+		struct {
+			struct pmu_counter *pmu;
+			const char *counter;
+		} *cnt, counters[] = {
+			{ .pmu = &engine->busy, .counter = "busy" },
+			{ .pmu = &engine->wait, .counter = "wait" },
+			{ .pmu = &engine->sema, .counter = "sema" },
+			{ .pmu = NULL, .counter = NULL },
+		};
+
+		for (cnt = counters; cnt->pmu; cnt++) {
+			if (!cnt->pmu->config)
+				cnt->pmu->config =
+					get_pmu_config(dirfd(engines->root),
+						       engine->name,
+						       cnt->counter);
+			fd = __open_pmu(engines, cnt->pmu, idx);
+			if (fd < 0)
+				return -1;
+		}
+	}
+
+	engines->rapl_scale = rapl_gpu_power_scale();
+	if (engines->rapl_scale != NAN)
+		engines->rapl_scale *= 1e3; /* from nano to micro */
+	engines->rapl.config = rapl_gpu_power();
+	engines->rapl_fd = igt_perf_open(rapl_type_id(), engines->rapl.config);
+	if (engines->rapl_fd < 0)
+		return -1;
+
+	return 0;
 }
 
-static void ring_init(struct ring *ring)
+static uint64_t pmu_read_multi(int fd, unsigned int num, uint64_t *val)
 {
-	ring->size = (((ring_read(ring, RING_LEN) & RING_NR_PAGES) >> 12) + 1) * 4096;
+	uint64_t buf[2 + num];
+	unsigned int i;
+	ssize_t len;
+
+	memset(buf, 0, sizeof(buf));
+
+	len = read(fd, buf, sizeof(buf));
+	assert(len == sizeof(buf));
+
+	for (i = 0; i < num; i++)
+		val[i] = buf[2 + i];
+
+	return buf[1];
 }
 
-static void ring_reset(struct ring *ring)
+static double pmu_calc(struct pmu_pair *p, double d, double t, double s)
 {
-	ring->idle = ring->full = 0;
+	double pct;
+
+	pct = p->cur - p->prev;
+	pct /= d;
+	pct /= t;
+	pct *= s;
+
+	if (s == 100.0 && pct > 100.0)
+		pct = 100.0;
+
+	return pct;
 }
 
-static void ring_sample(struct ring *ring)
+static uint64_t __pmu_read_single(int fd, uint64_t *ts)
 {
-	int full;
+	uint64_t data[2] = { };
+	ssize_t len;
 
-	if (!ring->size)
-		return;
+	len = read(fd, data, sizeof(data));
+	assert(len == sizeof(data));
 
-	ring->head = ring_read(ring, RING_HEAD) & HEAD_ADDR;
-	ring->tail = ring_read(ring, RING_TAIL) & TAIL_ADDR;
+	if (ts)
+		*ts = data[1];
+
+	return data[0];
+}
 
-	if (ring->tail == ring->head)
-		ring->idle++;
+static uint64_t pmu_read_single(int fd)
+{
+	return __pmu_read_single(fd, NULL);
+}
 
-	full = ring->tail - ring->head;
-	if (full < 0)
-		full += ring->size;
-	ring->full += full;
+static void __update_sample(struct pmu_counter *counter, uint64_t val)
+{
+	counter->val.prev = counter->val.cur;
+	counter->val.cur = val;
 }
 
-static void ring_print_header(FILE *out, struct ring *ring)
+static void update_sample(struct pmu_counter *counter, uint64_t *val)
 {
-    fprintf(out, "%.6s%%\tops\t",
-            ring->name
-          );
+	__update_sample(counter, val[counter->idx]);
 }
 
-static void ring_print(struct ring *ring, unsigned long samples_per_sec)
+static void pmu_sample(struct engines *engines)
 {
-	int percent_busy, len;
+	const int num_val = engines->num_counters;
+	uint64_t val[num_val];
+	unsigned int i;
+
+	engines->ts.prev = engines->ts.cur;
+	engines->ts.cur = pmu_read_multi(engines->fd, num_val, val);
+
+	__update_sample(&engines->rapl, pmu_read_single(engines->rapl_fd));
 
-	if (!ring->size)
-		return;
+	update_sample(&engines->freq_req, val);
+	update_sample(&engines->freq_act, val);
+	update_sample(&engines->irq, val);
+	update_sample(&engines->rc6, val);
 
-	percent_busy = 100 - 100 * ring->idle / samples_per_sec;
+	for (i = 0; i < engines->num_engines; i++) {
+		struct engine *engine = engine_ptr(engines, i);
 
-	len = printf("%25s busy: %3d%%: ", ring->name, percent_busy);
-	print_percentage_bar (percent_busy, len);
-	printf("%24s space: %d/%d\n",
-		   ring->name,
-		   (int)(ring->full / samples_per_sec),
-		   ring->size);
+		update_sample(&engine->busy, val);
+		update_sample(&engine->sema, val);
+		update_sample(&engine->wait, val);
+	}
 }
 
-static void ring_log(struct ring *ring, unsigned long samples_per_sec,
-		FILE *output)
+static const char *bars[] = { " ", "▏", "▎", "▍", "▌", "▋", "▊", "▉", "█" };
+
+static void
+print_percentage_bar(double percent, int max_len)
 {
-	if (ring->size)
-		fprintf(output, "%3d\t%d\t",
-			(int)(100 - 100 * ring->idle / samples_per_sec),
-			(int)(ring->full / samples_per_sec));
-	else
-		fprintf(output, "-1\t-1\t");
+	int bar_len = percent * (8 * (max_len - 2)) / 100.0;
+	int i;
+
+	putchar('|');
+
+	for (i = bar_len; i >= 8; i -= 8)
+		printf("%s", bars[8]);
+	if (i)
+		printf("%s", bars[i]);
+
+	for (i = 0; i < (max_len - 2 - (bar_len + 7) / 8); i++)
+		putchar(' ');
+
+	putchar('|');
 }
 
+#define DEFAULT_PERIOD_MS (1000)
+
 static void
 usage(const char *appname)
 {
 	printf("intel_gpu_top - Display a top-like summary of Intel GPU usage\n"
-			"\n"
-			"usage: %s [parameters]\n"
-			"\n"
-			"The following parameters apply:\n"
-			"[-s <samples>]       samples per seconds (default %d)\n"
-			"[-e <command>]       command to profile\n"
-			"[-o <file>]          output statistics to file. If file is '-',"
-			"                     run in batch mode and output statistics to stdio only \n"
-			"[-h]                 show this help screen\n"
-			"\n",
-			appname,
-			SAMPLES_PER_SEC
-		  );
-	return;
+		"\n"
+		"Usage: %s [parameters]\n"
+		"\n"
+		"\tThe following parameters are optional:\n"
+		"\t[-s <samples>]       refresh period in ms (default %ums)\n"
+		"\t[-h]                 show this help text\n"
+		"\n",
+		appname, DEFAULT_PERIOD_MS);
 }
 
 int main(int argc, char **argv)
 {
-	uint32_t devid;
-	struct pci_device *pci_dev;
-	struct ring render_ring = {
-		.name = "render",
-		.mmio = 0x2030,
-	}, bsd_ring = {
-		.name = "bitstream",
-		.mmio = 0x4030,
-	}, bsd6_ring = {
-		.name = "bitstream",
-		.mmio = 0x12030,
-	}, blt_ring = {
-		.name = "blitter",
-		.mmio = 0x22030,
-	};
-	int i, ch;
-	int samples_per_sec = SAMPLES_PER_SEC;
-	FILE *output = NULL;
-	double elapsed_time=0;
-	int print_headers=1;
-	pid_t child_pid=-1;
-	int child_stat;
-	char *cmd=NULL;
-	int interactive=1;
-
-	/* Parse options? */
-	while ((ch = getopt(argc, argv, "s:o:e:h")) != -1) {
+	unsigned int period_us = DEFAULT_PERIOD_MS * 1000;
+	int con_w = -1, con_h = -1;
+	struct engines *engines;
+	unsigned int i;
+	int ret, ch;
+
+	/* Parse options */
+	while ((ch = getopt(argc, argv, "s:h")) != -1) {
 		switch (ch) {
-		case 'e': cmd = strdup(optarg);
-			break;
-		case 's': samples_per_sec = atoi(optarg);
-			if (samples_per_sec < 100) {
-				fprintf(stderr, "Error: samples per second must be >= 100\n");
-				exit(1);
-			}
-			break;
-		case 'o':
-			if (!strcmp(optarg, "-")) {
-				/* Running in non-interactive mode */
-				interactive = 0;
-				output = stdout;
-			}
-			else
-				output = fopen(optarg, "w");
-			if (!output)
-			{
-				perror("fopen");
-				exit(1);
-			}
+		case 's':
+			period_us = atoi(optarg) * 1000;
 			break;
 		case 'h':
 			usage(argv[0]);
 			exit(0);
-			break;
 		default:
-			fprintf(stderr, "Invalid flag %c!\n", (char)optopt);
+			fprintf(stderr, "Invalid option %c!\n", (char)optopt);
 			usage(argv[0]);
 			exit(1);
-			break;
 		}
 	}
 
-	pci_dev = intel_get_pci_device();
-	devid = pci_dev->device_id;
-	intel_mmio_use_pci_bar(pci_dev);
-	init_instdone_definitions(devid);
-
-	/* Do we have a command to run? */
-	if (cmd != NULL) {
-		if (output) {
-			fprintf(output, "# Profiling: %s\n", cmd);
-			fflush(output);
-		}
-		child_pid = fork();
-		if (child_pid < 0) {
-			perror("fork");
-			exit(1);
-		}
-		else if (child_pid == 0) {
-			int res;
-			res = system(cmd);
-			if (res < 0)
-				perror("running command");
-			if (output) {
-				fflush(output);
-				fprintf(output, "# %s exited with status %d\n", cmd, res);
-				fflush(output);
-			}
-			free(cmd);
-			exit(0);
-		} else {
-			free(cmd);
-		}
+	engines = discover_engines();
+	if (!engines) {
+		fprintf(stderr, "Failed to detect engines!\n");
+		return 1;
 	}
 
-	for (i = 0; i < num_instdone_bits; i++) {
-		top_bits[i].bit = &instdone_bits[i];
-		top_bits[i].count = 0;
-		top_bits_sorted[i] = &top_bits[i];
+	ret = pmu_init(engines);
+	if (ret) {
+		fprintf(stderr, "Failed to initialize PMU!\n");
+		return 1;
 	}
 
-	/* Grab access to the registers */
-	intel_register_access_init(pci_dev, 0, -1);
+	pmu_sample(engines);
 
-	ring_init(&render_ring);
-	if (IS_GEN4(devid) || IS_GEN5(devid))
-		ring_init(&bsd_ring);
-	if (IS_GEN6(devid) || IS_GEN7(devid)) {
-		ring_init(&bsd6_ring);
-		ring_init(&blt_ring);
-	}
+	for (;;) {
+		double t, freq[2], irq, rc6, power;
+		struct winsize ws;
+		int lines = 0;
 
-	/* Initialize GPU stats */
-	if (HAS_STATS_REGS(devid)) {
-		for (i = 0; i < STATS_COUNT; i++) {
-			uint32_t stats_high, stats_low, stats_high_2;
+		/* Update terminal size. */
+		if (ioctl(0, TIOCGWINSZ, &ws) != -1) {
+			con_w = ws.ws_col;
+			con_h = ws.ws_row;
+		}
 
-			do {
-				stats_high = INREG(stats_regs[i] + 4);
-				stats_low = INREG(stats_regs[i]);
-				stats_high_2 = INREG(stats_regs[i] + 4);
-			} while (stats_high != stats_high_2);
+		pmu_sample(engines);
+		t = (double)(engines->ts.cur - engines->ts.prev) / 1e9;
 
-			last_stats[i] = (uint64_t)stats_high << 32 |
-				stats_low;
-		}
-	}
+		printf("\033[H\033[J");
 
-	for (;;) {
-		int j;
-		unsigned long long t1, ti, tf, t2;
-		unsigned long long def_sleep = 1000000 / samples_per_sec;
-		unsigned long long last_samples_per_sec = samples_per_sec;
-		unsigned short int max_lines;
-		struct winsize ws;
-		char clear_screen[] = {0x1b, '[', 'H',
-				       0x1b, '[', 'J',
-				       0x0};
-		int percent;
-		int len;
-
-		t1 = gettime();
-
-		ring_reset(&render_ring);
-		ring_reset(&bsd_ring);
-		ring_reset(&bsd6_ring);
-		ring_reset(&blt_ring);
-
-		for (i = 0; i < samples_per_sec; i++) {
-			long long interval;
-			ti = gettime();
-			if (IS_965(devid)) {
-				instdone = INREG(INSTDONE_I965);
-				instdone1 = INREG(INSTDONE_1);
-			} else
-				instdone = INREG(INSTDONE);
-
-			for (j = 0; j < num_instdone_bits; j++)
-				update_idle_bit(&top_bits[j]);
-
-			ring_sample(&render_ring);
-			ring_sample(&bsd_ring);
-			ring_sample(&bsd6_ring);
-			ring_sample(&blt_ring);
-
-			tf = gettime();
-			if (tf - t1 >= 1000000) {
-				/* We are out of sync, bail out */
-				last_samples_per_sec = i+1;
-				break;
-			}
-			interval = def_sleep - (tf - ti);
-			if (interval > 0)
-				usleep(interval);
-		}
+		freq[0] = pmu_calc(&engines->freq_req.val, 1.0, t, 1);
+		freq[1] = pmu_calc(&engines->freq_act.val, 1.0, t, 1);
+		irq = pmu_calc(&engines->irq.val, 1.0, t, 1);
+		rc6 = pmu_calc(&engines->rc6.val, 1e9, t, 100);
+		power = pmu_calc(&engines->rapl.val, 1.0, t,
+				 engines->rapl_scale);
 
-		if (HAS_STATS_REGS(devid)) {
-			for (i = 0; i < STATS_COUNT; i++) {
-				uint32_t stats_high, stats_low, stats_high_2;
+		printf("intel-gpu-top - %4.0f/%4.0f MHz;  %3.0f%% RC6; %6.0fmW; %8.0f irqs/s\n",
+		       freq[0], freq[1], rc6, power, irq);
+		lines++;
 
-				do {
-					stats_high = INREG(stats_regs[i] + 4);
-					stats_low = INREG(stats_regs[i]);
-					stats_high_2 = INREG(stats_regs[i] + 4);
-				} while (stats_high != stats_high_2);
+		printf("\n");
+		lines++;
 
-				stats[i] = (uint64_t)stats_high << 32 |
-					stats_low;
-			}
-		}
+		for (i = 0; i < engines->num_engines && lines < con_h; i++) {
+			struct engine *engine = engine_ptr(engines, i);
+			unsigned int max_w = con_w - 1;
+			unsigned int len;
+			double val[2];
+			char buf[128];
 
-		qsort(top_bits_sorted, num_instdone_bits,
-		      sizeof(struct top_bit *), top_bits_sort);
-
-		/* Limit the number of lines printed to the terminal height so the
-		 * most important info (at the top) will stay on screen. */
-		max_lines = -1;
-		if (ioctl(0, TIOCGWINSZ, &ws) != -1)
-			max_lines = ws.ws_row - 6; /* exclude header lines */
-		if (max_lines >= num_instdone_bits)
-			max_lines = num_instdone_bits;
-
-		t2 = gettime();
-		elapsed_time += (t2 - t1) / 1000000.0;
-
-		if (interactive) {
-			printf("%s", clear_screen);
-			print_clock_info(pci_dev);
-
-			ring_print(&render_ring, last_samples_per_sec);
-			ring_print(&bsd_ring, last_samples_per_sec);
-			ring_print(&bsd6_ring, last_samples_per_sec);
-			ring_print(&blt_ring, last_samples_per_sec);
-
-			printf("\n%30s  %s\n", "task", "percent busy");
-			for (i = 0; i < max_lines; i++) {
-				if (top_bits_sorted[i]->count > 0) {
-					percent = (top_bits_sorted[i]->count * 100) /
-						last_samples_per_sec;
-					len = printf("%30s: %3d%%: ",
-							 top_bits_sorted[i]->bit->name,
-							 percent);
-					print_percentage_bar (percent, len);
-				} else {
-					printf("%*s", PERCENTAGE_BAR_END, "");
-				}
-
-				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-					printf("%13s: %llu (%lld/sec)",
-						   stats_reg_names[i],
-						   (long long)stats[i],
-						   (long long)(stats[i] - last_stats[i]));
-					last_stats[i] = stats[i];
-				} else {
-					if (!top_bits_sorted[i]->count)
-						break;
-				}
-				printf("\n");
-			}
-		}
-		if (output) {
-			/* Print headers for columns at first run */
-			if (print_headers) {
-				fprintf(output, "# time\t");
-				ring_print_header(output, &render_ring);
-				ring_print_header(output, &bsd_ring);
-				ring_print_header(output, &bsd6_ring);
-				ring_print_header(output, &blt_ring);
-				for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
-					if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-						fprintf(output, "%.6s\t",
-							   stats_reg_names[i]
-							   );
-					}
-					if (!top_bits[i].count)
-						continue;
-				}
-				fprintf(output, "\n");
-				print_headers = 0;
-			}
-
-			/* Print statistics */
-			fprintf(output, "%.2f\t", elapsed_time);
-			ring_log(&render_ring, last_samples_per_sec, output);
-			ring_log(&bsd_ring, last_samples_per_sec, output);
-			ring_log(&bsd6_ring, last_samples_per_sec, output);
-			ring_log(&blt_ring, last_samples_per_sec, output);
-
-			for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
-				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-					fprintf(output, "%"PRIu64"\t",
-						   stats[i] - last_stats[i]);
-					last_stats[i] = stats[i];
-				}
-					if (!top_bits[i].count)
-						continue;
-			}
-			fprintf(output, "\n");
-			fflush(output);
-		}
+			val[0] = pmu_calc(&engine->wait.val, 1e9, t, 100);
+			val[1] = pmu_calc(&engine->sema.val, 1e9, t, 100);
+			len = snprintf(buf, sizeof(buf),
+				       "%6.2f%% wait, %6.2f%% sema",
+				       val[0], val[1]);
 
-		for (i = 0; i < num_instdone_bits; i++) {
-			top_bits_sorted[i]->count = 0;
+			val[0] = pmu_calc(&engine->busy.val, 1e9, t, 100);
+			len += printf("%8s %6.2f%% ",
+				      engine->name, val[0]);
+			print_percentage_bar(val[0], max_w - len);
 
-			if (i < STATS_COUNT)
-				last_stats[i] = stats[i];
-		}
+			printf("%s\n", buf);
 
-		/* Check if child has gone */
-		if (child_pid > 0) {
-			int res;
-			if ((res = waitpid(child_pid, &child_stat, WNOHANG)) == -1) {
-				perror("waitpid");
-				exit(1);
-			}
-			if (res == 0)
-				continue;
-			if (WIFEXITED(child_stat))
-				break;
+			lines++;
 		}
-	}
 
-	fclose(output);
+		printf("\n");
+
+		usleep(period_us);
+	}
 
-	intel_register_access_fini();
 	return 0;
 }
diff --git a/tools/meson.build b/tools/meson.build
index bd2d313d5156..a918eeb0bef1 100644
--- a/tools/meson.build
+++ b/tools/meson.build
@@ -23,7 +23,6 @@ tools_progs = [
 	'intel_gpu_frequency',
 	'intel_firmware_decode',
 	'intel_gpu_time',
-	'intel_gpu_top',
 	'intel_gtt',
 	'intel_guc_logger',
 	'intel_infoframes',
@@ -117,6 +116,11 @@ shared_library('intel_aubdump', 'aubdump.c',
 	       name_prefix : '',
 	       install : true)
 
+executable('intel_gpu_top', 'intel_gpu_top.c',
+	   install : true,
+	   install_rpath : rpathdir,
+	   dependencies : tool_deps + [ lib_igt_perf ])
+
 conf_data = configuration_data()
 conf_data.set('prefix', prefix)
 conf_data.set('exec_prefix', '${prefix}')
-- 
2.14.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [igt-dev] [PATCH i-g-t v2] intel-gpu-top: Rewrite the tool to be safe to use
@ 2018-03-29 10:33   ` Tvrtko Ursulin
  0 siblings, 0 replies; 57+ messages in thread
From: Tvrtko Ursulin @ 2018-03-29 10:33 UTC (permalink / raw)
  To: igt-dev; +Cc: Tvrtko Ursulin, Eero Tamminen, Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

intel-gpu-top is a dangerous tool which can hang machines due to unsafe mmio
register access. This patch rewrites it to use only PMU.

Only overall command streamer busyness and GPU global data such as power
and frequencies are included in this new version.

For access to more GPU functional unit level data, an OA metric based tool
like gpu-top should be used instead.

v2:
 * Sort engines by class and instance.
 * Do not wait for one sampling period to display something on screen.
 * Move code out of the asserts. (Rinat Ibragimov)
 * Continuously adapt to terminal size. (Rinat Ibragimov)

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: Petri Latvala <petri.latvala@intel.com>
Cc: Eero Tamminen <eero.t.tamminen@intel.com>
Cc: Rinat Ibragimov <ibragimovrinat@mail.ru>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> # v1
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> # v0.5
---
 tools/Makefile.am     |    2 +
 tools/intel_gpu_top.c | 1009 +++++++++++++++++++++----------------------------
 tools/meson.build     |    6 +-
 3 files changed, 441 insertions(+), 576 deletions(-)

diff --git a/tools/Makefile.am b/tools/Makefile.am
index 09b6dbcc3ece..a0b016ddd7ff 100644
--- a/tools/Makefile.am
+++ b/tools/Makefile.am
@@ -28,6 +28,8 @@ intel_aubdump_la_LDFLAGS = -module -avoid-version -no-undefined
 intel_aubdump_la_SOURCES = aubdump.c
 intel_aubdump_la_LIBADD = $(top_builddir)/lib/libintel_tools.la -ldl
 
+intel_gpu_top_LDADD = $(top_builddir)/lib/libigt_perf.la
+
 bin_SCRIPTS = intel_aubdump
 CLEANFILES = $(bin_SCRIPTS)
 
diff --git a/tools/intel_gpu_top.c b/tools/intel_gpu_top.c
index 098e6ce3ff86..94091d97c4a3 100644
--- a/tools/intel_gpu_top.c
+++ b/tools/intel_gpu_top.c
@@ -1,6 +1,5 @@
 /*
- * Copyright © 2007 Intel Corporation
- * Copyright © 2011 Intel Corporation
+ * Copyright © 2018 Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -18,701 +17,561 @@
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *    Eric Anholt <eric@anholt.net>
- *    Eugeni Dodonov <eugeni.dodonov@intel.com>
- *
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
  */
 
-#include "config.h"
-
-#include <inttypes.h>
-#include <unistd.h>
-#include <stdlib.h>
 #include <stdio.h>
-#include <err.h>
-#include <sys/ioctl.h>
-#include <sys/time.h>
-#include <sys/wait.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <stdint.h>
+#include <assert.h>
 #include <string.h>
-#ifdef HAVE_TERMIOS_H
-#include <termios.h>
-#endif
-#include "intel_io.h"
-#include "instdone.h"
-#include "intel_reg.h"
-#include "intel_chipset.h"
-#include "drmtest.h"
-
-#define  FORCEWAKE	    0xA18C
-#define  FORCEWAKE_ACK	    0x130090
-
-#define SAMPLES_PER_SEC             10000
-#define SAMPLES_TO_PERCENT_RATIO    (SAMPLES_PER_SEC / 100)
-
-#define MAX_NUM_TOP_BITS            100
-
-#define HAS_STATS_REGS(devid)		IS_965(devid)
-
-struct top_bit {
-	struct instdone_bit *bit;
-	int count;
-} top_bits[MAX_NUM_TOP_BITS];
-struct top_bit *top_bits_sorted[MAX_NUM_TOP_BITS];
-
-static uint32_t instdone, instdone1;
-
-static const char *bars[] = {
-	" ",
-	"▏",
-	"▎",
-	"▍",
-	"▌",
-	"▋",
-	"▊",
-	"▉",
-	"█"
-};
+#include <ctype.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <sys/ioctl.h>
+#include <errno.h>
+#include <math.h>
+#include <locale.h>
+
+#include "igt_perf.h"
 
-enum stats_counts {
-	IA_VERTICES,
-	IA_PRIMITIVES,
-	VS_INVOCATION,
-	GS_INVOCATION,
-	GS_PRIMITIVES,
-	CL_INVOCATION,
-	CL_PRIMITIVES,
-	PS_INVOCATION,
-	PS_DEPTH,
-	STATS_COUNT
+struct pmu_pair {
+	uint64_t cur;
+	uint64_t prev;
 };
 
-const uint32_t stats_regs[STATS_COUNT] = {
-	IA_VERTICES_COUNT_QW,
-	IA_PRIMITIVES_COUNT_QW,
-	VS_INVOCATION_COUNT_QW,
-	GS_INVOCATION_COUNT_QW,
-	GS_PRIMITIVES_COUNT_QW,
-	CL_INVOCATION_COUNT_QW,
-	CL_PRIMITIVES_COUNT_QW,
-	PS_INVOCATION_COUNT_QW,
-	PS_DEPTH_COUNT_QW,
+struct pmu_counter {
+	uint64_t config;
+	unsigned int idx;
+	struct pmu_pair val;
 };
 
-const char *stats_reg_names[STATS_COUNT] = {
-	"vert fetch",
-	"prim fetch",
-	"VS invocations",
-	"GS invocations",
-	"GS prims",
-	"CL invocations",
-	"CL prims",
-	"PS invocations",
-	"PS depth pass",
+struct engine {
+	const char *name;
+	struct pmu_counter busy;
+	struct pmu_counter wait;
+	struct pmu_counter sema;
 };
 
-uint64_t stats[STATS_COUNT];
-uint64_t last_stats[STATS_COUNT];
+struct engines {
+	unsigned int num_engines;
+	unsigned int num_counters;
+	DIR *root;
+	int fd;
+	struct pmu_pair ts;
 
-static unsigned long
-gettime(void)
-{
-    struct timeval t;
-    gettimeofday(&t, NULL);
-    return (t.tv_usec + (t.tv_sec * 1000000));
-}
+	int rapl_fd;
+	double rapl_scale;
 
-static int
-top_bits_sort(const void *a, const void *b)
+	struct pmu_counter freq_req;
+	struct pmu_counter freq_act;
+	struct pmu_counter irq;
+	struct pmu_counter rc6;
+	struct pmu_counter rapl;
+
+	struct engine engine;
+};
+
+static uint64_t
+get_pmu_config(int dirfd, const char *name, const char *counter)
 {
-	struct top_bit * const *bit_a = a;
-	struct top_bit * const *bit_b = b;
-	int a_count = (*bit_a)->count;
-	int b_count = (*bit_b)->count;
+	char buf[128], *p;
+	int fd, ret;
 
-	if (a_count < b_count)
-		return 1;
-	else if (a_count == b_count)
-		return 0;
-	else
+	ret = snprintf(buf, sizeof(buf), "%s-%s", name, counter);
+	if (ret < 0 || ret == sizeof(buf))
 		return -1;
-}
 
-static void
-update_idle_bit(struct top_bit *top_bit)
-{
-	uint32_t reg_val;
+	fd = openat(dirfd, buf, O_RDONLY);
+	if (fd < 0)
+		return -1;
 
-	if (top_bit->bit->reg == INSTDONE_1)
-		reg_val = instdone1;
-	else
-		reg_val = instdone;
+	ret = read(fd, buf, sizeof(buf));
+	close(fd);
+	if (ret <= 0)
+		return -1;
+
+	p = index(buf, '0');
+	if (!p)
+		return -1;
 
-	if ((reg_val & top_bit->bit->bit) == 0)
-		top_bit->count++;
+	return strtoul(p, NULL, 0);
 }
 
-static void
-print_clock(const char *name, int clock) {
-	if (clock == -1)
-		printf("%s clock: unknown", name);
+#define engine_ptr(engines, n) \
+	((struct engine *)((unsigned char *)(&engines->engine) + (n) * sizeof(struct engine)))
+
+static int engine_cmp(const void *__a, const void *__b)
+{
+	const struct engine *a = (struct engine *)__a;
+	const struct engine *b = (struct engine *)__b;
+	int class_a = (a->busy.config & (__I915_PMU_OTHER(0) - 1)) >>
+		      I915_PMU_CLASS_SHIFT;
+	int class_b = (b->busy.config & (__I915_PMU_OTHER(0) - 1)) >>
+		      I915_PMU_CLASS_SHIFT;
+	int instance_a = (a->busy.config >> I915_PMU_SAMPLE_BITS) &
+			 ((1 << I915_PMU_SAMPLE_INSTANCE_BITS) - 1);
+	int instance_b = (b->busy.config >> I915_PMU_SAMPLE_BITS) &
+			 ((1 << I915_PMU_SAMPLE_INSTANCE_BITS) - 1);
+
+	if (class_a != class_b)
+		return class_a - class_b;
 	else
-		printf("%s clock: %d Mhz", name, clock);
+		return instance_a - instance_b;
 }
 
-static int
-print_clock_info(struct pci_device *pci_dev)
+static struct engines *discover_engines(void)
 {
-	uint32_t devid = pci_dev->device_id;
-	uint16_t gcfgc;
+	const char *sysfs_root = "/sys/devices/i915/events";
+	struct engines *engines;
+	struct dirent *dent;
+	int ret = 0;
+	DIR *d;
 
-	if (IS_GM45(devid)) {
-		int core_clock = -1;
+	engines = malloc(sizeof(struct engines));
+	if (!engines)
+		return NULL;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+	memset(engines, 0, sizeof(*engines));
 
-		switch (gcfgc & 0xf) {
-		case 8:
-			core_clock = 266;
-			break;
-		case 9:
-			core_clock = 320;
-			break;
-		case 11:
-			core_clock = 400;
-			break;
-		case 13:
-			core_clock = 533;
-			break;
-		}
-		print_clock("core", core_clock);
-	} else if (IS_965(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, sampler_clock = -1;
+	engines->num_engines = 0;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+	d = opendir(sysfs_root);
+	if (!d)
+		return NULL;
 
-		switch (gcfgc & 0xf) {
-		case 2:
-			render_clock = 250; sampler_clock = 267;
-			break;
-		case 3:
-			render_clock = 320; sampler_clock = 333;
-			break;
-		case 4:
-			render_clock = 400; sampler_clock = 444;
-			break;
-		case 5:
-			render_clock = 500; sampler_clock = 533;
-			break;
-		}
-
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("sampler", sampler_clock);
-	} else if (IS_945(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, display_clock = -1;
+	while ((dent = readdir(d)) != NULL) {
+		const char *endswith = "-busy";
+		const unsigned int endlen = strlen(endswith);
+		struct engine *engine =
+				engine_ptr(engines, engines->num_engines);
+		char buf[256];
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+		if (dent->d_type != DT_REG)
+			continue;
 
-		switch (gcfgc & 0x7) {
-		case 0:
-			render_clock = 166;
-			break;
-		case 1:
-			render_clock = 200;
-			break;
-		case 3:
-			render_clock = 250;
-			break;
-		case 5:
-			render_clock = 400;
+		if (strlen(dent->d_name) >= sizeof(buf)) {
+			ret = -1;
 			break;
 		}
 
-		switch (gcfgc & 0x70) {
-		case 0:
-			display_clock = 200;
-			break;
-		case 4:
-			display_clock = 320;
-			break;
-		}
-		if (gcfgc & (1 << 7))
-		    display_clock = 133;
+		strcpy(buf, dent->d_name);
 
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("display", display_clock);
-	} else if (IS_915(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, display_clock = -1;
+		/* xxxN-busy */
+		if (strlen(buf) < (endlen + 4))
+			continue;
+		if (strcmp(&buf[strlen(buf) - endlen], endswith))
+			continue;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+		memset(engine, 0, sizeof(*engine));
 
-		switch (gcfgc & 0x7) {
-		case 0:
-			render_clock = 160;
-			break;
-		case 1:
-			render_clock = 190;
-			break;
-		case 4:
-			render_clock = 333;
+		buf[strlen(buf) - endlen] = 0;
+		engine->name = strdup(buf);
+		if (!engine->name) {
+			ret = -1;
 			break;
 		}
-		if (gcfgc & (1 << 13))
-		    render_clock = 133;
 
-		switch (gcfgc & 0x70) {
-		case 0:
-			display_clock = 190;
+		engine->busy.config = get_pmu_config(dirfd(d), engine->name,
+						     "busy");
+		if (engine->busy.config == -1) {
+			ret = -1;
 			break;
-		case 4:
-			display_clock = 333;
+		}
+
+		engines->num_engines++;
+		engines = realloc(engines, sizeof(struct engines) +
+				  engines->num_engines * sizeof(struct engine));
+		if (!engines) {
+			ret = -ENOMEM;
 			break;
 		}
-		if (gcfgc & (1 << 7))
-		    display_clock = 133;
+	}
+
+	if (ret)
+		free(engines);
+	else {
+		qsort(engine_ptr(engines, 0), engines->num_engines,
+		      sizeof(struct engine), engine_cmp);
 
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("display", display_clock);
+		engines->root = d;
 	}
 
+	return ret == 0 ? engines : NULL;
+}
+
+static int
+filename_to_buf(const char *filename, char *buf, unsigned int bufsize)
+{
+	int fd;
+	ssize_t ret;
+
+	fd = open(filename, O_RDONLY);
+	if (fd < 0)
+		return -1;
+
+	ret = read(fd, buf, bufsize - 1);
+	close(fd);
+	if (ret < 1)
+		return -1;
+
+	buf[ret] = '\0';
 
-	printf("\n");
-	return -1;
+	return 0;
 }
 
-#define STATS_LEN (20)
-#define PERCENTAGE_BAR_END	(79 - STATS_LEN)
+static uint64_t filename_to_u64(const char *filename, int base)
+{
+	char buf[64], *b;
 
-static void
-print_percentage_bar(float percent, int cur_line_len)
+	if (filename_to_buf(filename, buf, sizeof(buf)))
+		return 0;
+
+	/*
+	 * Handle both single integer and key=value formats by skipping
+	 * leading non-digits.
+	 */
+	b = buf;
+	while (*b && !isdigit(*b))
+		b++;
+
+	return strtoull(b, NULL, base);
+}
+
+static uint64_t rapl_type_id(void)
 {
-	int bar_avail_len = (PERCENTAGE_BAR_END - cur_line_len - 1) * 8;
-	int bar_len = bar_avail_len * (percent + .5) / 100.0;
-	int i;
+	return filename_to_u64("/sys/devices/power/type", 10);
+}
 
-	for (i = bar_len; i >= 8; i -= 8) {
-		printf("%s", bars[8]);
-		cur_line_len++;
-	}
-	if (i) {
-		printf("%s", bars[i]);
-		cur_line_len++;
-	}
+static uint64_t rapl_gpu_power(void)
+{
+	return filename_to_u64("/sys/devices/power/events/energy-gpu", 0);
+}
+
+static double filename_to_double(const char *filename)
+{
+	char *oldlocale;
+	char buf[80];
+	double v;
+
+	if (filename_to_buf(filename, buf, sizeof(buf)))
+		return 0;
 
-	/* NB: We can't use a field width with utf8 so we manually
-	* guarantee a field with of 45 chars for any bar. */
-	printf("%*s", PERCENTAGE_BAR_END - cur_line_len, "");
+	oldlocale = setlocale(LC_ALL, "C");
+	v = strtod(buf, NULL);
+	setlocale(LC_ALL, oldlocale);
+
+	return v;
 }
 
-struct ring {
-	const char *name;
-	uint32_t mmio;
-	int head, tail, size;
-	uint64_t full;
-	int idle;
-};
+static double rapl_gpu_power_scale(void)
+{
+	return filename_to_double("/sys/devices/power/events/energy-gpu.scale");
+}
 
-static uint32_t ring_read(struct ring *ring, uint32_t reg)
+#define __open_pmu(engines, pmu, idx) \
+({ \
+	int fd__; \
+\
+	fd__ = perf_i915_open_group((pmu)->config, (engines)->fd); \
+	if (fd__ >= 0) { \
+		if ((engines)->fd == -1) \
+			(engines)->fd = fd__; \
+		(pmu)->idx = (idx)++; \
+		(engines)->num_counters++; \
+	} \
+\
+	fd__; \
+})
+
+static int pmu_init(struct engines *engines)
 {
-	return INREG(ring->mmio + reg);
+	unsigned int idx = 0;
+	unsigned int i;
+	int fd;
+
+	engines->fd = -1;
+	engines->num_counters = 0;
+
+	engines->freq_req.config = I915_PMU_REQUESTED_FREQUENCY;
+	fd = __open_pmu(engines, &engines->freq_req, idx);
+	if (fd < 0)
+		return -1;
+
+	engines->freq_act.config = I915_PMU_ACTUAL_FREQUENCY;
+	fd = __open_pmu(engines, &engines->freq_act, idx);
+	if (fd < 0)
+		return -1;
+
+	engines->irq.config = I915_PMU_INTERRUPTS;
+	fd = __open_pmu(engines, &engines->irq, idx);
+	if (fd < 0)
+		return -1;
+
+	engines->rc6.config = I915_PMU_RC6_RESIDENCY;
+	fd = __open_pmu(engines, &engines->rc6, idx);
+	if (fd < 0)
+		return -1;
+
+	for (i = 0; i < engines->num_engines; i++) {
+		struct engine *engine = engine_ptr(engines, i);
+		struct {
+			struct pmu_counter *pmu;
+			const char *counter;
+		} *cnt, counters[] = {
+			{ .pmu = &engine->busy, .counter = "busy" },
+			{ .pmu = &engine->wait, .counter = "wait" },
+			{ .pmu = &engine->sema, .counter = "sema" },
+			{ .pmu = NULL, .counter = NULL },
+		};
+
+		for (cnt = counters; cnt->pmu; cnt++) {
+			if (!cnt->pmu->config)
+				cnt->pmu->config =
+					get_pmu_config(dirfd(engines->root),
+						       engine->name,
+						       cnt->counter);
+			fd = __open_pmu(engines, cnt->pmu, idx);
+			if (fd < 0)
+				return -1;
+		}
+	}
+
+	engines->rapl_scale = rapl_gpu_power_scale();
+	if (engines->rapl_scale != NAN)
+		engines->rapl_scale *= 1e3; /* from nano to micro */
+	engines->rapl.config = rapl_gpu_power();
+	engines->rapl_fd = igt_perf_open(rapl_type_id(), engines->rapl.config);
+	if (engines->rapl_fd < 0)
+		return -1;
+
+	return 0;
 }
 
-static void ring_init(struct ring *ring)
+static uint64_t pmu_read_multi(int fd, unsigned int num, uint64_t *val)
 {
-	ring->size = (((ring_read(ring, RING_LEN) & RING_NR_PAGES) >> 12) + 1) * 4096;
+	uint64_t buf[2 + num];
+	unsigned int i;
+	ssize_t len;
+
+	memset(buf, 0, sizeof(buf));
+
+	len = read(fd, buf, sizeof(buf));
+	assert(len == sizeof(buf));
+
+	for (i = 0; i < num; i++)
+		val[i] = buf[2 + i];
+
+	return buf[1];
 }
 
-static void ring_reset(struct ring *ring)
+static double pmu_calc(struct pmu_pair *p, double d, double t, double s)
 {
-	ring->idle = ring->full = 0;
+	double pct;
+
+	pct = p->cur - p->prev;
+	pct /= d;
+	pct /= t;
+	pct *= s;
+
+	if (s == 100.0 && pct > 100.0)
+		pct = 100.0;
+
+	return pct;
 }
 
-static void ring_sample(struct ring *ring)
+static uint64_t __pmu_read_single(int fd, uint64_t *ts)
 {
-	int full;
+	uint64_t data[2] = { };
+	ssize_t len;
 
-	if (!ring->size)
-		return;
+	len = read(fd, data, sizeof(data));
+	assert(len == sizeof(data));
 
-	ring->head = ring_read(ring, RING_HEAD) & HEAD_ADDR;
-	ring->tail = ring_read(ring, RING_TAIL) & TAIL_ADDR;
+	if (ts)
+		*ts = data[1];
+
+	return data[0];
+}
 
-	if (ring->tail == ring->head)
-		ring->idle++;
+static uint64_t pmu_read_single(int fd)
+{
+	return __pmu_read_single(fd, NULL);
+}
 
-	full = ring->tail - ring->head;
-	if (full < 0)
-		full += ring->size;
-	ring->full += full;
+static void __update_sample(struct pmu_counter *counter, uint64_t val)
+{
+	counter->val.prev = counter->val.cur;
+	counter->val.cur = val;
 }
 
-static void ring_print_header(FILE *out, struct ring *ring)
+static void update_sample(struct pmu_counter *counter, uint64_t *val)
 {
-    fprintf(out, "%.6s%%\tops\t",
-            ring->name
-          );
+	__update_sample(counter, val[counter->idx]);
 }
 
-static void ring_print(struct ring *ring, unsigned long samples_per_sec)
+static void pmu_sample(struct engines *engines)
 {
-	int percent_busy, len;
+	const int num_val = engines->num_counters;
+	uint64_t val[num_val];
+	unsigned int i;
+
+	engines->ts.prev = engines->ts.cur;
+	engines->ts.cur = pmu_read_multi(engines->fd, num_val, val);
+
+	__update_sample(&engines->rapl, pmu_read_single(engines->rapl_fd));
 
-	if (!ring->size)
-		return;
+	update_sample(&engines->freq_req, val);
+	update_sample(&engines->freq_act, val);
+	update_sample(&engines->irq, val);
+	update_sample(&engines->rc6, val);
 
-	percent_busy = 100 - 100 * ring->idle / samples_per_sec;
+	for (i = 0; i < engines->num_engines; i++) {
+		struct engine *engine = engine_ptr(engines, i);
 
-	len = printf("%25s busy: %3d%%: ", ring->name, percent_busy);
-	print_percentage_bar (percent_busy, len);
-	printf("%24s space: %d/%d\n",
-		   ring->name,
-		   (int)(ring->full / samples_per_sec),
-		   ring->size);
+		update_sample(&engine->busy, val);
+		update_sample(&engine->sema, val);
+		update_sample(&engine->wait, val);
+	}
 }
 
-static void ring_log(struct ring *ring, unsigned long samples_per_sec,
-		FILE *output)
+static const char *bars[] = { " ", "▏", "▎", "▍", "▌", "▋", "▊", "▉", "█" };
+
+static void
+print_percentage_bar(double percent, int max_len)
 {
-	if (ring->size)
-		fprintf(output, "%3d\t%d\t",
-			(int)(100 - 100 * ring->idle / samples_per_sec),
-			(int)(ring->full / samples_per_sec));
-	else
-		fprintf(output, "-1\t-1\t");
+	int bar_len = percent * (8 * (max_len - 2)) / 100.0;
+	int i;
+
+	putchar('|');
+
+	for (i = bar_len; i >= 8; i -= 8)
+		printf("%s", bars[8]);
+	if (i)
+		printf("%s", bars[i]);
+
+	for (i = 0; i < (max_len - 2 - (bar_len + 7) / 8); i++)
+		putchar(' ');
+
+	putchar('|');
 }
 
+#define DEFAULT_PERIOD_MS (1000)
+
 static void
 usage(const char *appname)
 {
 	printf("intel_gpu_top - Display a top-like summary of Intel GPU usage\n"
-			"\n"
-			"usage: %s [parameters]\n"
-			"\n"
-			"The following parameters apply:\n"
-			"[-s <samples>]       samples per seconds (default %d)\n"
-			"[-e <command>]       command to profile\n"
-			"[-o <file>]          output statistics to file. If file is '-',"
-			"                     run in batch mode and output statistics to stdio only \n"
-			"[-h]                 show this help screen\n"
-			"\n",
-			appname,
-			SAMPLES_PER_SEC
-		  );
-	return;
+		"\n"
+		"Usage: %s [parameters]\n"
+		"\n"
+		"\tThe following parameters are optional:\n"
+		"\t[-s <samples>]       refresh period in ms (default %ums)\n"
+		"\t[-h]                 show this help text\n"
+		"\n",
+		appname, DEFAULT_PERIOD_MS);
 }
 
 int main(int argc, char **argv)
 {
-	uint32_t devid;
-	struct pci_device *pci_dev;
-	struct ring render_ring = {
-		.name = "render",
-		.mmio = 0x2030,
-	}, bsd_ring = {
-		.name = "bitstream",
-		.mmio = 0x4030,
-	}, bsd6_ring = {
-		.name = "bitstream",
-		.mmio = 0x12030,
-	}, blt_ring = {
-		.name = "blitter",
-		.mmio = 0x22030,
-	};
-	int i, ch;
-	int samples_per_sec = SAMPLES_PER_SEC;
-	FILE *output = NULL;
-	double elapsed_time=0;
-	int print_headers=1;
-	pid_t child_pid=-1;
-	int child_stat;
-	char *cmd=NULL;
-	int interactive=1;
-
-	/* Parse options? */
-	while ((ch = getopt(argc, argv, "s:o:e:h")) != -1) {
+	unsigned int period_us = DEFAULT_PERIOD_MS * 1000;
+	int con_w = -1, con_h = -1;
+	struct engines *engines;
+	unsigned int i;
+	int ret, ch;
+
+	/* Parse options */
+	while ((ch = getopt(argc, argv, "s:h")) != -1) {
 		switch (ch) {
-		case 'e': cmd = strdup(optarg);
-			break;
-		case 's': samples_per_sec = atoi(optarg);
-			if (samples_per_sec < 100) {
-				fprintf(stderr, "Error: samples per second must be >= 100\n");
-				exit(1);
-			}
-			break;
-		case 'o':
-			if (!strcmp(optarg, "-")) {
-				/* Running in non-interactive mode */
-				interactive = 0;
-				output = stdout;
-			}
-			else
-				output = fopen(optarg, "w");
-			if (!output)
-			{
-				perror("fopen");
-				exit(1);
-			}
+		case 's':
+			period_us = atoi(optarg) * 1000;
 			break;
 		case 'h':
 			usage(argv[0]);
 			exit(0);
-			break;
 		default:
-			fprintf(stderr, "Invalid flag %c!\n", (char)optopt);
+			fprintf(stderr, "Invalid option %c!\n", (char)optopt);
 			usage(argv[0]);
 			exit(1);
-			break;
 		}
 	}
 
-	pci_dev = intel_get_pci_device();
-	devid = pci_dev->device_id;
-	intel_mmio_use_pci_bar(pci_dev);
-	init_instdone_definitions(devid);
-
-	/* Do we have a command to run? */
-	if (cmd != NULL) {
-		if (output) {
-			fprintf(output, "# Profiling: %s\n", cmd);
-			fflush(output);
-		}
-		child_pid = fork();
-		if (child_pid < 0) {
-			perror("fork");
-			exit(1);
-		}
-		else if (child_pid == 0) {
-			int res;
-			res = system(cmd);
-			if (res < 0)
-				perror("running command");
-			if (output) {
-				fflush(output);
-				fprintf(output, "# %s exited with status %d\n", cmd, res);
-				fflush(output);
-			}
-			free(cmd);
-			exit(0);
-		} else {
-			free(cmd);
-		}
+	engines = discover_engines();
+	if (!engines) {
+		fprintf(stderr, "Failed to detect engines!\n");
+		return 1;
 	}
 
-	for (i = 0; i < num_instdone_bits; i++) {
-		top_bits[i].bit = &instdone_bits[i];
-		top_bits[i].count = 0;
-		top_bits_sorted[i] = &top_bits[i];
+	ret = pmu_init(engines);
+	if (ret) {
+		fprintf(stderr, "Failed to initialize PMU!\n");
+		return 1;
 	}
 
-	/* Grab access to the registers */
-	intel_register_access_init(pci_dev, 0, -1);
+	pmu_sample(engines);
 
-	ring_init(&render_ring);
-	if (IS_GEN4(devid) || IS_GEN5(devid))
-		ring_init(&bsd_ring);
-	if (IS_GEN6(devid) || IS_GEN7(devid)) {
-		ring_init(&bsd6_ring);
-		ring_init(&blt_ring);
-	}
+	for (;;) {
+		double t, freq[2], irq, rc6, power;
+		struct winsize ws;
+		int lines = 0;
 
-	/* Initialize GPU stats */
-	if (HAS_STATS_REGS(devid)) {
-		for (i = 0; i < STATS_COUNT; i++) {
-			uint32_t stats_high, stats_low, stats_high_2;
+		/* Update terminal size. */
+		if (ioctl(0, TIOCGWINSZ, &ws) != -1) {
+			con_w = ws.ws_col;
+			con_h = ws.ws_row;
+		}
 
-			do {
-				stats_high = INREG(stats_regs[i] + 4);
-				stats_low = INREG(stats_regs[i]);
-				stats_high_2 = INREG(stats_regs[i] + 4);
-			} while (stats_high != stats_high_2);
+		pmu_sample(engines);
+		t = (double)(engines->ts.cur - engines->ts.prev) / 1e9;
 
-			last_stats[i] = (uint64_t)stats_high << 32 |
-				stats_low;
-		}
-	}
+		printf("\033[H\033[J");
 
-	for (;;) {
-		int j;
-		unsigned long long t1, ti, tf, t2;
-		unsigned long long def_sleep = 1000000 / samples_per_sec;
-		unsigned long long last_samples_per_sec = samples_per_sec;
-		unsigned short int max_lines;
-		struct winsize ws;
-		char clear_screen[] = {0x1b, '[', 'H',
-				       0x1b, '[', 'J',
-				       0x0};
-		int percent;
-		int len;
-
-		t1 = gettime();
-
-		ring_reset(&render_ring);
-		ring_reset(&bsd_ring);
-		ring_reset(&bsd6_ring);
-		ring_reset(&blt_ring);
-
-		for (i = 0; i < samples_per_sec; i++) {
-			long long interval;
-			ti = gettime();
-			if (IS_965(devid)) {
-				instdone = INREG(INSTDONE_I965);
-				instdone1 = INREG(INSTDONE_1);
-			} else
-				instdone = INREG(INSTDONE);
-
-			for (j = 0; j < num_instdone_bits; j++)
-				update_idle_bit(&top_bits[j]);
-
-			ring_sample(&render_ring);
-			ring_sample(&bsd_ring);
-			ring_sample(&bsd6_ring);
-			ring_sample(&blt_ring);
-
-			tf = gettime();
-			if (tf - t1 >= 1000000) {
-				/* We are out of sync, bail out */
-				last_samples_per_sec = i+1;
-				break;
-			}
-			interval = def_sleep - (tf - ti);
-			if (interval > 0)
-				usleep(interval);
-		}
+		freq[0] = pmu_calc(&engines->freq_req.val, 1.0, t, 1);
+		freq[1] = pmu_calc(&engines->freq_act.val, 1.0, t, 1);
+		irq = pmu_calc(&engines->irq.val, 1.0, t, 1);
+		rc6 = pmu_calc(&engines->rc6.val, 1e9, t, 100);
+		power = pmu_calc(&engines->rapl.val, 1.0, t,
+				 engines->rapl_scale);
 
-		if (HAS_STATS_REGS(devid)) {
-			for (i = 0; i < STATS_COUNT; i++) {
-				uint32_t stats_high, stats_low, stats_high_2;
+		printf("intel-gpu-top - %4.0f/%4.0f MHz;  %3.0f%% RC6; %6.0fmW; %8.0f irqs/s\n",
+		       freq[0], freq[1], rc6, power, irq);
+		lines++;
 
-				do {
-					stats_high = INREG(stats_regs[i] + 4);
-					stats_low = INREG(stats_regs[i]);
-					stats_high_2 = INREG(stats_regs[i] + 4);
-				} while (stats_high != stats_high_2);
+		printf("\n");
+		lines++;
 
-				stats[i] = (uint64_t)stats_high << 32 |
-					stats_low;
-			}
-		}
+		for (i = 0; i < engines->num_engines && lines < con_h; i++) {
+			struct engine *engine = engine_ptr(engines, i);
+			unsigned int max_w = con_w - 1;
+			unsigned int len;
+			double val[2];
+			char buf[128];
 
-		qsort(top_bits_sorted, num_instdone_bits,
-		      sizeof(struct top_bit *), top_bits_sort);
-
-		/* Limit the number of lines printed to the terminal height so the
-		 * most important info (at the top) will stay on screen. */
-		max_lines = -1;
-		if (ioctl(0, TIOCGWINSZ, &ws) != -1)
-			max_lines = ws.ws_row - 6; /* exclude header lines */
-		if (max_lines >= num_instdone_bits)
-			max_lines = num_instdone_bits;
-
-		t2 = gettime();
-		elapsed_time += (t2 - t1) / 1000000.0;
-
-		if (interactive) {
-			printf("%s", clear_screen);
-			print_clock_info(pci_dev);
-
-			ring_print(&render_ring, last_samples_per_sec);
-			ring_print(&bsd_ring, last_samples_per_sec);
-			ring_print(&bsd6_ring, last_samples_per_sec);
-			ring_print(&blt_ring, last_samples_per_sec);
-
-			printf("\n%30s  %s\n", "task", "percent busy");
-			for (i = 0; i < max_lines; i++) {
-				if (top_bits_sorted[i]->count > 0) {
-					percent = (top_bits_sorted[i]->count * 100) /
-						last_samples_per_sec;
-					len = printf("%30s: %3d%%: ",
-							 top_bits_sorted[i]->bit->name,
-							 percent);
-					print_percentage_bar (percent, len);
-				} else {
-					printf("%*s", PERCENTAGE_BAR_END, "");
-				}
-
-				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-					printf("%13s: %llu (%lld/sec)",
-						   stats_reg_names[i],
-						   (long long)stats[i],
-						   (long long)(stats[i] - last_stats[i]));
-					last_stats[i] = stats[i];
-				} else {
-					if (!top_bits_sorted[i]->count)
-						break;
-				}
-				printf("\n");
-			}
-		}
-		if (output) {
-			/* Print headers for columns at first run */
-			if (print_headers) {
-				fprintf(output, "# time\t");
-				ring_print_header(output, &render_ring);
-				ring_print_header(output, &bsd_ring);
-				ring_print_header(output, &bsd6_ring);
-				ring_print_header(output, &blt_ring);
-				for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
-					if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-						fprintf(output, "%.6s\t",
-							   stats_reg_names[i]
-							   );
-					}
-					if (!top_bits[i].count)
-						continue;
-				}
-				fprintf(output, "\n");
-				print_headers = 0;
-			}
-
-			/* Print statistics */
-			fprintf(output, "%.2f\t", elapsed_time);
-			ring_log(&render_ring, last_samples_per_sec, output);
-			ring_log(&bsd_ring, last_samples_per_sec, output);
-			ring_log(&bsd6_ring, last_samples_per_sec, output);
-			ring_log(&blt_ring, last_samples_per_sec, output);
-
-			for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
-				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-					fprintf(output, "%"PRIu64"\t",
-						   stats[i] - last_stats[i]);
-					last_stats[i] = stats[i];
-				}
-					if (!top_bits[i].count)
-						continue;
-			}
-			fprintf(output, "\n");
-			fflush(output);
-		}
+			val[0] = pmu_calc(&engine->wait.val, 1e9, t, 100);
+			val[1] = pmu_calc(&engine->sema.val, 1e9, t, 100);
+			len = snprintf(buf, sizeof(buf),
+				       "%6.2f%% wait, %6.2f%% sema",
+				       val[0], val[1]);
 
-		for (i = 0; i < num_instdone_bits; i++) {
-			top_bits_sorted[i]->count = 0;
+			val[0] = pmu_calc(&engine->busy.val, 1e9, t, 100);
+			len += printf("%8s %6.2f%% ",
+				      engine->name, val[0]);
+			print_percentage_bar(val[0], max_w - len);
 
-			if (i < STATS_COUNT)
-				last_stats[i] = stats[i];
-		}
+			printf("%s\n", buf);
 
-		/* Check if child has gone */
-		if (child_pid > 0) {
-			int res;
-			if ((res = waitpid(child_pid, &child_stat, WNOHANG)) == -1) {
-				perror("waitpid");
-				exit(1);
-			}
-			if (res == 0)
-				continue;
-			if (WIFEXITED(child_stat))
-				break;
+			lines++;
 		}
-	}
 
-	fclose(output);
+		printf("\n");
+
+		usleep(period_us);
+	}
 
-	intel_register_access_fini();
 	return 0;
 }
diff --git a/tools/meson.build b/tools/meson.build
index bd2d313d5156..a918eeb0bef1 100644
--- a/tools/meson.build
+++ b/tools/meson.build
@@ -23,7 +23,6 @@ tools_progs = [
 	'intel_gpu_frequency',
 	'intel_firmware_decode',
 	'intel_gpu_time',
-	'intel_gpu_top',
 	'intel_gtt',
 	'intel_guc_logger',
 	'intel_infoframes',
@@ -117,6 +116,11 @@ shared_library('intel_aubdump', 'aubdump.c',
 	       name_prefix : '',
 	       install : true)
 
+executable('intel_gpu_top', 'intel_gpu_top.c',
+	   install : true,
+	   install_rpath : rpathdir,
+	   dependencies : tool_deps + [ lib_igt_perf ])
+
 conf_data = configuration_data()
 conf_data.set('prefix', prefix)
 conf_data.set('exec_prefix', '${prefix}')
-- 
2.14.1

_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [igt-dev] [RFC i-g-t] intel-gpu-top: Rewrite the tool to be safe to use
  2018-03-28 20:11 ` Rinat Ibragimov
@ 2018-03-29 10:49     ` Tvrtko Ursulin
  0 siblings, 0 replies; 57+ messages in thread
From: Tvrtko Ursulin @ 2018-03-29 10:49 UTC (permalink / raw)
  To: Rinat Ibragimov, Tvrtko Ursulin; +Cc: igt-dev, Intel-gfx


On 28/03/2018 21:11, Rinat Ibragimov wrote:
> 
>> Среда, 28 марта 2018, 21:30 +03:00 от Tvrtko Ursulin <tursulin@ursulin.net>:
>>
> 
>> +static struct engines *discover_engines(void)
>>   {
>> -uint32_t devid = pci_dev->device_id;
>> -uint16_t gcfgc;
>> +const char *sysfs_root = "/sys/devices/i915/events";
> 
> Just a question.
> I think, I have Linux 4.15.11 (from Debian testing) now. And there are no such files.
> Are there any estimates about when this feature is expected to be available?

4.17, I think. I could make it work with 4.16 as well if there is
demand; I would just need to ignore counters that are enumerated in
sysfs but not actually present in hardware.

>> -static void ring_init(struct ring *ring)
>> +static uint64_t pmu_read_multi(int fd, unsigned int num, uint64_t *val)
>>   {
>> -ring->size = (((ring_read(ring, RING_LEN) & RING_NR_PAGES) >> 12) + 1) * 4096;
>> +uint64_t buf[2 + num];
>> +unsigned int i;
>> +
>> +assert(read(fd, buf, sizeof(buf)) == sizeof(buf));
> 
> Will have undesired effects with NDEBUG.
> 
>> -int full;
>> +uint64_t data[2];
>>   
>> -if (!ring->size)
>> -return;
>> +assert(read(fd, data, sizeof(data)) == sizeof(data));
> 
> Same here.

Thanks, got a bit disconnected from userspace development patterns over 
the years.

> 
>> +/* Get terminal size. */
>> +if (ioctl(0, TIOCGWINSZ, &ws) != -1) {
>> +con_w = ws.ws_col;
>> +con_h = ws.ws_row;
>> +}
> 
> If you move this into the loop itself, the tool will adapt to changes in
> terminal width and height dynamically.

Makes sense, done in v2.

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Intel-gfx] [igt-dev] [RFC i-g-t] intel-gpu-top: Rewrite the tool to be safe to use
@ 2018-03-29 10:49     ` Tvrtko Ursulin
  0 siblings, 0 replies; 57+ messages in thread
From: Tvrtko Ursulin @ 2018-03-29 10:49 UTC (permalink / raw)
  To: Rinat Ibragimov, Tvrtko Ursulin; +Cc: igt-dev, Intel-gfx


On 28/03/2018 21:11, Rinat Ibragimov wrote:
> 
>> Среда, 28 марта 2018, 21:30 +03:00 от Tvrtko Ursulin <tursulin@ursulin.net>:
>>
> 
>> +static struct engines *discover_engines(void)
>>   {
>> -uint32_t devid = pci_dev->device_id;
>> -uint16_t gcfgc;
>> +const char *sysfs_root = "/sys/devices/i915/events";
> 
> Just a question.
> I think, I have Linux 4.15.11 (from Debian testing) now. And there are no such files.
> Are there any estimates about when this feature is expected to be available?

4.17, I think. I could make it work with 4.16 as well if there is
demand; I would just need to ignore counters that are enumerated in
sysfs but not actually present in hardware.

>> -static void ring_init(struct ring *ring)
>> +static uint64_t pmu_read_multi(int fd, unsigned int num, uint64_t *val)
>>   {
>> -ring->size = (((ring_read(ring, RING_LEN) & RING_NR_PAGES) >> 12) + 1) * 4096;
>> +uint64_t buf[2 + num];
>> +unsigned int i;
>> +
>> +assert(read(fd, buf, sizeof(buf)) == sizeof(buf));
> 
> Will have undesired effects with NDEBUG.
> 
>> -int full;
>> +uint64_t data[2];
>>   
>> -if (!ring->size)
>> -return;
>> +assert(read(fd, data, sizeof(data)) == sizeof(data));
> 
> Same here.

Thanks, got a bit disconnected from userspace development patterns over 
the years.

> 
>> +/* Get terminal size. */
>> +if (ioctl(0, TIOCGWINSZ, &ws) != -1) {
>> +con_w = ws.ws_col;
>> +con_h = ws.ws_row;
>> +}
> 
> If you move this into the loop itself, the tool will adapt to changes in
> terminal width and height dynamically.

Makes sense, done in v2.

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH i-g-t v2] intel-gpu-top: Rewrite the tool to be safe to use
  2018-03-29 10:33   ` [igt-dev] " Tvrtko Ursulin
@ 2018-03-29 14:30     ` Eero Tamminen
  -1 siblings, 0 replies; 57+ messages in thread
From: Eero Tamminen @ 2018-03-29 14:30 UTC (permalink / raw)
  To: Tvrtko Ursulin, igt-dev; +Cc: Rinat Ibragimov, Intel-gfx

Hi,

I tested this on HSW GT2, BYT, BDW GT3, SKL GT2 and KBL GT3e,
with Ubuntu 16.04 and 17.10, using Ubuntu default kernels (4.4 to 4.13)
and latest drm-tip build (4.16.0-rc7).


General comments
----------------

This will be used by our customers and people who aren't necessarily
familiar with i915 internal details.  Therefore it should use
common terminology in the field and in similar tools, instead of
I3As (Intel 3-letter Acronyms).

For example:
  - rcs -> 3D render
  - bcs -> blitter
  - vecs -> video
  - vcs -> video decode
etc.


The old tool also showed GPU system memory interface (GAM) busyness.
That was valuable info, and reasonably accurate for stable loads.

Could this tool also show either that information (preferred), or
the bandwidth utilized by the GPU/CPU/display?

(Latest kernels offer GPU memory bandwidth usage through perf
"uncore_imc" "data_reads" & "data_writes" counters.)


Is the "wait" value supposed to be the IO-wait for the given engine interface?

I never saw that change from 0%, although IO-wait in top jumped
from 0 to 20-30% with my test GPU load.


HW specific test results
------------------------

BYT:
* Reports "Failed to initialize PMU!" although old intel_gpu_top
   works fine.

HSW GT2, BDW GT3, SKL GT2 and KBL GT3e seem to work fine except
for the "wait" value.

I never saw the blitter engine do anything, but that's because
modesetting uses just the 3D pipeline, and because I couldn't get
the Intel DDX to work with the rest of the latest git version of the
X / 3D stack.



Kernel version support
----------------------

My HW specific testing above was with drm-tip kernel, but I did one test
also with Ubuntu 16.04 v4.4 kernel (which includes v4.6 or v4.8 i915 
backport) on KBL.  For that, the tool reported:
"Failed to detect engines!"

Although the previous intel_gpu_top works fine with that kernel version.

The same also happens with the Ubuntu 17.10 v4.13 kernel.


-> If the new version needs a certain kernel version, it should say
    which version is required.



	- Eero

On 29.03.2018 13:33, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> intel-gpu-top is a dangerous tool which can hang machines due to unsafe mmio
> register access. This patch rewrites it to use only PMU.
> 
> Only overall command streamer busyness and GPU global data such as power
> and frequencies are included in this new version.
> 
> For access to more GPU functional unit level data, an OA metric based tool
> like gpu-top should be used instead.
> 
> v2:
>   * Sort engines by class and instance.
>   * Do not wait for one sampling period to display something on screen.
>   * Move code out of the asserts. (Rinat Ibragimov)
>   * Continuously adapt to terminal size. (Rinat Ibragimov)
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
> Cc: Petri Latvala <petri.latvala@intel.com>
> Cc: Eero Tamminen <eero.t.tamminen@intel.com>
> Cc: Rinat Ibragimov <ibragimovrinat@mail.ru>
> Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> # v1
> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> # v0.5
> ---
>   tools/Makefile.am     |    2 +
>   tools/intel_gpu_top.c | 1009 +++++++++++++++++++++----------------------------
>   tools/meson.build     |    6 +-
>   3 files changed, 441 insertions(+), 576 deletions(-)
> 
> diff --git a/tools/Makefile.am b/tools/Makefile.am
> index 09b6dbcc3ece..a0b016ddd7ff 100644
> --- a/tools/Makefile.am
> +++ b/tools/Makefile.am
> @@ -28,6 +28,8 @@ intel_aubdump_la_LDFLAGS = -module -avoid-version -no-undefined
>   intel_aubdump_la_SOURCES = aubdump.c
>   intel_aubdump_la_LIBADD = $(top_builddir)/lib/libintel_tools.la -ldl
>   
> +intel_gpu_top_LDADD = $(top_builddir)/lib/libigt_perf.la
> +
>   bin_SCRIPTS = intel_aubdump
>   CLEANFILES = $(bin_SCRIPTS)
>   
> diff --git a/tools/intel_gpu_top.c b/tools/intel_gpu_top.c
> index 098e6ce3ff86..94091d97c4a3 100644
> --- a/tools/intel_gpu_top.c
> +++ b/tools/intel_gpu_top.c
> @@ -1,6 +1,5 @@
>   /*
> - * Copyright © 2007 Intel Corporation
> - * Copyright © 2011 Intel Corporation
> + * Copyright © 2018 Intel Corporation
>    *
>    * Permission is hereby granted, free of charge, to any person obtaining a
>    * copy of this software and associated documentation files (the "Software"),
> @@ -18,701 +17,561 @@
>    * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
>    * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
>    * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> - * DEALINGS IN THE SOFTWARE.
> - *
> - * Authors:
> - *    Eric Anholt <eric@anholt.net>
> - *    Eugeni Dodonov <eugeni.dodonov@intel.com>
> - *
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
>    */
>   
> -#include "config.h"
> -
> -#include <inttypes.h>
> -#include <unistd.h>
> -#include <stdlib.h>
>   #include <stdio.h>
> -#include <err.h>
> -#include <sys/ioctl.h>
> -#include <sys/time.h>
> -#include <sys/wait.h>
> +#include <sys/types.h>
> +#include <dirent.h>
> +#include <stdint.h>
> +#include <assert.h>
>   #include <string.h>
> -#ifdef HAVE_TERMIOS_H
> -#include <termios.h>
> -#endif
> -#include "intel_io.h"
> -#include "instdone.h"
> -#include "intel_reg.h"
> -#include "intel_chipset.h"
> -#include "drmtest.h"
> -
> -#define  FORCEWAKE	    0xA18C
> -#define  FORCEWAKE_ACK	    0x130090
> -
> -#define SAMPLES_PER_SEC             10000
> -#define SAMPLES_TO_PERCENT_RATIO    (SAMPLES_PER_SEC / 100)
> -
> -#define MAX_NUM_TOP_BITS            100
> -
> -#define HAS_STATS_REGS(devid)		IS_965(devid)
> -
> -struct top_bit {
> -	struct instdone_bit *bit;
> -	int count;
> -} top_bits[MAX_NUM_TOP_BITS];
> -struct top_bit *top_bits_sorted[MAX_NUM_TOP_BITS];
> -
> -static uint32_t instdone, instdone1;
> -
> -static const char *bars[] = {
> -	" ",
> -	"▏",
> -	"▎",
> -	"▍",
> -	"▌",
> -	"▋",
> -	"▊",
> -	"▉",
> -	"█"
> -};
> +#include <ctype.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <sys/stat.h>
> +#include <fcntl.h>
> +#include <inttypes.h>
> +#include <sys/ioctl.h>
> +#include <errno.h>
> +#include <math.h>
> +#include <locale.h>
> +
> +#include "igt_perf.h"
>   
> -enum stats_counts {
> -	IA_VERTICES,
> -	IA_PRIMITIVES,
> -	VS_INVOCATION,
> -	GS_INVOCATION,
> -	GS_PRIMITIVES,
> -	CL_INVOCATION,
> -	CL_PRIMITIVES,
> -	PS_INVOCATION,
> -	PS_DEPTH,
> -	STATS_COUNT
> +struct pmu_pair {
> +	uint64_t cur;
> +	uint64_t prev;
>   };
>   
> -const uint32_t stats_regs[STATS_COUNT] = {
> -	IA_VERTICES_COUNT_QW,
> -	IA_PRIMITIVES_COUNT_QW,
> -	VS_INVOCATION_COUNT_QW,
> -	GS_INVOCATION_COUNT_QW,
> -	GS_PRIMITIVES_COUNT_QW,
> -	CL_INVOCATION_COUNT_QW,
> -	CL_PRIMITIVES_COUNT_QW,
> -	PS_INVOCATION_COUNT_QW,
> -	PS_DEPTH_COUNT_QW,
> +struct pmu_counter {
> +	uint64_t config;
> +	unsigned int idx;
> +	struct pmu_pair val;
>   };
>   
> -const char *stats_reg_names[STATS_COUNT] = {
> -	"vert fetch",
> -	"prim fetch",
> -	"VS invocations",
> -	"GS invocations",
> -	"GS prims",
> -	"CL invocations",
> -	"CL prims",
> -	"PS invocations",
> -	"PS depth pass",
> +struct engine {
> +	const char *name;
> +	struct pmu_counter busy;
> +	struct pmu_counter wait;
> +	struct pmu_counter sema;
>   };
>   
> -uint64_t stats[STATS_COUNT];
> -uint64_t last_stats[STATS_COUNT];
> +struct engines {
> +	unsigned int num_engines;
> +	unsigned int num_counters;
> +	DIR *root;
> +	int fd;
> +	struct pmu_pair ts;
>   
> -static unsigned long
> -gettime(void)
> -{
> -    struct timeval t;
> -    gettimeofday(&t, NULL);
> -    return (t.tv_usec + (t.tv_sec * 1000000));
> -}
> +	int rapl_fd;
> +	double rapl_scale;
>   
> -static int
> -top_bits_sort(const void *a, const void *b)
> +	struct pmu_counter freq_req;
> +	struct pmu_counter freq_act;
> +	struct pmu_counter irq;
> +	struct pmu_counter rc6;
> +	struct pmu_counter rapl;
> +
> +	struct engine engine;
> +};
> +
> +static uint64_t
> +get_pmu_config(int dirfd, const char *name, const char *counter)
>   {
> -	struct top_bit * const *bit_a = a;
> -	struct top_bit * const *bit_b = b;
> -	int a_count = (*bit_a)->count;
> -	int b_count = (*bit_b)->count;
> +	char buf[128], *p;
> +	int fd, ret;
>   
> -	if (a_count < b_count)
> -		return 1;
> -	else if (a_count == b_count)
> -		return 0;
> -	else
> +	ret = snprintf(buf, sizeof(buf), "%s-%s", name, counter);
> +	if (ret < 0 || ret == sizeof(buf))
>   		return -1;
> -}
>   
> -static void
> -update_idle_bit(struct top_bit *top_bit)
> -{
> -	uint32_t reg_val;
> +	fd = openat(dirfd, buf, O_RDONLY);
> +	if (fd < 0)
> +		return -1;
>   
> -	if (top_bit->bit->reg == INSTDONE_1)
> -		reg_val = instdone1;
> -	else
> -		reg_val = instdone;
> +	ret = read(fd, buf, sizeof(buf));
> +	close(fd);
> +	if (ret <= 0)
> +		return -1;
> +
> +	p = index(buf, '0');
> +	if (!p)
> +		return -1;
>   
> -	if ((reg_val & top_bit->bit->bit) == 0)
> -		top_bit->count++;
> +	return strtoul(p, NULL, 0);
>   }
>   
> -static void
> -print_clock(const char *name, int clock) {
> -	if (clock == -1)
> -		printf("%s clock: unknown", name);
> +#define engine_ptr(engines, n) \
> +	((struct engine *)((unsigned char *)(&engines->engine) + (n) * sizeof(struct engine)))
> +
> +static int engine_cmp(const void *__a, const void *__b)
> +{
> +	const struct engine *a = (struct engine *)__a;
> +	const struct engine *b = (struct engine *)__b;
> +	int class_a = (a->busy.config & (__I915_PMU_OTHER(0) - 1)) >>
> +		      I915_PMU_CLASS_SHIFT;
> +	int class_b = (b->busy.config & (__I915_PMU_OTHER(0) - 1)) >>
> +		      I915_PMU_CLASS_SHIFT;
> +	int instance_a = (a->busy.config >> I915_PMU_SAMPLE_BITS) &
> +			 ((1 << I915_PMU_SAMPLE_INSTANCE_BITS) - 1);
> +	int instance_b = (b->busy.config >> I915_PMU_SAMPLE_BITS) &
> +			 ((1 << I915_PMU_SAMPLE_INSTANCE_BITS) - 1);
> +
> +	if (class_a != class_b)
> +		return class_a - class_b;
>   	else
> -		printf("%s clock: %d Mhz", name, clock);
> +		return instance_a - instance_b;
>   }
>   
> -static int
> -print_clock_info(struct pci_device *pci_dev)
> +static struct engines *discover_engines(void)
>   {
> -	uint32_t devid = pci_dev->device_id;
> -	uint16_t gcfgc;
> +	const char *sysfs_root = "/sys/devices/i915/events";
> +	struct engines *engines;
> +	struct dirent *dent;
> +	int ret = 0;
> +	DIR *d;
>   
> -	if (IS_GM45(devid)) {
> -		int core_clock = -1;
> +	engines = malloc(sizeof(struct engines));
> +	if (!engines)
> +		return NULL;
>   
> -		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +	memset(engines, 0, sizeof(*engines));
>   
> -		switch (gcfgc & 0xf) {
> -		case 8:
> -			core_clock = 266;
> -			break;
> -		case 9:
> -			core_clock = 320;
> -			break;
> -		case 11:
> -			core_clock = 400;
> -			break;
> -		case 13:
> -			core_clock = 533;
> -			break;
> -		}
> -		print_clock("core", core_clock);
> -	} else if (IS_965(devid) && IS_MOBILE(devid)) {
> -		int render_clock = -1, sampler_clock = -1;
> +	engines->num_engines = 0;
>   
> -		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +	d = opendir(sysfs_root);
> +	if (!d)
> +		return NULL;
>   
> -		switch (gcfgc & 0xf) {
> -		case 2:
> -			render_clock = 250; sampler_clock = 267;
> -			break;
> -		case 3:
> -			render_clock = 320; sampler_clock = 333;
> -			break;
> -		case 4:
> -			render_clock = 400; sampler_clock = 444;
> -			break;
> -		case 5:
> -			render_clock = 500; sampler_clock = 533;
> -			break;
> -		}
> -
> -		print_clock("render", render_clock);
> -		printf("  ");
> -		print_clock("sampler", sampler_clock);
> -	} else if (IS_945(devid) && IS_MOBILE(devid)) {
> -		int render_clock = -1, display_clock = -1;
> +	while ((dent = readdir(d)) != NULL) {
> +		const char *endswith = "-busy";
> +		const unsigned int endlen = strlen(endswith);
> +		struct engine *engine =
> +				engine_ptr(engines, engines->num_engines);
> +		char buf[256];
>   
> -		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +		if (dent->d_type != DT_REG)
> +			continue;
>   
> -		switch (gcfgc & 0x7) {
> -		case 0:
> -			render_clock = 166;
> -			break;
> -		case 1:
> -			render_clock = 200;
> -			break;
> -		case 3:
> -			render_clock = 250;
> -			break;
> -		case 5:
> -			render_clock = 400;
> +		if (strlen(dent->d_name) >= sizeof(buf)) {
> +			ret = -1;
>   			break;
>   		}
>   
> -		switch (gcfgc & 0x70) {
> -		case 0:
> -			display_clock = 200;
> -			break;
> -		case 4:
> -			display_clock = 320;
> -			break;
> -		}
> -		if (gcfgc & (1 << 7))
> -		    display_clock = 133;
> +		strcpy(buf, dent->d_name);
>   
> -		print_clock("render", render_clock);
> -		printf("  ");
> -		print_clock("display", display_clock);
> -	} else if (IS_915(devid) && IS_MOBILE(devid)) {
> -		int render_clock = -1, display_clock = -1;
> +		/* xxxN-busy */
> +		if (strlen(buf) < (endlen + 4))
> +			continue;
> +		if (strcmp(&buf[strlen(buf) - endlen], endswith))
> +			continue;
>   
> -		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +		memset(engine, 0, sizeof(*engine));
>   
> -		switch (gcfgc & 0x7) {
> -		case 0:
> -			render_clock = 160;
> -			break;
> -		case 1:
> -			render_clock = 190;
> -			break;
> -		case 4:
> -			render_clock = 333;
> +		buf[strlen(buf) - endlen] = 0;
> +		engine->name = strdup(buf);
> +		if (!engine->name) {
> +			ret = -1;
>   			break;
>   		}
> -		if (gcfgc & (1 << 13))
> -		    render_clock = 133;
>   
> -		switch (gcfgc & 0x70) {
> -		case 0:
> -			display_clock = 190;
> +		engine->busy.config = get_pmu_config(dirfd(d), engine->name,
> +						     "busy");
> +		if (engine->busy.config == -1) {
> +			ret = -1;
>   			break;
> -		case 4:
> -			display_clock = 333;
> +		}
> +
> +		engines->num_engines++;
> +		engines = realloc(engines, sizeof(struct engines) +
> +				  engines->num_engines * sizeof(struct engine));
> +		if (!engines) {
> +			ret = -ENOMEM;
>   			break;
>   		}
> -		if (gcfgc & (1 << 7))
> -		    display_clock = 133;
> +	}
> +
> +	if (ret)
> +		free(engines);
> +	else {
> +		qsort(engine_ptr(engines, 0), engines->num_engines,
> +		      sizeof(struct engine), engine_cmp);
>   
> -		print_clock("render", render_clock);
> -		printf("  ");
> -		print_clock("display", display_clock);
> +		engines->root = d;
>   	}
>   
> +	return ret == 0 ? engines : NULL;
> +}
> +
> +static int
> +filename_to_buf(const char *filename, char *buf, unsigned int bufsize)
> +{
> +	int fd;
> +	ssize_t ret;
> +
> +	fd = open(filename, O_RDONLY);
> +	if (fd < 0)
> +		return -1;
> +
> +	ret = read(fd, buf, bufsize - 1);
> +	close(fd);
> +	if (ret < 1)
> +		return -1;
> +
> +	buf[ret] = '\0';
>   
> -	printf("\n");
> -	return -1;
> +	return 0;
>   }
>   
> -#define STATS_LEN (20)
> -#define PERCENTAGE_BAR_END	(79 - STATS_LEN)
> +static uint64_t filename_to_u64(const char *filename, int base)
> +{
> +	char buf[64], *b;
>   
> -static void
> -print_percentage_bar(float percent, int cur_line_len)
> +	if (filename_to_buf(filename, buf, sizeof(buf)))
> +		return 0;
> +
> +	/*
> +	 * Handle both single integer and key=value formats by skipping
> +	 * leading non-digits.
> +	 */
> +	b = buf;
> +	while (*b && !isdigit(*b))
> +		b++;
> +
> +	return strtoull(b, NULL, base);
> +}
> +
> +static uint64_t rapl_type_id(void)
>   {
> -	int bar_avail_len = (PERCENTAGE_BAR_END - cur_line_len - 1) * 8;
> -	int bar_len = bar_avail_len * (percent + .5) / 100.0;
> -	int i;
> +	return filename_to_u64("/sys/devices/power/type", 10);
> +}
>   
> -	for (i = bar_len; i >= 8; i -= 8) {
> -		printf("%s", bars[8]);
> -		cur_line_len++;
> -	}
> -	if (i) {
> -		printf("%s", bars[i]);
> -		cur_line_len++;
> -	}
> +static uint64_t rapl_gpu_power(void)
> +{
> +	return filename_to_u64("/sys/devices/power/events/energy-gpu", 0);
> +}
> +
> +static double filename_to_double(const char *filename)
> +{
> +	char *oldlocale;
> +	char buf[80];
> +	double v;
> +
> +	if (filename_to_buf(filename, buf, sizeof(buf)))
> +		return 0;
>   
> -	/* NB: We can't use a field width with utf8 so we manually
> -	* guarantee a field with of 45 chars for any bar. */
> -	printf("%*s", PERCENTAGE_BAR_END - cur_line_len, "");
> +	oldlocale = setlocale(LC_ALL, "C");
> +	v = strtod(buf, NULL);
> +	setlocale(LC_ALL, oldlocale);
> +
> +	return v;
>   }
>   
> -struct ring {
> -	const char *name;
> -	uint32_t mmio;
> -	int head, tail, size;
> -	uint64_t full;
> -	int idle;
> -};
> +static double rapl_gpu_power_scale(void)
> +{
> +	return filename_to_double("/sys/devices/power/events/energy-gpu.scale");
> +}
>   
> -static uint32_t ring_read(struct ring *ring, uint32_t reg)
> +#define __open_pmu(engines, pmu, idx) \
> +({ \
> +	int fd__; \
> +\
> +	fd__ = perf_i915_open_group((pmu)->config, (engines)->fd); \
> +	if (fd__ >= 0) { \
> +		if ((engines)->fd == -1) \
> +			(engines)->fd = fd__; \
> +		(pmu)->idx = (idx)++; \
> +		(engines)->num_counters++; \
> +	} \
> +\
> +	fd__; \
> +})
> +
> +static int pmu_init(struct engines *engines)
>   {
> -	return INREG(ring->mmio + reg);
> +	unsigned int idx = 0;
> +	unsigned int i;
> +	int fd;
> +
> +	engines->fd = -1;
> +	engines->num_counters = 0;
> +
> +	engines->freq_req.config = I915_PMU_REQUESTED_FREQUENCY;
> +	fd = __open_pmu(engines, &engines->freq_req, idx);
> +	if (fd < 0)
> +		return -1;
> +
> +	engines->freq_act.config = I915_PMU_ACTUAL_FREQUENCY;
> +	fd = __open_pmu(engines, &engines->freq_act, idx);
> +	if (fd < 0)
> +		return -1;
> +
> +	engines->irq.config = I915_PMU_INTERRUPTS;
> +	fd = __open_pmu(engines, &engines->irq, idx);
> +	if (fd < 0)
> +		return -1;
> +
> +	engines->rc6.config = I915_PMU_RC6_RESIDENCY;
> +	fd = __open_pmu(engines, &engines->rc6, idx);
> +	if (fd < 0)
> +		return -1;
> +
> +	for (i = 0; i < engines->num_engines; i++) {
> +		struct engine *engine = engine_ptr(engines, i);
> +		struct {
> +			struct pmu_counter *pmu;
> +			const char *counter;
> +		} *cnt, counters[] = {
> +			{ .pmu = &engine->busy, .counter = "busy" },
> +			{ .pmu = &engine->wait, .counter = "wait" },
> +			{ .pmu = &engine->sema, .counter = "sema" },
> +			{ .pmu = NULL, .counter = NULL },
> +		};
> +
> +		for (cnt = counters; cnt->pmu; cnt++) {
> +			if (!cnt->pmu->config)
> +				cnt->pmu->config =
> +					get_pmu_config(dirfd(engines->root),
> +						       engine->name,
> +						       cnt->counter);
> +			fd = __open_pmu(engines, cnt->pmu, idx);
> +			if (fd < 0)
> +				return -1;
> +		}
> +	}
> +
> +	engines->rapl_scale = rapl_gpu_power_scale();
> +	if (engines->rapl_scale != NAN)
> +		engines->rapl_scale *= 1e3; /* from nano to micro */
> +	engines->rapl.config = rapl_gpu_power();
> +	engines->rapl_fd = igt_perf_open(rapl_type_id(), engines->rapl.config);
> +	if (engines->rapl_fd < 0)
> +		return -1;
> +
> +	return 0;
>   }
>   
> -static void ring_init(struct ring *ring)
> +static uint64_t pmu_read_multi(int fd, unsigned int num, uint64_t *val)
>   {
> -	ring->size = (((ring_read(ring, RING_LEN) & RING_NR_PAGES) >> 12) + 1) * 4096;
> +	uint64_t buf[2 + num];
> +	unsigned int i;
> +	ssize_t len;
> +
> +	memset(buf, 0, sizeof(buf));
> +
> +	len = read(fd, buf, sizeof(buf));
> +	assert(len == sizeof(buf));
> +
> +	for (i = 0; i < num; i++)
> +		val[i] = buf[2 + i];
> +
> +	return buf[1];
>   }
>   
> -static void ring_reset(struct ring *ring)
> +static double pmu_calc(struct pmu_pair *p, double d, double t, double s)
>   {
> -	ring->idle = ring->full = 0;
> +	double pct;
> +
> +	pct = p->cur - p->prev;
> +	pct /= d;
> +	pct /= t;
> +	pct *= s;
> +
> +	if (s == 100.0 && pct > 100.0)
> +		pct = 100.0;
> +
> +	return pct;
>   }
>   
> -static void ring_sample(struct ring *ring)
> +static uint64_t __pmu_read_single(int fd, uint64_t *ts)
>   {
> -	int full;
> +	uint64_t data[2] = { };
> +	ssize_t len;
>   
> -	if (!ring->size)
> -		return;
> +	len = read(fd, data, sizeof(data));
> +	assert(len == sizeof(data));
>   
> -	ring->head = ring_read(ring, RING_HEAD) & HEAD_ADDR;
> -	ring->tail = ring_read(ring, RING_TAIL) & TAIL_ADDR;
> +	if (ts)
> +		*ts = data[1];
> +
> +	return data[0];
> +}
>   
> -	if (ring->tail == ring->head)
> -		ring->idle++;
> +static uint64_t pmu_read_single(int fd)
> +{
> +	return __pmu_read_single(fd, NULL);
> +}
>   
> -	full = ring->tail - ring->head;
> -	if (full < 0)
> -		full += ring->size;
> -	ring->full += full;
> +static void __update_sample(struct pmu_counter *counter, uint64_t val)
> +{
> +	counter->val.prev = counter->val.cur;
> +	counter->val.cur = val;
>   }
>   
> -static void ring_print_header(FILE *out, struct ring *ring)
> +static void update_sample(struct pmu_counter *counter, uint64_t *val)
>   {
> -    fprintf(out, "%.6s%%\tops\t",
> -            ring->name
> -          );
> +	__update_sample(counter, val[counter->idx]);
>   }
>   
> -static void ring_print(struct ring *ring, unsigned long samples_per_sec)
> +static void pmu_sample(struct engines *engines)
>   {
> -	int percent_busy, len;
> +	const int num_val = engines->num_counters;
> +	uint64_t val[num_val];
> +	unsigned int i;
> +
> +	engines->ts.prev = engines->ts.cur;
> +	engines->ts.cur = pmu_read_multi(engines->fd, num_val, val);
> +
> +	__update_sample(&engines->rapl, pmu_read_single(engines->rapl_fd));
>   
> -	if (!ring->size)
> -		return;
> +	update_sample(&engines->freq_req, val);
> +	update_sample(&engines->freq_act, val);
> +	update_sample(&engines->irq, val);
> +	update_sample(&engines->rc6, val);
>   
> -	percent_busy = 100 - 100 * ring->idle / samples_per_sec;
> +	for (i = 0; i < engines->num_engines; i++) {
> +		struct engine *engine = engine_ptr(engines, i);
>   
> -	len = printf("%25s busy: %3d%%: ", ring->name, percent_busy);
> -	print_percentage_bar (percent_busy, len);
> -	printf("%24s space: %d/%d\n",
> -		   ring->name,
> -		   (int)(ring->full / samples_per_sec),
> -		   ring->size);
> +		update_sample(&engine->busy, val);
> +		update_sample(&engine->sema, val);
> +		update_sample(&engine->wait, val);
> +	}
>   }
>   
> -static void ring_log(struct ring *ring, unsigned long samples_per_sec,
> -		FILE *output)
> +static const char *bars[] = { " ", "▏", "▎", "▍", "▌", "▋", "▊", "▉", "█" };
> +
> +static void
> +print_percentage_bar(double percent, int max_len)
>   {
> -	if (ring->size)
> -		fprintf(output, "%3d\t%d\t",
> -			(int)(100 - 100 * ring->idle / samples_per_sec),
> -			(int)(ring->full / samples_per_sec));
> -	else
> -		fprintf(output, "-1\t-1\t");
> +	int bar_len = percent * (8 * (max_len - 2)) / 100.0;
> +	int i;
> +
> +	putchar('|');
> +
> +	for (i = bar_len; i >= 8; i -= 8)
> +		printf("%s", bars[8]);
> +	if (i)
> +		printf("%s", bars[i]);
> +
> +	for (i = 0; i < (max_len - 2 - (bar_len + 7) / 8); i++)
> +		putchar(' ');
> +
> +	putchar('|');
>   }
>   
> +#define DEFAULT_PERIOD_MS (1000)
> +
>   static void
>   usage(const char *appname)
>   {
>   	printf("intel_gpu_top - Display a top-like summary of Intel GPU usage\n"
> -			"\n"
> -			"usage: %s [parameters]\n"
> -			"\n"
> -			"The following parameters apply:\n"
> -			"[-s <samples>]       samples per seconds (default %d)\n"
> -			"[-e <command>]       command to profile\n"
> -			"[-o <file>]          output statistics to file. If file is '-',"
> -			"                     run in batch mode and output statistics to stdio only \n"
> -			"[-h]                 show this help screen\n"
> -			"\n",
> -			appname,
> -			SAMPLES_PER_SEC
> -		  );
> -	return;
> +		"\n"
> +		"Usage: %s [parameters]\n"
> +		"\n"
> +		"\tThe following parameters are optional:\n"
> +		"\t[-s <samples>]       refresh period in ms (default %ums)\n"
> +		"\t[-h]                 show this help text\n"
> +		"\n",
> +		appname, DEFAULT_PERIOD_MS);
>   }
>   
>   int main(int argc, char **argv)
>   {
> -	uint32_t devid;
> -	struct pci_device *pci_dev;
> -	struct ring render_ring = {
> -		.name = "render",
> -		.mmio = 0x2030,
> -	}, bsd_ring = {
> -		.name = "bitstream",
> -		.mmio = 0x4030,
> -	}, bsd6_ring = {
> -		.name = "bitstream",
> -		.mmio = 0x12030,
> -	}, blt_ring = {
> -		.name = "blitter",
> -		.mmio = 0x22030,
> -	};
> -	int i, ch;
> -	int samples_per_sec = SAMPLES_PER_SEC;
> -	FILE *output = NULL;
> -	double elapsed_time=0;
> -	int print_headers=1;
> -	pid_t child_pid=-1;
> -	int child_stat;
> -	char *cmd=NULL;
> -	int interactive=1;
> -
> -	/* Parse options? */
> -	while ((ch = getopt(argc, argv, "s:o:e:h")) != -1) {
> +	unsigned int period_us = DEFAULT_PERIOD_MS * 1000;
> +	int con_w = -1, con_h = -1;
> +	struct engines *engines;
> +	unsigned int i;
> +	int ret, ch;
> +
> +	/* Parse options */
> +	while ((ch = getopt(argc, argv, "s:h")) != -1) {
>   		switch (ch) {
> -		case 'e': cmd = strdup(optarg);
> -			break;
> -		case 's': samples_per_sec = atoi(optarg);
> -			if (samples_per_sec < 100) {
> -				fprintf(stderr, "Error: samples per second must be >= 100\n");
> -				exit(1);
> -			}
> -			break;
> -		case 'o':
> -			if (!strcmp(optarg, "-")) {
> -				/* Running in non-interactive mode */
> -				interactive = 0;
> -				output = stdout;
> -			}
> -			else
> -				output = fopen(optarg, "w");
> -			if (!output)
> -			{
> -				perror("fopen");
> -				exit(1);
> -			}
> +		case 's':
> +			period_us = atoi(optarg) * 1000;
>   			break;
>   		case 'h':
>   			usage(argv[0]);
>   			exit(0);
> -			break;
>   		default:
> -			fprintf(stderr, "Invalid flag %c!\n", (char)optopt);
> +			fprintf(stderr, "Invalid option %c!\n", (char)optopt);
>   			usage(argv[0]);
>   			exit(1);
> -			break;
>   		}
>   	}
>   
> -	pci_dev = intel_get_pci_device();
> -	devid = pci_dev->device_id;
> -	intel_mmio_use_pci_bar(pci_dev);
> -	init_instdone_definitions(devid);
> -
> -	/* Do we have a command to run? */
> -	if (cmd != NULL) {
> -		if (output) {
> -			fprintf(output, "# Profiling: %s\n", cmd);
> -			fflush(output);
> -		}
> -		child_pid = fork();
> -		if (child_pid < 0) {
> -			perror("fork");
> -			exit(1);
> -		}
> -		else if (child_pid == 0) {
> -			int res;
> -			res = system(cmd);
> -			if (res < 0)
> -				perror("running command");
> -			if (output) {
> -				fflush(output);
> -				fprintf(output, "# %s exited with status %d\n", cmd, res);
> -				fflush(output);
> -			}
> -			free(cmd);
> -			exit(0);
> -		} else {
> -			free(cmd);
> -		}
> +	engines = discover_engines();
> +	if (!engines) {
> +		fprintf(stderr, "Failed to detect engines!\n");
> +		return 1;
>   	}
>   
> -	for (i = 0; i < num_instdone_bits; i++) {
> -		top_bits[i].bit = &instdone_bits[i];
> -		top_bits[i].count = 0;
> -		top_bits_sorted[i] = &top_bits[i];
> +	ret = pmu_init(engines);
> +	if (ret) {
> +		fprintf(stderr, "Failed to initialize PMU!\n");
> +		return 1;
>   	}
>   
> -	/* Grab access to the registers */
> -	intel_register_access_init(pci_dev, 0, -1);
> +	pmu_sample(engines);
>   
> -	ring_init(&render_ring);
> -	if (IS_GEN4(devid) || IS_GEN5(devid))
> -		ring_init(&bsd_ring);
> -	if (IS_GEN6(devid) || IS_GEN7(devid)) {
> -		ring_init(&bsd6_ring);
> -		ring_init(&blt_ring);
> -	}
> +	for (;;) {
> +		double t, freq[2], irq, rc6, power;
> +		struct winsize ws;
> +		int lines = 0;
>   
> -	/* Initialize GPU stats */
> -	if (HAS_STATS_REGS(devid)) {
> -		for (i = 0; i < STATS_COUNT; i++) {
> -			uint32_t stats_high, stats_low, stats_high_2;
> +		/* Update terminal size. */
> +		if (ioctl(0, TIOCGWINSZ, &ws) != -1) {
> +			con_w = ws.ws_col;
> +			con_h = ws.ws_row;
> +		}
>   
> -			do {
> -				stats_high = INREG(stats_regs[i] + 4);
> -				stats_low = INREG(stats_regs[i]);
> -				stats_high_2 = INREG(stats_regs[i] + 4);
> -			} while (stats_high != stats_high_2);
> +		pmu_sample(engines);
> +		t = (double)(engines->ts.cur - engines->ts.prev) / 1e9;
>   
> -			last_stats[i] = (uint64_t)stats_high << 32 |
> -				stats_low;
> -		}
> -	}
> +		printf("\033[H\033[J");
>   
> -	for (;;) {
> -		int j;
> -		unsigned long long t1, ti, tf, t2;
> -		unsigned long long def_sleep = 1000000 / samples_per_sec;
> -		unsigned long long last_samples_per_sec = samples_per_sec;
> -		unsigned short int max_lines;
> -		struct winsize ws;
> -		char clear_screen[] = {0x1b, '[', 'H',
> -				       0x1b, '[', 'J',
> -				       0x0};
> -		int percent;
> -		int len;
> -
> -		t1 = gettime();
> -
> -		ring_reset(&render_ring);
> -		ring_reset(&bsd_ring);
> -		ring_reset(&bsd6_ring);
> -		ring_reset(&blt_ring);
> -
> -		for (i = 0; i < samples_per_sec; i++) {
> -			long long interval;
> -			ti = gettime();
> -			if (IS_965(devid)) {
> -				instdone = INREG(INSTDONE_I965);
> -				instdone1 = INREG(INSTDONE_1);
> -			} else
> -				instdone = INREG(INSTDONE);
> -
> -			for (j = 0; j < num_instdone_bits; j++)
> -				update_idle_bit(&top_bits[j]);
> -
> -			ring_sample(&render_ring);
> -			ring_sample(&bsd_ring);
> -			ring_sample(&bsd6_ring);
> -			ring_sample(&blt_ring);
> -
> -			tf = gettime();
> -			if (tf - t1 >= 1000000) {
> -				/* We are out of sync, bail out */
> -				last_samples_per_sec = i+1;
> -				break;
> -			}
> -			interval = def_sleep - (tf - ti);
> -			if (interval > 0)
> -				usleep(interval);
> -		}
> +		freq[0] = pmu_calc(&engines->freq_req.val, 1.0, t, 1);
> +		freq[1] = pmu_calc(&engines->freq_act.val, 1.0, t, 1);
> +		irq = pmu_calc(&engines->irq.val, 1.0, t, 1);
> +		rc6 = pmu_calc(&engines->rc6.val, 1e9, t, 100);
> +		power = pmu_calc(&engines->rapl.val, 1.0, t,
> +				 engines->rapl_scale);
>   
> -		if (HAS_STATS_REGS(devid)) {
> -			for (i = 0; i < STATS_COUNT; i++) {
> -				uint32_t stats_high, stats_low, stats_high_2;
> +		printf("intel-gpu-top - %4.0f/%4.0f MHz;  %3.0f%% RC6; %6.0fmW; %8.0f irqs/s\n",
> +		       freq[0], freq[1], rc6, power, irq);
> +		lines++;
>   
> -				do {
> -					stats_high = INREG(stats_regs[i] + 4);
> -					stats_low = INREG(stats_regs[i]);
> -					stats_high_2 = INREG(stats_regs[i] + 4);
> -				} while (stats_high != stats_high_2);
> +		printf("\n");
> +		lines++;
>   
> -				stats[i] = (uint64_t)stats_high << 32 |
> -					stats_low;
> -			}
> -		}
> +		for (i = 0; i < engines->num_engines && lines < con_h; i++) {
> +			struct engine *engine = engine_ptr(engines, i);
> +			unsigned int max_w = con_w - 1;
> +			unsigned int len;
> +			double val[2];
> +			char buf[128];
>   
> -		qsort(top_bits_sorted, num_instdone_bits,
> -		      sizeof(struct top_bit *), top_bits_sort);
> -
> -		/* Limit the number of lines printed to the terminal height so the
> -		 * most important info (at the top) will stay on screen. */
> -		max_lines = -1;
> -		if (ioctl(0, TIOCGWINSZ, &ws) != -1)
> -			max_lines = ws.ws_row - 6; /* exclude header lines */
> -		if (max_lines >= num_instdone_bits)
> -			max_lines = num_instdone_bits;
> -
> -		t2 = gettime();
> -		elapsed_time += (t2 - t1) / 1000000.0;
> -
> -		if (interactive) {
> -			printf("%s", clear_screen);
> -			print_clock_info(pci_dev);
> -
> -			ring_print(&render_ring, last_samples_per_sec);
> -			ring_print(&bsd_ring, last_samples_per_sec);
> -			ring_print(&bsd6_ring, last_samples_per_sec);
> -			ring_print(&blt_ring, last_samples_per_sec);
> -
> -			printf("\n%30s  %s\n", "task", "percent busy");
> -			for (i = 0; i < max_lines; i++) {
> -				if (top_bits_sorted[i]->count > 0) {
> -					percent = (top_bits_sorted[i]->count * 100) /
> -						last_samples_per_sec;
> -					len = printf("%30s: %3d%%: ",
> -							 top_bits_sorted[i]->bit->name,
> -							 percent);
> -					print_percentage_bar (percent, len);
> -				} else {
> -					printf("%*s", PERCENTAGE_BAR_END, "");
> -				}
> -
> -				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
> -					printf("%13s: %llu (%lld/sec)",
> -						   stats_reg_names[i],
> -						   (long long)stats[i],
> -						   (long long)(stats[i] - last_stats[i]));
> -					last_stats[i] = stats[i];
> -				} else {
> -					if (!top_bits_sorted[i]->count)
> -						break;
> -				}
> -				printf("\n");
> -			}
> -		}
> -		if (output) {
> -			/* Print headers for columns at first run */
> -			if (print_headers) {
> -				fprintf(output, "# time\t");
> -				ring_print_header(output, &render_ring);
> -				ring_print_header(output, &bsd_ring);
> -				ring_print_header(output, &bsd6_ring);
> -				ring_print_header(output, &blt_ring);
> -				for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
> -					if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
> -						fprintf(output, "%.6s\t",
> -							   stats_reg_names[i]
> -							   );
> -					}
> -					if (!top_bits[i].count)
> -						continue;
> -				}
> -				fprintf(output, "\n");
> -				print_headers = 0;
> -			}
> -
> -			/* Print statistics */
> -			fprintf(output, "%.2f\t", elapsed_time);
> -			ring_log(&render_ring, last_samples_per_sec, output);
> -			ring_log(&bsd_ring, last_samples_per_sec, output);
> -			ring_log(&bsd6_ring, last_samples_per_sec, output);
> -			ring_log(&blt_ring, last_samples_per_sec, output);
> -
> -			for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
> -				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
> -					fprintf(output, "%"PRIu64"\t",
> -						   stats[i] - last_stats[i]);
> -					last_stats[i] = stats[i];
> -				}
> -					if (!top_bits[i].count)
> -						continue;
> -			}
> -			fprintf(output, "\n");
> -			fflush(output);
> -		}
> +			val[0] = pmu_calc(&engine->wait.val, 1e9, t, 100);
> +			val[1] = pmu_calc(&engine->sema.val, 1e9, t, 100);
> +			len = snprintf(buf, sizeof(buf),
> +				       "%6.2f%% wait, %6.2f%% sema",
> +				       val[0], val[1]);
>   
> -		for (i = 0; i < num_instdone_bits; i++) {
> -			top_bits_sorted[i]->count = 0;
> +			val[0] = pmu_calc(&engine->busy.val, 1e9, t, 100);
> +			len += printf("%8s %6.2f%% ",
> +				      engine->name, val[0]);
> +			print_percentage_bar(val[0], max_w - len);
>   
> -			if (i < STATS_COUNT)
> -				last_stats[i] = stats[i];
> -		}
> +			printf("%s\n", buf);
>   
> -		/* Check if child has gone */
> -		if (child_pid > 0) {
> -			int res;
> -			if ((res = waitpid(child_pid, &child_stat, WNOHANG)) == -1) {
> -				perror("waitpid");
> -				exit(1);
> -			}
> -			if (res == 0)
> -				continue;
> -			if (WIFEXITED(child_stat))
> -				break;
> +			lines++;
>   		}
> -	}
>   
> -	fclose(output);
> +		printf("\n");
> +
> +		usleep(period_us);
> +	}
>   
> -	intel_register_access_fini();
>   	return 0;
>   }
> diff --git a/tools/meson.build b/tools/meson.build
> index bd2d313d5156..a918eeb0bef1 100644
> --- a/tools/meson.build
> +++ b/tools/meson.build
> @@ -23,7 +23,6 @@ tools_progs = [
>   	'intel_gpu_frequency',
>   	'intel_firmware_decode',
>   	'intel_gpu_time',
> -	'intel_gpu_top',
>   	'intel_gtt',
>   	'intel_guc_logger',
>   	'intel_infoframes',
> @@ -117,6 +116,11 @@ shared_library('intel_aubdump', 'aubdump.c',
>   	       name_prefix : '',
>   	       install : true)
>   
> +executable('intel_gpu_top', 'intel_gpu_top.c',
> +	   install : true,
> +	   install_rpath : rpathdir,
> +	   dependencies : tool_deps + [ lib_igt_perf ])
> +
>   conf_data = configuration_data()
>   conf_data.set('prefix', prefix)
>   conf_data.set('exec_prefix', '${prefix}')
> 

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Intel-gfx] [PATCH i-g-t v2] intel-gpu-top: Rewrite the tool to be safe to use
@ 2018-03-29 14:30     ` Eero Tamminen
  0 siblings, 0 replies; 57+ messages in thread
From: Eero Tamminen @ 2018-03-29 14:30 UTC (permalink / raw)
  To: Tvrtko Ursulin, igt-dev; +Cc: Rinat Ibragimov, Intel-gfx

Hi,

I tested this on HSW GT2, BYT, BDW GT3, SKL GT2 and KBL GT3e,
with Ubuntu 16.04 and 17.10, using Ubuntu default kernels (4.4 to 4.13)
and latest drm-tip build (4.16.0-rc7).


General comments
----------------

This will be used by our customers and people who aren't necessarily
familiar with i915 internal details.  Therefore it should use
common terminology in the field and in similar tools, instead of
I3As (Intel 3-letter Acronyms).

For example:
  - rcs -> 3D render
  - bcs -> blitter
  - vecs -> video
  - vcs -> video decode
etc.


The old tool also showed GPU system memory interface (GAM) busyness.
That was valuable info, and reasonably accurate for stable loads.

Could this tool also show either that information (preferred), or
the bandwidth utilized by GPU/CPU/display?

(Latest kernels offer GPU memory bandwidth usage through perf
"uncore_imc" "data_reads" & "data_writes" counters.)


Is "wait" value supposed to be IO-wait for given engine interface?

I never saw that change from 0%, although IO-wait in top jumped
from 0 to 20-30% with my test GPU load.


HW specific test results
------------------------

BYT:
* Reports "Failed to initialize PMU!" although old intel_gpu_top
   works fine.

HSW GT2, BDW GT3, SKL GT2 and KBL GT3e seem to work fine except
for the "wait" value.

I never saw the blitter engine do anything, but that's because
modesetting uses just the 3D pipeline, and because I couldn't get the
Intel DDX to work with the rest of the latest git version of the X / 3D stack.



Kernel version support
----------------------

My HW-specific testing above was with the drm-tip kernel, but I also did
one test with the Ubuntu 16.04 v4.4 kernel (which includes a v4.6 or v4.8
i915 backport) on KBL.  For that, the tool reported:
"Failed to detect engines!"

Although the previous intel_gpu_top works fine with that kernel version.

The same also happens with the Ubuntu 17.04 v4.13 kernel.


-> If the new version needs a certain kernel version, it should say
    which version is required.



	- Eero

On 29.03.2018 13:33, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> intel-gpu-top is a dangerous tool which can hang machines due unsafe mmio
> register access. This patch rewrites it to use only PMU.
> 
> Only overall command streamer busyness and GPU global data such as power
> and frequencies are included in this new version.
> 
> For access to more GPU functional unit level data, an OA metric based tool
> like gpu-top should be used instead.
> 
> v2:
>   * Sort engines by class and instance.
>   * Do not wait for one sampling period to display something on screen.
>   * Move code out of the asserts. (Rinat Ibragimov)
>   * Continuously adapt to terminal size. (Rinat Ibgragimov)
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
> Cc: Petri Latvala <petri.latvala@intel.com>
> Cc: Eero Tamminen <eero.t.tamminen@intel.com>
> Cc: Rinat Ibragimov <ibragimovrinat@mail.ru>
> Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> # v1
> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> # v0.5
> ---
>   tools/Makefile.am     |    2 +
>   tools/intel_gpu_top.c | 1009 +++++++++++++++++++++----------------------------
>   tools/meson.build     |    6 +-
>   3 files changed, 441 insertions(+), 576 deletions(-)
> 
> diff --git a/tools/Makefile.am b/tools/Makefile.am
> index 09b6dbcc3ece..a0b016ddd7ff 100644
> --- a/tools/Makefile.am
> +++ b/tools/Makefile.am
> @@ -28,6 +28,8 @@ intel_aubdump_la_LDFLAGS = -module -avoid-version -no-undefined
>   intel_aubdump_la_SOURCES = aubdump.c
>   intel_aubdump_la_LIBADD = $(top_builddir)/lib/libintel_tools.la -ldl
>   
> +intel_gpu_top_LDADD = $(top_builddir)/lib/libigt_perf.la
> +
>   bin_SCRIPTS = intel_aubdump
>   CLEANFILES = $(bin_SCRIPTS)
>   
> diff --git a/tools/intel_gpu_top.c b/tools/intel_gpu_top.c
> index 098e6ce3ff86..94091d97c4a3 100644
> --- a/tools/intel_gpu_top.c
> +++ b/tools/intel_gpu_top.c
> @@ -1,6 +1,5 @@
>   /*
> - * Copyright © 2007 Intel Corporation
> - * Copyright © 2011 Intel Corporation
> + * Copyright © 2018 Intel Corporation
>    *
>    * Permission is hereby granted, free of charge, to any person obtaining a
>    * copy of this software and associated documentation files (the "Software"),
> @@ -18,701 +17,561 @@
>    * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
>    * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
>    * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> - * DEALINGS IN THE SOFTWARE.
> - *
> - * Authors:
> - *    Eric Anholt <eric@anholt.net>
> - *    Eugeni Dodonov <eugeni.dodonov@intel.com>
> - *
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
>    */
>   
> -#include "config.h"
> -
> -#include <inttypes.h>
> -#include <unistd.h>
> -#include <stdlib.h>
>   #include <stdio.h>
> -#include <err.h>
> -#include <sys/ioctl.h>
> -#include <sys/time.h>
> -#include <sys/wait.h>
> +#include <sys/types.h>
> +#include <dirent.h>
> +#include <stdint.h>
> +#include <assert.h>
>   #include <string.h>
> -#ifdef HAVE_TERMIOS_H
> -#include <termios.h>
> -#endif
> -#include "intel_io.h"
> -#include "instdone.h"
> -#include "intel_reg.h"
> -#include "intel_chipset.h"
> -#include "drmtest.h"
> -
> -#define  FORCEWAKE	    0xA18C
> -#define  FORCEWAKE_ACK	    0x130090
> -
> -#define SAMPLES_PER_SEC             10000
> -#define SAMPLES_TO_PERCENT_RATIO    (SAMPLES_PER_SEC / 100)
> -
> -#define MAX_NUM_TOP_BITS            100
> -
> -#define HAS_STATS_REGS(devid)		IS_965(devid)
> -
> -struct top_bit {
> -	struct instdone_bit *bit;
> -	int count;
> -} top_bits[MAX_NUM_TOP_BITS];
> -struct top_bit *top_bits_sorted[MAX_NUM_TOP_BITS];
> -
> -static uint32_t instdone, instdone1;
> -
> -static const char *bars[] = {
> -	" ",
> -	"▏",
> -	"▎",
> -	"▍",
> -	"▌",
> -	"▋",
> -	"▊",
> -	"▉",
> -	"█"
> -};
> +#include <ctype.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <sys/stat.h>
> +#include <fcntl.h>
> +#include <inttypes.h>
> +#include <sys/ioctl.h>
> +#include <errno.h>
> +#include <math.h>
> +#include <locale.h>
> +
> +#include "igt_perf.h"
>   
> -enum stats_counts {
> -	IA_VERTICES,
> -	IA_PRIMITIVES,
> -	VS_INVOCATION,
> -	GS_INVOCATION,
> -	GS_PRIMITIVES,
> -	CL_INVOCATION,
> -	CL_PRIMITIVES,
> -	PS_INVOCATION,
> -	PS_DEPTH,
> -	STATS_COUNT
> +struct pmu_pair {
> +	uint64_t cur;
> +	uint64_t prev;
>   };
>   
> -const uint32_t stats_regs[STATS_COUNT] = {
> -	IA_VERTICES_COUNT_QW,
> -	IA_PRIMITIVES_COUNT_QW,
> -	VS_INVOCATION_COUNT_QW,
> -	GS_INVOCATION_COUNT_QW,
> -	GS_PRIMITIVES_COUNT_QW,
> -	CL_INVOCATION_COUNT_QW,
> -	CL_PRIMITIVES_COUNT_QW,
> -	PS_INVOCATION_COUNT_QW,
> -	PS_DEPTH_COUNT_QW,
> +struct pmu_counter {
> +	uint64_t config;
> +	unsigned int idx;
> +	struct pmu_pair val;
>   };
>   
> -const char *stats_reg_names[STATS_COUNT] = {
> -	"vert fetch",
> -	"prim fetch",
> -	"VS invocations",
> -	"GS invocations",
> -	"GS prims",
> -	"CL invocations",
> -	"CL prims",
> -	"PS invocations",
> -	"PS depth pass",
> +struct engine {
> +	const char *name;
> +	struct pmu_counter busy;
> +	struct pmu_counter wait;
> +	struct pmu_counter sema;
>   };
>   
> -uint64_t stats[STATS_COUNT];
> -uint64_t last_stats[STATS_COUNT];
> +struct engines {
> +	unsigned int num_engines;
> +	unsigned int num_counters;
> +	DIR *root;
> +	int fd;
> +	struct pmu_pair ts;
>   
> -static unsigned long
> -gettime(void)
> -{
> -    struct timeval t;
> -    gettimeofday(&t, NULL);
> -    return (t.tv_usec + (t.tv_sec * 1000000));
> -}
> +	int rapl_fd;
> +	double rapl_scale;
>   
> -static int
> -top_bits_sort(const void *a, const void *b)
> +	struct pmu_counter freq_req;
> +	struct pmu_counter freq_act;
> +	struct pmu_counter irq;
> +	struct pmu_counter rc6;
> +	struct pmu_counter rapl;
> +
> +	struct engine engine;
> +};
> +
> +static uint64_t
> +get_pmu_config(int dirfd, const char *name, const char *counter)
>   {
> -	struct top_bit * const *bit_a = a;
> -	struct top_bit * const *bit_b = b;
> -	int a_count = (*bit_a)->count;
> -	int b_count = (*bit_b)->count;
> +	char buf[128], *p;
> +	int fd, ret;
>   
> -	if (a_count < b_count)
> -		return 1;
> -	else if (a_count == b_count)
> -		return 0;
> -	else
> +	ret = snprintf(buf, sizeof(buf), "%s-%s", name, counter);
> +	if (ret < 0 || ret == sizeof(buf))
>   		return -1;
> -}
>   
> -static void
> -update_idle_bit(struct top_bit *top_bit)
> -{
> -	uint32_t reg_val;
> +	fd = openat(dirfd, buf, O_RDONLY);
> +	if (fd < 0)
> +		return -1;
>   
> -	if (top_bit->bit->reg == INSTDONE_1)
> -		reg_val = instdone1;
> -	else
> -		reg_val = instdone;
> +	ret = read(fd, buf, sizeof(buf));
> +	close(fd);
> +	if (ret <= 0)
> +		return -1;
> +
> +	p = index(buf, '0');
> +	if (!p)
> +		return -1;
>   
> -	if ((reg_val & top_bit->bit->bit) == 0)
> -		top_bit->count++;
> +	return strtoul(p, NULL, 0);
>   }
>   
> -static void
> -print_clock(const char *name, int clock) {
> -	if (clock == -1)
> -		printf("%s clock: unknown", name);
> +#define engine_ptr(engines, n) \
> +	((struct engine *)((unsigned char *)(&engines->engine) + (n) * sizeof(struct engine)))
> +
> +static int engine_cmp(const void *__a, const void *__b)
> +{
> +	const struct engine *a = (struct engine *)__a;
> +	const struct engine *b = (struct engine *)__b;
> +	int class_a = (a->busy.config & (__I915_PMU_OTHER(0) - 1)) >>
> +		      I915_PMU_CLASS_SHIFT;
> +	int class_b = (b->busy.config & (__I915_PMU_OTHER(0) - 1)) >>
> +		      I915_PMU_CLASS_SHIFT;
> +	int instance_a = (a->busy.config >> I915_PMU_SAMPLE_BITS) &
> +			 ((1 << I915_PMU_SAMPLE_INSTANCE_BITS) - 1);
> +	int instance_b = (b->busy.config >> I915_PMU_SAMPLE_BITS) &
> +			 ((1 << I915_PMU_SAMPLE_INSTANCE_BITS) - 1);
> +
> +	if (class_a != class_b)
> +		return class_a - class_b;
>   	else
> -		printf("%s clock: %d Mhz", name, clock);
> +		return instance_a - instance_b;
>   }
>   
> -static int
> -print_clock_info(struct pci_device *pci_dev)
> +static struct engines *discover_engines(void)
>   {
> -	uint32_t devid = pci_dev->device_id;
> -	uint16_t gcfgc;
> +	const char *sysfs_root = "/sys/devices/i915/events";
> +	struct engines *engines;
> +	struct dirent *dent;
> +	int ret = 0;
> +	DIR *d;
>   
> -	if (IS_GM45(devid)) {
> -		int core_clock = -1;
> +	engines = malloc(sizeof(struct engines));
> +	if (!engines)
> +		return NULL;
>   
> -		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +	memset(engines, 0, sizeof(*engines));
>   
> -		switch (gcfgc & 0xf) {
> -		case 8:
> -			core_clock = 266;
> -			break;
> -		case 9:
> -			core_clock = 320;
> -			break;
> -		case 11:
> -			core_clock = 400;
> -			break;
> -		case 13:
> -			core_clock = 533;
> -			break;
> -		}
> -		print_clock("core", core_clock);
> -	} else if (IS_965(devid) && IS_MOBILE(devid)) {
> -		int render_clock = -1, sampler_clock = -1;
> +	engines->num_engines = 0;
>   
> -		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +	d = opendir(sysfs_root);
> +	if (!d)
> +		return NULL;
>   
> -		switch (gcfgc & 0xf) {
> -		case 2:
> -			render_clock = 250; sampler_clock = 267;
> -			break;
> -		case 3:
> -			render_clock = 320; sampler_clock = 333;
> -			break;
> -		case 4:
> -			render_clock = 400; sampler_clock = 444;
> -			break;
> -		case 5:
> -			render_clock = 500; sampler_clock = 533;
> -			break;
> -		}
> -
> -		print_clock("render", render_clock);
> -		printf("  ");
> -		print_clock("sampler", sampler_clock);
> -	} else if (IS_945(devid) && IS_MOBILE(devid)) {
> -		int render_clock = -1, display_clock = -1;
> +	while ((dent = readdir(d)) != NULL) {
> +		const char *endswith = "-busy";
> +		const unsigned int endlen = strlen(endswith);
> +		struct engine *engine =
> +				engine_ptr(engines, engines->num_engines);
> +		char buf[256];
>   
> -		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +		if (dent->d_type != DT_REG)
> +			continue;
>   
> -		switch (gcfgc & 0x7) {
> -		case 0:
> -			render_clock = 166;
> -			break;
> -		case 1:
> -			render_clock = 200;
> -			break;
> -		case 3:
> -			render_clock = 250;
> -			break;
> -		case 5:
> -			render_clock = 400;
> +		if (strlen(dent->d_name) >= sizeof(buf)) {
> +			ret = -1;
>   			break;
>   		}
>   
> -		switch (gcfgc & 0x70) {
> -		case 0:
> -			display_clock = 200;
> -			break;
> -		case 4:
> -			display_clock = 320;
> -			break;
> -		}
> -		if (gcfgc & (1 << 7))
> -		    display_clock = 133;
> +		strcpy(buf, dent->d_name);
>   
> -		print_clock("render", render_clock);
> -		printf("  ");
> -		print_clock("display", display_clock);
> -	} else if (IS_915(devid) && IS_MOBILE(devid)) {
> -		int render_clock = -1, display_clock = -1;
> +		/* xxxN-busy */
> +		if (strlen(buf) < (endlen + 4))
> +			continue;
> +		if (strcmp(&buf[strlen(buf) - endlen], endswith))
> +			continue;
>   
> -		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +		memset(engine, 0, sizeof(*engine));
>   
> -		switch (gcfgc & 0x7) {
> -		case 0:
> -			render_clock = 160;
> -			break;
> -		case 1:
> -			render_clock = 190;
> -			break;
> -		case 4:
> -			render_clock = 333;
> +		buf[strlen(buf) - endlen] = 0;
> +		engine->name = strdup(buf);
> +		if (!engine->name) {
> +			ret = -1;
>   			break;
>   		}
> -		if (gcfgc & (1 << 13))
> -		    render_clock = 133;
>   
> -		switch (gcfgc & 0x70) {
> -		case 0:
> -			display_clock = 190;
> +		engine->busy.config = get_pmu_config(dirfd(d), engine->name,
> +						     "busy");
> +		if (engine->busy.config == -1) {
> +			ret = -1;
>   			break;
> -		case 4:
> -			display_clock = 333;
> +		}
> +
> +		engines->num_engines++;
> +		engines = realloc(engines, sizeof(struct engines) +
> +				  engines->num_engines * sizeof(struct engine));
> +		if (!engines) {
> +			ret = -ENOMEM;
>   			break;
>   		}
> -		if (gcfgc & (1 << 7))
> -		    display_clock = 133;
> +	}
> +
> +	if (ret)
> +		free(engines);
> +	else {
> +		qsort(engine_ptr(engines, 0), engines->num_engines,
> +		      sizeof(struct engine), engine_cmp);
>   
> -		print_clock("render", render_clock);
> -		printf("  ");
> -		print_clock("display", display_clock);
> +		engines->root = d;
>   	}
>   
> +	return ret == 0 ? engines : NULL;
> +}
> +
> +static int
> +filename_to_buf(const char *filename, char *buf, unsigned int bufsize)
> +{
> +	int fd;
> +	ssize_t ret;
> +
> +	fd = open(filename, O_RDONLY);
> +	if (fd < 0)
> +		return -1;
> +
> +	ret = read(fd, buf, bufsize - 1);
> +	close(fd);
> +	if (ret < 1)
> +		return -1;
> +
> +	buf[ret] = '\0';
>   
> -	printf("\n");
> -	return -1;
> +	return 0;
>   }
>   
> -#define STATS_LEN (20)
> -#define PERCENTAGE_BAR_END	(79 - STATS_LEN)
> +static uint64_t filename_to_u64(const char *filename, int base)
> +{
> +	char buf[64], *b;
>   
> -static void
> -print_percentage_bar(float percent, int cur_line_len)
> +	if (filename_to_buf(filename, buf, sizeof(buf)))
> +		return 0;
> +
> +	/*
> +	 * Handle both single integer and key=value formats by skipping
> +	 * leading non-digits.
> +	 */
> +	b = buf;
> +	while (*b && !isdigit(*b))
> +		b++;
> +
> +	return strtoull(b, NULL, base);
> +}
> +
> +static uint64_t rapl_type_id(void)
>   {
> -	int bar_avail_len = (PERCENTAGE_BAR_END - cur_line_len - 1) * 8;
> -	int bar_len = bar_avail_len * (percent + .5) / 100.0;
> -	int i;
> +	return filename_to_u64("/sys/devices/power/type", 10);
> +}
>   
> -	for (i = bar_len; i >= 8; i -= 8) {
> -		printf("%s", bars[8]);
> -		cur_line_len++;
> -	}
> -	if (i) {
> -		printf("%s", bars[i]);
> -		cur_line_len++;
> -	}
> +static uint64_t rapl_gpu_power(void)
> +{
> +	return filename_to_u64("/sys/devices/power/events/energy-gpu", 0);
> +}
> +
> +static double filename_to_double(const char *filename)
> +{
> +	char *oldlocale;
> +	char buf[80];
> +	double v;
> +
> +	if (filename_to_buf(filename, buf, sizeof(buf)))
> +		return 0;
>   
> -	/* NB: We can't use a field width with utf8 so we manually
> -	* guarantee a field with of 45 chars for any bar. */
> -	printf("%*s", PERCENTAGE_BAR_END - cur_line_len, "");
> +	oldlocale = setlocale(LC_ALL, "C");
> +	v = strtod(buf, NULL);
> +	setlocale(LC_ALL, oldlocale);
> +
> +	return v;
>   }
>   
> -struct ring {
> -	const char *name;
> -	uint32_t mmio;
> -	int head, tail, size;
> -	uint64_t full;
> -	int idle;
> -};
> +static double rapl_gpu_power_scale(void)
> +{
> +	return filename_to_double("/sys/devices/power/events/energy-gpu.scale");
> +}
>   
> -static uint32_t ring_read(struct ring *ring, uint32_t reg)
> +#define __open_pmu(engines, pmu, idx) \
> +({ \
> +	int fd__; \
> +\
> +	fd__ = perf_i915_open_group((pmu)->config, (engines)->fd); \
> +	if (fd__ >= 0) { \
> +		if ((engines)->fd == -1) \
> +			(engines)->fd = fd__; \
> +		(pmu)->idx = (idx)++; \
> +		(engines)->num_counters++; \
> +	} \
> +\
> +	fd__; \
> +})
> +
> +static int pmu_init(struct engines *engines)
>   {
> -	return INREG(ring->mmio + reg);
> +	unsigned int idx = 0;
> +	unsigned int i;
> +	int fd;
> +
> +	engines->fd = -1;
> +	engines->num_counters = 0;
> +
> +	engines->freq_req.config = I915_PMU_REQUESTED_FREQUENCY;
> +	fd = __open_pmu(engines, &engines->freq_req, idx);
> +	if (fd < 0)
> +		return -1;
> +
> +	engines->freq_act.config = I915_PMU_ACTUAL_FREQUENCY;
> +	fd = __open_pmu(engines, &engines->freq_act, idx);
> +	if (fd < 0)
> +		return -1;
> +
> +	engines->irq.config = I915_PMU_INTERRUPTS;
> +	fd = __open_pmu(engines, &engines->irq, idx);
> +	if (fd < 0)
> +		return -1;
> +
> +	engines->rc6.config = I915_PMU_RC6_RESIDENCY;
> +	fd = __open_pmu(engines, &engines->rc6, idx);
> +	if (fd < 0)
> +		return -1;
> +
> +	for (i = 0; i < engines->num_engines; i++) {
> +		struct engine *engine = engine_ptr(engines, i);
> +		struct {
> +			struct pmu_counter *pmu;
> +			const char *counter;
> +		} *cnt, counters[] = {
> +			{ .pmu = &engine->busy, .counter = "busy" },
> +			{ .pmu = &engine->wait, .counter = "wait" },
> +			{ .pmu = &engine->sema, .counter = "sema" },
> +			{ .pmu = NULL, .counter = NULL },
> +		};
> +
> +		for (cnt = counters; cnt->pmu; cnt++) {
> +			if (!cnt->pmu->config)
> +				cnt->pmu->config =
> +					get_pmu_config(dirfd(engines->root),
> +						       engine->name,
> +						       cnt->counter);
> +			fd = __open_pmu(engines, cnt->pmu, idx);
> +			if (fd < 0)
> +				return -1;
> +		}
> +	}
> +
> +	engines->rapl_scale = rapl_gpu_power_scale();
> +	if (engines->rapl_scale != NAN)
> +		engines->rapl_scale *= 1e3; /* from nano to micro */
> +	engines->rapl.config = rapl_gpu_power();
> +	engines->rapl_fd = igt_perf_open(rapl_type_id(), engines->rapl.config);
> +	if (engines->rapl_fd < 0)
> +		return -1;
> +
> +	return 0;
>   }
>   
> -static void ring_init(struct ring *ring)
> +static uint64_t pmu_read_multi(int fd, unsigned int num, uint64_t *val)
>   {
> -	ring->size = (((ring_read(ring, RING_LEN) & RING_NR_PAGES) >> 12) + 1) * 4096;
> +	uint64_t buf[2 + num];
> +	unsigned int i;
> +	ssize_t len;
> +
> +	memset(buf, 0, sizeof(buf));
> +
> +	len = read(fd, buf, sizeof(buf));
> +	assert(len == sizeof(buf));
> +
> +	for (i = 0; i < num; i++)
> +		val[i] = buf[2 + i];
> +
> +	return buf[1];
>   }
>   
> -static void ring_reset(struct ring *ring)
> +static double pmu_calc(struct pmu_pair *p, double d, double t, double s)
>   {
> -	ring->idle = ring->full = 0;
> +	double pct;
> +
> +	pct = p->cur - p->prev;
> +	pct /= d;
> +	pct /= t;
> +	pct *= s;
> +
> +	if (s == 100.0 && pct > 100.0)
> +		pct = 100.0;
> +
> +	return pct;
>   }
>   
> -static void ring_sample(struct ring *ring)
> +static uint64_t __pmu_read_single(int fd, uint64_t *ts)
>   {
> -	int full;
> +	uint64_t data[2] = { };
> +	ssize_t len;
>   
> -	if (!ring->size)
> -		return;
> +	len = read(fd, data, sizeof(data));
> +	assert(len == sizeof(data));
>   
> -	ring->head = ring_read(ring, RING_HEAD) & HEAD_ADDR;
> -	ring->tail = ring_read(ring, RING_TAIL) & TAIL_ADDR;
> +	if (ts)
> +		*ts = data[1];
> +
> +	return data[0];
> +}
>   
> -	if (ring->tail == ring->head)
> -		ring->idle++;
> +static uint64_t pmu_read_single(int fd)
> +{
> +	return __pmu_read_single(fd, NULL);
> +}
>   
> -	full = ring->tail - ring->head;
> -	if (full < 0)
> -		full += ring->size;
> -	ring->full += full;
> +static void __update_sample(struct pmu_counter *counter, uint64_t val)
> +{
> +	counter->val.prev = counter->val.cur;
> +	counter->val.cur = val;
>   }
>   
> -static void ring_print_header(FILE *out, struct ring *ring)
> +static void update_sample(struct pmu_counter *counter, uint64_t *val)
>   {
> -    fprintf(out, "%.6s%%\tops\t",
> -            ring->name
> -          );
> +	__update_sample(counter, val[counter->idx]);
>   }
>   
> -static void ring_print(struct ring *ring, unsigned long samples_per_sec)
> +static void pmu_sample(struct engines *engines)
>   {
> -	int percent_busy, len;
> +	const int num_val = engines->num_counters;
> +	uint64_t val[num_val];
> +	unsigned int i;
> +
> +	engines->ts.prev = engines->ts.cur;
> +	engines->ts.cur = pmu_read_multi(engines->fd, num_val, val);
> +
> +	__update_sample(&engines->rapl, pmu_read_single(engines->rapl_fd));
>   
> -	if (!ring->size)
> -		return;
> +	update_sample(&engines->freq_req, val);
> +	update_sample(&engines->freq_act, val);
> +	update_sample(&engines->irq, val);
> +	update_sample(&engines->rc6, val);
>   
> -	percent_busy = 100 - 100 * ring->idle / samples_per_sec;
> +	for (i = 0; i < engines->num_engines; i++) {
> +		struct engine *engine = engine_ptr(engines, i);
>   
> -	len = printf("%25s busy: %3d%%: ", ring->name, percent_busy);
> -	print_percentage_bar (percent_busy, len);
> -	printf("%24s space: %d/%d\n",
> -		   ring->name,
> -		   (int)(ring->full / samples_per_sec),
> -		   ring->size);
> +		update_sample(&engine->busy, val);
> +		update_sample(&engine->sema, val);
> +		update_sample(&engine->wait, val);
> +	}
>   }
>   
> -static void ring_log(struct ring *ring, unsigned long samples_per_sec,
> -		FILE *output)
> +static const char *bars[] = { " ", "▏", "▎", "▍", "▌", "▋", "▊", "▉", "█" };
> +
> +static void
> +print_percentage_bar(double percent, int max_len)
>   {
> -	if (ring->size)
> -		fprintf(output, "%3d\t%d\t",
> -			(int)(100 - 100 * ring->idle / samples_per_sec),
> -			(int)(ring->full / samples_per_sec));
> -	else
> -		fprintf(output, "-1\t-1\t");
> +	int bar_len = percent * (8 * (max_len - 2)) / 100.0;
> +	int i;
> +
> +	putchar('|');
> +
> +	for (i = bar_len; i >= 8; i -= 8)
> +		printf("%s", bars[8]);
> +	if (i)
> +		printf("%s", bars[i]);
> +
> +	for (i = 0; i < (max_len - 2 - (bar_len + 7) / 8); i++)
> +		putchar(' ');
> +
> +	putchar('|');
>   }
>   
> +#define DEFAULT_PERIOD_MS (1000)
> +
>   static void
>   usage(const char *appname)
>   {
>   	printf("intel_gpu_top - Display a top-like summary of Intel GPU usage\n"
> -			"\n"
> -			"usage: %s [parameters]\n"
> -			"\n"
> -			"The following parameters apply:\n"
> -			"[-s <samples>]       samples per seconds (default %d)\n"
> -			"[-e <command>]       command to profile\n"
> -			"[-o <file>]          output statistics to file. If file is '-',"
> -			"                     run in batch mode and output statistics to stdio only \n"
> -			"[-h]                 show this help screen\n"
> -			"\n",
> -			appname,
> -			SAMPLES_PER_SEC
> -		  );
> -	return;
> +		"\n"
> +		"Usage: %s [parameters]\n"
> +		"\n"
> +		"\tThe following parameters are optional:\n"
> +		"\t[-s <samples>]       refresh period in ms (default %ums)\n"
> +		"\t[-h]                 show this help text\n"
> +		"\n",
> +		appname, DEFAULT_PERIOD_MS);
>   }
>   
>   int main(int argc, char **argv)
>   {
> -	uint32_t devid;
> -	struct pci_device *pci_dev;
> -	struct ring render_ring = {
> -		.name = "render",
> -		.mmio = 0x2030,
> -	}, bsd_ring = {
> -		.name = "bitstream",
> -		.mmio = 0x4030,
> -	}, bsd6_ring = {
> -		.name = "bitstream",
> -		.mmio = 0x12030,
> -	}, blt_ring = {
> -		.name = "blitter",
> -		.mmio = 0x22030,
> -	};
> -	int i, ch;
> -	int samples_per_sec = SAMPLES_PER_SEC;
> -	FILE *output = NULL;
> -	double elapsed_time=0;
> -	int print_headers=1;
> -	pid_t child_pid=-1;
> -	int child_stat;
> -	char *cmd=NULL;
> -	int interactive=1;
> -
> -	/* Parse options? */
> -	while ((ch = getopt(argc, argv, "s:o:e:h")) != -1) {
> +	unsigned int period_us = DEFAULT_PERIOD_MS * 1000;
> +	int con_w = -1, con_h = -1;
> +	struct engines *engines;
> +	unsigned int i;
> +	int ret, ch;
> +
> +	/* Parse options */
> +	while ((ch = getopt(argc, argv, "s:h")) != -1) {
>   		switch (ch) {
> -		case 'e': cmd = strdup(optarg);
> -			break;
> -		case 's': samples_per_sec = atoi(optarg);
> -			if (samples_per_sec < 100) {
> -				fprintf(stderr, "Error: samples per second must be >= 100\n");
> -				exit(1);
> -			}
> -			break;
> -		case 'o':
> -			if (!strcmp(optarg, "-")) {
> -				/* Running in non-interactive mode */
> -				interactive = 0;
> -				output = stdout;
> -			}
> -			else
> -				output = fopen(optarg, "w");
> -			if (!output)
> -			{
> -				perror("fopen");
> -				exit(1);
> -			}
> +		case 's':
> +			period_us = atoi(optarg) * 1000;
>   			break;
>   		case 'h':
>   			usage(argv[0]);
>   			exit(0);
> -			break;
>   		default:
> -			fprintf(stderr, "Invalid flag %c!\n", (char)optopt);
> +			fprintf(stderr, "Invalid option %c!\n", (char)optopt);
>   			usage(argv[0]);
>   			exit(1);
> -			break;
>   		}
>   	}
>   
> -	pci_dev = intel_get_pci_device();
> -	devid = pci_dev->device_id;
> -	intel_mmio_use_pci_bar(pci_dev);
> -	init_instdone_definitions(devid);
> -
> -	/* Do we have a command to run? */
> -	if (cmd != NULL) {
> -		if (output) {
> -			fprintf(output, "# Profiling: %s\n", cmd);
> -			fflush(output);
> -		}
> -		child_pid = fork();
> -		if (child_pid < 0) {
> -			perror("fork");
> -			exit(1);
> -		}
> -		else if (child_pid == 0) {
> -			int res;
> -			res = system(cmd);
> -			if (res < 0)
> -				perror("running command");
> -			if (output) {
> -				fflush(output);
> -				fprintf(output, "# %s exited with status %d\n", cmd, res);
> -				fflush(output);
> -			}
> -			free(cmd);
> -			exit(0);
> -		} else {
> -			free(cmd);
> -		}
> +	engines = discover_engines();
> +	if (!engines) {
> +		fprintf(stderr, "Failed to detect engines!\n");
> +		return 1;
>   	}
>   
> -	for (i = 0; i < num_instdone_bits; i++) {
> -		top_bits[i].bit = &instdone_bits[i];
> -		top_bits[i].count = 0;
> -		top_bits_sorted[i] = &top_bits[i];
> +	ret = pmu_init(engines);
> +	if (ret) {
> +		fprintf(stderr, "Failed to initialize PMU!\n");
> +		return 1;
>   	}
>   
> -	/* Grab access to the registers */
> -	intel_register_access_init(pci_dev, 0, -1);
> +	pmu_sample(engines);
>   
> -	ring_init(&render_ring);
> -	if (IS_GEN4(devid) || IS_GEN5(devid))
> -		ring_init(&bsd_ring);
> -	if (IS_GEN6(devid) || IS_GEN7(devid)) {
> -		ring_init(&bsd6_ring);
> -		ring_init(&blt_ring);
> -	}
> +	for (;;) {
> +		double t, freq[2], irq, rc6, power;
> +		struct winsize ws;
> +		int lines = 0;
>   
> -	/* Initialize GPU stats */
> -	if (HAS_STATS_REGS(devid)) {
> -		for (i = 0; i < STATS_COUNT; i++) {
> -			uint32_t stats_high, stats_low, stats_high_2;
> +		/* Update terminal size. */
> +		if (ioctl(0, TIOCGWINSZ, &ws) != -1) {
> +			con_w = ws.ws_col;
> +			con_h = ws.ws_row;
> +		}
>   
> -			do {
> -				stats_high = INREG(stats_regs[i] + 4);
> -				stats_low = INREG(stats_regs[i]);
> -				stats_high_2 = INREG(stats_regs[i] + 4);
> -			} while (stats_high != stats_high_2);
> +		pmu_sample(engines);
> +		t = (double)(engines->ts.cur - engines->ts.prev) / 1e9;
>   
> -			last_stats[i] = (uint64_t)stats_high << 32 |
> -				stats_low;
> -		}
> -	}
> +		printf("\033[H\033[J");
>   
> -	for (;;) {
> -		int j;
> -		unsigned long long t1, ti, tf, t2;
> -		unsigned long long def_sleep = 1000000 / samples_per_sec;
> -		unsigned long long last_samples_per_sec = samples_per_sec;
> -		unsigned short int max_lines;
> -		struct winsize ws;
> -		char clear_screen[] = {0x1b, '[', 'H',
> -				       0x1b, '[', 'J',
> -				       0x0};
> -		int percent;
> -		int len;
> -
> -		t1 = gettime();
> -
> -		ring_reset(&render_ring);
> -		ring_reset(&bsd_ring);
> -		ring_reset(&bsd6_ring);
> -		ring_reset(&blt_ring);
> -
> -		for (i = 0; i < samples_per_sec; i++) {
> -			long long interval;
> -			ti = gettime();
> -			if (IS_965(devid)) {
> -				instdone = INREG(INSTDONE_I965);
> -				instdone1 = INREG(INSTDONE_1);
> -			} else
> -				instdone = INREG(INSTDONE);
> -
> -			for (j = 0; j < num_instdone_bits; j++)
> -				update_idle_bit(&top_bits[j]);
> -
> -			ring_sample(&render_ring);
> -			ring_sample(&bsd_ring);
> -			ring_sample(&bsd6_ring);
> -			ring_sample(&blt_ring);
> -
> -			tf = gettime();
> -			if (tf - t1 >= 1000000) {
> -				/* We are out of sync, bail out */
> -				last_samples_per_sec = i+1;
> -				break;
> -			}
> -			interval = def_sleep - (tf - ti);
> -			if (interval > 0)
> -				usleep(interval);
> -		}
> +		freq[0] = pmu_calc(&engines->freq_req.val, 1.0, t, 1);
> +		freq[1] = pmu_calc(&engines->freq_act.val, 1.0, t, 1);
> +		irq = pmu_calc(&engines->irq.val, 1.0, t, 1);
> +		rc6 = pmu_calc(&engines->rc6.val, 1e9, t, 100);
> +		power = pmu_calc(&engines->rapl.val, 1.0, t,
> +				 engines->rapl_scale);
>   
> -		if (HAS_STATS_REGS(devid)) {
> -			for (i = 0; i < STATS_COUNT; i++) {
> -				uint32_t stats_high, stats_low, stats_high_2;
> +		printf("intel-gpu-top - %4.0f/%4.0f MHz;  %3.0f%% RC6; %6.0fmW; %8.0f irqs/s\n",
> +		       freq[0], freq[1], rc6, power, irq);
> +		lines++;
>   
> -				do {
> -					stats_high = INREG(stats_regs[i] + 4);
> -					stats_low = INREG(stats_regs[i]);
> -					stats_high_2 = INREG(stats_regs[i] + 4);
> -				} while (stats_high != stats_high_2);
> +		printf("\n");
> +		lines++;
>   
> -				stats[i] = (uint64_t)stats_high << 32 |
> -					stats_low;
> -			}
> -		}
> +		for (i = 0; i < engines->num_engines && lines < con_h; i++) {
> +			struct engine *engine = engine_ptr(engines, i);
> +			unsigned int max_w = con_w - 1;
> +			unsigned int len;
> +			double val[2];
> +			char buf[128];
>   
> -		qsort(top_bits_sorted, num_instdone_bits,
> -		      sizeof(struct top_bit *), top_bits_sort);
> -
> -		/* Limit the number of lines printed to the terminal height so the
> -		 * most important info (at the top) will stay on screen. */
> -		max_lines = -1;
> -		if (ioctl(0, TIOCGWINSZ, &ws) != -1)
> -			max_lines = ws.ws_row - 6; /* exclude header lines */
> -		if (max_lines >= num_instdone_bits)
> -			max_lines = num_instdone_bits;
> -
> -		t2 = gettime();
> -		elapsed_time += (t2 - t1) / 1000000.0;
> -
> -		if (interactive) {
> -			printf("%s", clear_screen);
> -			print_clock_info(pci_dev);
> -
> -			ring_print(&render_ring, last_samples_per_sec);
> -			ring_print(&bsd_ring, last_samples_per_sec);
> -			ring_print(&bsd6_ring, last_samples_per_sec);
> -			ring_print(&blt_ring, last_samples_per_sec);
> -
> -			printf("\n%30s  %s\n", "task", "percent busy");
> -			for (i = 0; i < max_lines; i++) {
> -				if (top_bits_sorted[i]->count > 0) {
> -					percent = (top_bits_sorted[i]->count * 100) /
> -						last_samples_per_sec;
> -					len = printf("%30s: %3d%%: ",
> -							 top_bits_sorted[i]->bit->name,
> -							 percent);
> -					print_percentage_bar (percent, len);
> -				} else {
> -					printf("%*s", PERCENTAGE_BAR_END, "");
> -				}
> -
> -				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
> -					printf("%13s: %llu (%lld/sec)",
> -						   stats_reg_names[i],
> -						   (long long)stats[i],
> -						   (long long)(stats[i] - last_stats[i]));
> -					last_stats[i] = stats[i];
> -				} else {
> -					if (!top_bits_sorted[i]->count)
> -						break;
> -				}
> -				printf("\n");
> -			}
> -		}
> -		if (output) {
> -			/* Print headers for columns at first run */
> -			if (print_headers) {
> -				fprintf(output, "# time\t");
> -				ring_print_header(output, &render_ring);
> -				ring_print_header(output, &bsd_ring);
> -				ring_print_header(output, &bsd6_ring);
> -				ring_print_header(output, &blt_ring);
> -				for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
> -					if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
> -						fprintf(output, "%.6s\t",
> -							   stats_reg_names[i]
> -							   );
> -					}
> -					if (!top_bits[i].count)
> -						continue;
> -				}
> -				fprintf(output, "\n");
> -				print_headers = 0;
> -			}
> -
> -			/* Print statistics */
> -			fprintf(output, "%.2f\t", elapsed_time);
> -			ring_log(&render_ring, last_samples_per_sec, output);
> -			ring_log(&bsd_ring, last_samples_per_sec, output);
> -			ring_log(&bsd6_ring, last_samples_per_sec, output);
> -			ring_log(&blt_ring, last_samples_per_sec, output);
> -
> -			for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
> -				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
> -					fprintf(output, "%"PRIu64"\t",
> -						   stats[i] - last_stats[i]);
> -					last_stats[i] = stats[i];
> -				}
> -					if (!top_bits[i].count)
> -						continue;
> -			}
> -			fprintf(output, "\n");
> -			fflush(output);
> -		}
> +			val[0] = pmu_calc(&engine->wait.val, 1e9, t, 100);
> +			val[1] = pmu_calc(&engine->sema.val, 1e9, t, 100);
> +			len = snprintf(buf, sizeof(buf),
> +				       "%6.2f%% wait, %6.2f%% sema",
> +				       val[0], val[1]);
>   
> -		for (i = 0; i < num_instdone_bits; i++) {
> -			top_bits_sorted[i]->count = 0;
> +			val[0] = pmu_calc(&engine->busy.val, 1e9, t, 100);
> +			len += printf("%8s %6.2f%% ",
> +				      engine->name, val[0]);
> +			print_percentage_bar(val[0], max_w - len);
>   
> -			if (i < STATS_COUNT)
> -				last_stats[i] = stats[i];
> -		}
> +			printf("%s\n", buf);
>   
> -		/* Check if child has gone */
> -		if (child_pid > 0) {
> -			int res;
> -			if ((res = waitpid(child_pid, &child_stat, WNOHANG)) == -1) {
> -				perror("waitpid");
> -				exit(1);
> -			}
> -			if (res == 0)
> -				continue;
> -			if (WIFEXITED(child_stat))
> -				break;
> +			lines++;
>   		}
> -	}
>   
> -	fclose(output);
> +		printf("\n");
> +
> +		usleep(period_us);
> +	}
>   
> -	intel_register_access_fini();
>   	return 0;
>   }
> diff --git a/tools/meson.build b/tools/meson.build
> index bd2d313d5156..a918eeb0bef1 100644
> --- a/tools/meson.build
> +++ b/tools/meson.build
> @@ -23,7 +23,6 @@ tools_progs = [
>   	'intel_gpu_frequency',
>   	'intel_firmware_decode',
>   	'intel_gpu_time',
> -	'intel_gpu_top',
>   	'intel_gtt',
>   	'intel_guc_logger',
>   	'intel_infoframes',
> @@ -117,6 +116,11 @@ shared_library('intel_aubdump', 'aubdump.c',
>   	       name_prefix : '',
>   	       install : true)
>   
> +executable('intel_gpu_top', 'intel_gpu_top.c',
> +	   install : true,
> +	   install_rpath : rpathdir,
> +	   dependencies : tool_deps + [ lib_igt_perf ])
> +
>   conf_data = configuration_data()
>   conf_data.set('prefix', prefix)
>   conf_data.set('exec_prefix', '${prefix}')
> 

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [igt-dev] ✓ Fi.CI.BAT: success for intel-gpu-top: Rewrite the tool to be safe to use (rev2)
  2018-03-28 18:29 ` [Intel-gfx] " Tvrtko Ursulin
                   ` (7 preceding siblings ...)
  (?)
@ 2018-03-29 15:59 ` Patchwork
  -1 siblings, 0 replies; 57+ messages in thread
From: Patchwork @ 2018-03-29 15:59 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev

== Series Details ==

Series: intel-gpu-top: Rewrite the tool to be safe to use (rev2)
URL   : https://patchwork.freedesktop.org/series/40826/
State : success

== Summary ==

IGT patchset tested on top of latest successful build
2cbd4ddf11b3eaf01f33d8bc2ad46411ec6c299a lib/igt_kms: Improve connector probing in igt_display_init(), v2.

with latest DRM-Tip kernel build CI_DRM_4007
d6e43ca115e5 drm-tip: 2018y-03m-29d-12h-46m-03s UTC integration manifest

No testlist changes.

---- Known issues:

Test gvt_basic:
        Subgroup invalid-placeholder-test:
                skip       -> INCOMPLETE (fi-cnl-y3) fdo#105777
Test kms_pipe_crc_basic:
        Subgroup read-crc-pipe-b-frame-sequence:
                fail       -> PASS       (fi-cfl-s3) fdo#103481

fdo#105777 https://bugs.freedesktop.org/show_bug.cgi?id=105777
fdo#103481 https://bugs.freedesktop.org/show_bug.cgi?id=103481

fi-bdw-5557u     total:285  pass:264  dwarn:0   dfail:0   fail:0   skip:21  time:429s
fi-bdw-gvtdvm    total:285  pass:261  dwarn:0   dfail:0   fail:0   skip:24  time:444s
fi-blb-e6850     total:285  pass:220  dwarn:1   dfail:0   fail:0   skip:64  time:383s
fi-bsw-n3050     total:285  pass:239  dwarn:0   dfail:0   fail:0   skip:46  time:538s
fi-bwr-2160      total:285  pass:180  dwarn:0   dfail:0   fail:0   skip:105 time:298s
fi-bxt-dsi       total:285  pass:255  dwarn:0   dfail:0   fail:0   skip:30  time:516s
fi-bxt-j4205     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:515s
fi-byt-j1900     total:285  pass:250  dwarn:0   dfail:0   fail:0   skip:35  time:522s
fi-byt-n2820     total:285  pass:246  dwarn:0   dfail:0   fail:0   skip:39  time:510s
fi-cfl-8700k     total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:409s
fi-cfl-s3        total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:560s
fi-cfl-u         total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:514s
fi-cnl-y3        total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:25 
fi-elk-e7500     total:285  pass:225  dwarn:1   dfail:0   fail:0   skip:59  time:424s
fi-gdg-551       total:285  pass:176  dwarn:0   dfail:0   fail:1   skip:108 time:320s
fi-glk-1         total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:542s
fi-hsw-4770      total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:408s
fi-ilk-650       total:285  pass:225  dwarn:0   dfail:0   fail:0   skip:60  time:423s
fi-ivb-3520m     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:472s
fi-ivb-3770      total:285  pass:252  dwarn:0   dfail:0   fail:0   skip:33  time:434s
fi-kbl-7500u     total:285  pass:260  dwarn:1   dfail:0   fail:0   skip:24  time:472s
fi-kbl-7567u     total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:466s
fi-kbl-r         total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:517s
fi-pnv-d510      total:285  pass:219  dwarn:1   dfail:0   fail:0   skip:65  time:663s
fi-skl-6260u     total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:438s
fi-skl-6600u     total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:534s
fi-skl-6700k2    total:285  pass:261  dwarn:0   dfail:0   fail:0   skip:24  time:506s
fi-skl-6770hq    total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:509s
fi-skl-guc       total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:431s
fi-skl-gvtdvm    total:285  pass:262  dwarn:0   dfail:0   fail:0   skip:23  time:447s
fi-snb-2600      total:285  pass:245  dwarn:0   dfail:0   fail:0   skip:40  time:400s
Blacklisted hosts:
fi-cnl-psr       total:285  pass:256  dwarn:3   dfail:0   fail:0   skip:26  time:526s
fi-glk-j4005     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:486s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1208/issues.html
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH i-g-t v2] intel-gpu-top: Rewrite the tool to be safe to use
  2018-03-29 10:33   ` [igt-dev] " Tvrtko Ursulin
@ 2018-03-29 16:27     ` Chris Wilson
  -1 siblings, 0 replies; 57+ messages in thread
From: Chris Wilson @ 2018-03-29 16:27 UTC (permalink / raw)
  To: Tvrtko Ursulin, igt-dev; +Cc: Rinat Ibragimov, Intel-gfx, Eero Tamminen

Quoting Tvrtko Ursulin (2018-03-29 11:33:34)
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> intel-gpu-top is a dangerous tool which can hang machines due to unsafe mmio
> register access. This patch rewrites it to use only PMU.
> 
> Only overall command streamer busyness and GPU global data such as power
> and frequencies are included in this new version.
> 
> For access to more GPU functional unit level data, an OA metric based tool
> like gpu-top should be used instead.

intel-gpu-top -  750/ 581 MHz;    0% RC6;   6211mW;    98952 irqs/s

                 ^ easier as actual/request.  Maybe just actual?

    rcs0  97.58% |█████████████████████████████████▏|  0.00% wait,   0.00% sema
    bcs0  92.87% |███████████████████████████████▌  |  0.00% wait,   0.00% sema
    vcs0  92.70% |███████████████████████████████▌  |  0.00% wait,   0.00% sema
   vecs0  92.90% |███████████████████████████████▌  |  0.00% wait,   0.00% sema

2 decimal places seem ok for busyness, but for wait/sema seem overkill.
Drop down to 1dp for them (or all), or even 0dp. Don't fancy overlaying
them on the bar as a different colour block? ;) Hmm, already using ANSI
so you could...
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [igt-dev] [PATCH i-g-t v2] intel-gpu-top: Rewrite the tool to be safe to use
@ 2018-03-29 16:27     ` Chris Wilson
  0 siblings, 0 replies; 57+ messages in thread
From: Chris Wilson @ 2018-03-29 16:27 UTC (permalink / raw)
  To: Tvrtko Ursulin, igt-dev; +Cc: Tvrtko Ursulin, Intel-gfx, Eero Tamminen

Quoting Tvrtko Ursulin (2018-03-29 11:33:34)
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> intel-gpu-top is a dangerous tool which can hang machines due to unsafe mmio
> register access. This patch rewrites it to use only PMU.
> 
> Only overall command streamer busyness and GPU global data such as power
> and frequencies are included in this new version.
> 
> For access to more GPU functional unit level data, an OA metric based tool
> like gpu-top should be used instead.

intel-gpu-top -  750/ 581 MHz;    0% RC6;   6211mW;    98952 irqs/s

                 ^ easier as actual/request.  Maybe just actual?

    rcs0  97.58% |█████████████████████████████████▏|  0.00% wait,   0.00% sema
    bcs0  92.87% |███████████████████████████████▌  |  0.00% wait,   0.00% sema
    vcs0  92.70% |███████████████████████████████▌  |  0.00% wait,   0.00% sema
   vecs0  92.90% |███████████████████████████████▌  |  0.00% wait,   0.00% sema

2 decimal places seem ok for busyness, but for wait/sema seem overkill.
Drop down to 1dp for them (or all), or even 0dp. Don't fancy overlaying
them on the bar as a different colour block? ;) Hmm, already using ANSI
so you could...
-Chris
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [PATCH i-g-t v3] intel-gpu-top: Rewrite the tool to be safe to use
  2018-03-29 14:30     ` [Intel-gfx] " Eero Tamminen
@ 2018-03-29 18:46       ` Tvrtko Ursulin
  -1 siblings, 0 replies; 57+ messages in thread
From: Tvrtko Ursulin @ 2018-03-29 18:46 UTC (permalink / raw)
  To: igt-dev; +Cc: Rinat Ibragimov, Eero Tamminen, Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

intel-gpu-top is a dangerous tool which can hang machines due to unsafe mmio
register access. This patch rewrites it to use only PMU.

Only overall command streamer busyness and GPU global data such as power
and frequencies are included in this new version.

For access to more GPU functional unit level data, an OA metric based tool
like gpu-top should be used instead.

v2:
 * Sort engines by class and instance.
 * Do not wait for one sampling period to display something on screen.
 * Move code out of the asserts. (Rinat Ibragimov)
 * Continuously adapt to terminal size. (Rinat Ibragimov)

v3:
 * Change layout and precision of some fields. (Chris Wilson)
 Eero Tamminen:
 * Use more user friendly engine names.
 * Don't error out if a counter is missing.
 * Add IMC read/write bandwidth.
 * Report minimum required kernel version.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: Petri Latvala <petri.latvala@intel.com>
Cc: Eero Tamminen <eero.t.tamminen@intel.com>
Cc: Rinat Ibragimov <ibragimovrinat@mail.ru>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> # v1
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> # v0.5
---
 lib/igt_perf.c        |    6 +
 lib/igt_perf.h        |    1 +
 tools/Makefile.am     |    2 +
 tools/intel_gpu_top.c | 1216 ++++++++++++++++++++++++++-----------------------
 tools/meson.build     |    6 +-
 5 files changed, 657 insertions(+), 574 deletions(-)

diff --git a/lib/igt_perf.c b/lib/igt_perf.c
index 99d82ea51c9b..e3dec2cc29c7 100644
--- a/lib/igt_perf.c
+++ b/lib/igt_perf.c
@@ -69,3 +69,9 @@ int igt_perf_open(uint64_t type, uint64_t config)
 	return _perf_open(type, config, -1,
 			  PERF_FORMAT_TOTAL_TIME_ENABLED);
 }
+
+int igt_perf_open_group(uint64_t type, uint64_t config, int group)
+{
+	return _perf_open(type, config, group,
+			  PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_GROUP);
+}
diff --git a/lib/igt_perf.h b/lib/igt_perf.h
index 614ea5d23fa6..e00718f4769a 100644
--- a/lib/igt_perf.h
+++ b/lib/igt_perf.h
@@ -55,5 +55,6 @@ uint64_t i915_type_id(void);
 int perf_i915_open(uint64_t config);
 int perf_i915_open_group(uint64_t config, int group);
 int igt_perf_open(uint64_t type, uint64_t config);
+int igt_perf_open_group(uint64_t type, uint64_t config, int group);
 
 #endif /* I915_PERF_H */
diff --git a/tools/Makefile.am b/tools/Makefile.am
index 09b6dbcc3ece..a0b016ddd7ff 100644
--- a/tools/Makefile.am
+++ b/tools/Makefile.am
@@ -28,6 +28,8 @@ intel_aubdump_la_LDFLAGS = -module -avoid-version -no-undefined
 intel_aubdump_la_SOURCES = aubdump.c
 intel_aubdump_la_LIBADD = $(top_builddir)/lib/libintel_tools.la -ldl
 
+intel_gpu_top_LDADD = $(top_builddir)/lib/libigt_perf.la
+
 bin_SCRIPTS = intel_aubdump
 CLEANFILES = $(bin_SCRIPTS)
 
diff --git a/tools/intel_gpu_top.c b/tools/intel_gpu_top.c
index 098e6ce3ff86..46a3db42d3d6 100644
--- a/tools/intel_gpu_top.c
+++ b/tools/intel_gpu_top.c
@@ -1,6 +1,5 @@
 /*
- * Copyright © 2007 Intel Corporation
- * Copyright © 2011 Intel Corporation
+ * Copyright © 2018 Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -18,701 +17,772 @@
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *    Eric Anholt <eric@anholt.net>
- *    Eugeni Dodonov <eugeni.dodonov@intel.com>
- *
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
  */
 
-#include "config.h"
-
-#include <inttypes.h>
-#include <unistd.h>
-#include <stdlib.h>
 #include <stdio.h>
-#include <err.h>
-#include <sys/ioctl.h>
-#include <sys/time.h>
-#include <sys/wait.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <stdint.h>
+#include <assert.h>
 #include <string.h>
-#ifdef HAVE_TERMIOS_H
-#include <termios.h>
-#endif
-#include "intel_io.h"
-#include "instdone.h"
-#include "intel_reg.h"
-#include "intel_chipset.h"
-#include "drmtest.h"
-
-#define  FORCEWAKE	    0xA18C
-#define  FORCEWAKE_ACK	    0x130090
-
-#define SAMPLES_PER_SEC             10000
-#define SAMPLES_TO_PERCENT_RATIO    (SAMPLES_PER_SEC / 100)
-
-#define MAX_NUM_TOP_BITS            100
-
-#define HAS_STATS_REGS(devid)		IS_965(devid)
-
-struct top_bit {
-	struct instdone_bit *bit;
-	int count;
-} top_bits[MAX_NUM_TOP_BITS];
-struct top_bit *top_bits_sorted[MAX_NUM_TOP_BITS];
-
-static uint32_t instdone, instdone1;
-
-static const char *bars[] = {
-	" ",
-	"▏",
-	"▎",
-	"▍",
-	"▌",
-	"▋",
-	"▊",
-	"▉",
-	"█"
-};
+#include <ctype.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <sys/ioctl.h>
+#include <errno.h>
+#include <math.h>
+#include <locale.h>
+
+#include "igt_perf.h"
 
-enum stats_counts {
-	IA_VERTICES,
-	IA_PRIMITIVES,
-	VS_INVOCATION,
-	GS_INVOCATION,
-	GS_PRIMITIVES,
-	CL_INVOCATION,
-	CL_PRIMITIVES,
-	PS_INVOCATION,
-	PS_DEPTH,
-	STATS_COUNT
+struct pmu_pair {
+	uint64_t cur;
+	uint64_t prev;
 };
 
-const uint32_t stats_regs[STATS_COUNT] = {
-	IA_VERTICES_COUNT_QW,
-	IA_PRIMITIVES_COUNT_QW,
-	VS_INVOCATION_COUNT_QW,
-	GS_INVOCATION_COUNT_QW,
-	GS_PRIMITIVES_COUNT_QW,
-	CL_INVOCATION_COUNT_QW,
-	CL_PRIMITIVES_COUNT_QW,
-	PS_INVOCATION_COUNT_QW,
-	PS_DEPTH_COUNT_QW,
+struct pmu_counter {
+	bool present;
+	uint64_t config;
+	unsigned int idx;
+	struct pmu_pair val;
 };
 
-const char *stats_reg_names[STATS_COUNT] = {
-	"vert fetch",
-	"prim fetch",
-	"VS invocations",
-	"GS invocations",
-	"GS prims",
-	"CL invocations",
-	"CL prims",
-	"PS invocations",
-	"PS depth pass",
+struct engine {
+	const char *name;
+	const char *display_name;
+
+	unsigned int class;
+	unsigned int instance;
+
+	struct pmu_counter busy;
+	struct pmu_counter wait;
+	struct pmu_counter sema;
 };
 
-uint64_t stats[STATS_COUNT];
-uint64_t last_stats[STATS_COUNT];
+struct engines {
+	unsigned int num_engines;
+	unsigned int num_counters;
+	DIR *root;
+	int fd;
+	struct pmu_pair ts;
+
+	int rapl_fd;
+	double rapl_scale;
+
+	int imc_fd;
+	double imc_reads_scale;
+	const char *imc_reads_unit;
+	double imc_writes_scale;
+	const char *imc_writes_unit;
+
+	struct pmu_counter freq_req;
+	struct pmu_counter freq_act;
+	struct pmu_counter irq;
+	struct pmu_counter rc6;
+	struct pmu_counter rapl;
+	struct pmu_counter imc_reads;
+	struct pmu_counter imc_writes;
+
+	struct engine engine;
+};
 
-static unsigned long
-gettime(void)
+static uint64_t
+get_pmu_config(int dirfd, const char *name, const char *counter)
 {
-    struct timeval t;
-    gettimeofday(&t, NULL);
-    return (t.tv_usec + (t.tv_sec * 1000000));
-}
+	char buf[128], *p;
+	int fd, ret;
 
-static int
-top_bits_sort(const void *a, const void *b)
-{
-	struct top_bit * const *bit_a = a;
-	struct top_bit * const *bit_b = b;
-	int a_count = (*bit_a)->count;
-	int b_count = (*bit_b)->count;
+	ret = snprintf(buf, sizeof(buf), "%s-%s", name, counter);
+	if (ret < 0 || ret == sizeof(buf))
+		return -1;
 
-	if (a_count < b_count)
-		return 1;
-	else if (a_count == b_count)
-		return 0;
-	else
+	fd = openat(dirfd, buf, O_RDONLY);
+	if (fd < 0)
 		return -1;
-}
 
-static void
-update_idle_bit(struct top_bit *top_bit)
-{
-	uint32_t reg_val;
+	ret = read(fd, buf, sizeof(buf));
+	close(fd);
+	if (ret <= 0)
+		return -1;
 
-	if (top_bit->bit->reg == INSTDONE_1)
-		reg_val = instdone1;
-	else
-		reg_val = instdone;
+	p = index(buf, '0');
+	if (!p)
+		return -1;
 
-	if ((reg_val & top_bit->bit->bit) == 0)
-		top_bit->count++;
+	return strtoul(p, NULL, 0);
 }
 
-static void
-print_clock(const char *name, int clock) {
-	if (clock == -1)
-		printf("%s clock: unknown", name);
+#define engine_ptr(engines, n) \
+	((struct engine *)((unsigned char *)(&engines->engine) + (n) * sizeof(struct engine)))
+
+static const char *class_display_name(unsigned int class)
+{
+	switch (class) {
+	case I915_ENGINE_CLASS_RENDER:
+		return "Render/3D";
+	case I915_ENGINE_CLASS_COPY:
+		return "Blitter";
+	case I915_ENGINE_CLASS_VIDEO:
+		return "Video";
+	case I915_ENGINE_CLASS_VIDEO_ENHANCE:
+		return "VideoEnhance";
+	default:
+		return "[unknown]";
+	}
+}
+
+static int engine_cmp(const void *__a, const void *__b)
+{
+	const struct engine *a = (struct engine *)__a;
+	const struct engine *b = (struct engine *)__b;
+
+	if (a->class != b->class)
+		return a->class - b->class;
 	else
-		printf("%s clock: %d Mhz", name, clock);
+		return a->instance - b->instance;
 }
 
-static int
-print_clock_info(struct pci_device *pci_dev)
+static struct engines *discover_engines(void)
 {
-	uint32_t devid = pci_dev->device_id;
-	uint16_t gcfgc;
+	const char *sysfs_root = "/sys/devices/i915/events";
+	struct engines *engines;
+	struct dirent *dent;
+	int ret = 0;
+	DIR *d;
 
-	if (IS_GM45(devid)) {
-		int core_clock = -1;
+	engines = malloc(sizeof(struct engines));
+	if (!engines)
+		return NULL;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+	memset(engines, 0, sizeof(*engines));
 
-		switch (gcfgc & 0xf) {
-		case 8:
-			core_clock = 266;
-			break;
-		case 9:
-			core_clock = 320;
-			break;
-		case 11:
-			core_clock = 400;
-			break;
-		case 13:
-			core_clock = 533;
-			break;
-		}
-		print_clock("core", core_clock);
-	} else if (IS_965(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, sampler_clock = -1;
+	engines->num_engines = 0;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+	d = opendir(sysfs_root);
+	if (!d)
+		return NULL;
 
-		switch (gcfgc & 0xf) {
-		case 2:
-			render_clock = 250; sampler_clock = 267;
-			break;
-		case 3:
-			render_clock = 320; sampler_clock = 333;
-			break;
-		case 4:
-			render_clock = 400; sampler_clock = 444;
-			break;
-		case 5:
-			render_clock = 500; sampler_clock = 533;
+	while ((dent = readdir(d)) != NULL) {
+		const char *endswith = "-busy";
+		const unsigned int endlen = strlen(endswith);
+		struct engine *engine =
+				engine_ptr(engines, engines->num_engines);
+		char buf[256];
+
+		if (dent->d_type != DT_REG)
+			continue;
+
+		if (strlen(dent->d_name) >= sizeof(buf)) {
+			ret = -1;
 			break;
 		}
 
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("sampler", sampler_clock);
-	} else if (IS_945(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, display_clock = -1;
+		strcpy(buf, dent->d_name);
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+		/* xxxN-busy */
+		if (strlen(buf) < (endlen + 4))
+			continue;
+		if (strcmp(&buf[strlen(buf) - endlen], endswith))
+			continue;
 
-		switch (gcfgc & 0x7) {
-		case 0:
-			render_clock = 166;
-			break;
-		case 1:
-			render_clock = 200;
-			break;
-		case 3:
-			render_clock = 250;
-			break;
-		case 5:
-			render_clock = 400;
+		memset(engine, 0, sizeof(*engine));
+
+		buf[strlen(buf) - endlen] = 0;
+		engine->name = strdup(buf);
+		if (!engine->name) {
+			ret = -1;
 			break;
 		}
 
-		switch (gcfgc & 0x70) {
-		case 0:
-			display_clock = 200;
-			break;
-		case 4:
-			display_clock = 320;
+		engine->busy.config = get_pmu_config(dirfd(d), engine->name,
+						     "busy");
+		if (engine->busy.config == -1) {
+			ret = -1;
 			break;
 		}
-		if (gcfgc & (1 << 7))
-		    display_clock = 133;
 
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("display", display_clock);
-	} else if (IS_915(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, display_clock = -1;
+		engine->class = (engine->busy.config &
+				 (__I915_PMU_OTHER(0) - 1)) >>
+				I915_PMU_CLASS_SHIFT;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+		engine->instance = (engine->busy.config >>
+				    I915_PMU_SAMPLE_BITS) &
+				    ((1 << I915_PMU_SAMPLE_INSTANCE_BITS) - 1);
 
-		switch (gcfgc & 0x7) {
-		case 0:
-			render_clock = 160;
-			break;
-		case 1:
-			render_clock = 190;
-			break;
-		case 4:
-			render_clock = 333;
+		ret = snprintf(buf, sizeof(buf), "%s/%u",
+			       class_display_name(engine->class),
+			       engine->instance);
+		if (ret < 0 || ret == sizeof(buf)) {
+			ret = -1;
 			break;
 		}
-		if (gcfgc & (1 << 13))
-		    render_clock = 133;
+		ret = 0;
 
-		switch (gcfgc & 0x70) {
-		case 0:
-			display_clock = 190;
+		engine->display_name = strdup(buf);
+		if (!engine->display_name) {
+			ret = -1;
 			break;
-		case 4:
-			display_clock = 333;
+		}
+
+		engines->num_engines++;
+		engines = realloc(engines, sizeof(struct engines) +
+				  engines->num_engines * sizeof(struct engine));
+		if (!engines) {
+			ret = -ENOMEM;
 			break;
 		}
-		if (gcfgc & (1 << 7))
-		    display_clock = 133;
+	}
 
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("display", display_clock);
+	if (ret)
+		free(engines);
+	else {
+		qsort(engine_ptr(engines, 0), engines->num_engines,
+		      sizeof(struct engine), engine_cmp);
+
+		engines->root = d;
 	}
 
+	return ret == 0 ? engines : NULL;
+}
+
+static int
+filename_to_buf(const char *filename, char *buf, unsigned int bufsize)
+{
+	int fd;
+	ssize_t ret;
+
+	fd = open(filename, O_RDONLY);
+	if (fd < 0)
+		return -1;
+
+	ret = read(fd, buf, bufsize - 1);
+	close(fd);
+	if (ret < 1)
+		return -1;
+
+	buf[ret] = '\0';
 
-	printf("\n");
-	return -1;
+	return 0;
 }
 
-#define STATS_LEN (20)
-#define PERCENTAGE_BAR_END	(79 - STATS_LEN)
+static uint64_t filename_to_u64(const char *filename, int base)
+{
+	char buf[64], *b;
 
-static void
-print_percentage_bar(float percent, int cur_line_len)
+	if (filename_to_buf(filename, buf, sizeof(buf)))
+		return 0;
+
+	/*
+	 * Handle both single integer and key=value formats by skipping
+	 * leading non-digits.
+	 */
+	b = buf;
+	while (*b && !isdigit(*b))
+		b++;
+
+	return strtoull(b, NULL, base);
+}
+
+static double filename_to_double(const char *filename)
 {
-	int bar_avail_len = (PERCENTAGE_BAR_END - cur_line_len - 1) * 8;
-	int bar_len = bar_avail_len * (percent + .5) / 100.0;
-	int i;
+	char *oldlocale;
+	char buf[80];
+	double v;
 
-	for (i = bar_len; i >= 8; i -= 8) {
-		printf("%s", bars[8]);
-		cur_line_len++;
-	}
-	if (i) {
-		printf("%s", bars[i]);
-		cur_line_len++;
-	}
+	if (filename_to_buf(filename, buf, sizeof(buf)))
+		return 0;
+
+	oldlocale = setlocale(LC_ALL, "C");
+	v = strtod(buf, NULL);
+	setlocale(LC_ALL, oldlocale);
 
-	/* NB: We can't use a field width with utf8 so we manually
-	* guarantee a field with of 45 chars for any bar. */
-	printf("%*s", PERCENTAGE_BAR_END - cur_line_len, "");
+	return v;
 }
 
-struct ring {
-	const char *name;
-	uint32_t mmio;
-	int head, tail, size;
-	uint64_t full;
-	int idle;
-};
+static uint64_t rapl_type_id(void)
+{
+	return filename_to_u64("/sys/devices/power/type", 10);
+}
 
-static uint32_t ring_read(struct ring *ring, uint32_t reg)
+static uint64_t rapl_gpu_power(void)
 {
-	return INREG(ring->mmio + reg);
+	return filename_to_u64("/sys/devices/power/events/energy-gpu", 0);
 }
 
-static void ring_init(struct ring *ring)
+static double rapl_gpu_power_scale(void)
 {
-	ring->size = (((ring_read(ring, RING_LEN) & RING_NR_PAGES) >> 12) + 1) * 4096;
+	return filename_to_double("/sys/devices/power/events/energy-gpu.scale");
 }
 
-static void ring_reset(struct ring *ring)
+static uint64_t imc_type_id(void)
 {
-	ring->idle = ring->full = 0;
+	return filename_to_u64("/sys/devices/uncore_imc/type", 10);
 }
 
-static void ring_sample(struct ring *ring)
+static uint64_t imc_data_reads(void)
 {
-	int full;
+	return filename_to_u64("/sys/devices/uncore_imc/events/data_reads", 0);
+}
 
-	if (!ring->size)
-		return;
+static double imc_data_reads_scale(void)
+{
+	return filename_to_double("/sys/devices/uncore_imc/events/data_reads.scale");
+}
 
-	ring->head = ring_read(ring, RING_HEAD) & HEAD_ADDR;
-	ring->tail = ring_read(ring, RING_TAIL) & TAIL_ADDR;
+static const char *imc_data_reads_unit(void)
+{
+	char buf[32];
 
-	if (ring->tail == ring->head)
-		ring->idle++;
+	if (filename_to_buf("/sys/devices/uncore_imc/events/data_reads.unit",
+			    buf, sizeof(buf)) == 0)
+		return strdup(buf);
+	else
+		return NULL;
+}
 
-	full = ring->tail - ring->head;
-	if (full < 0)
-		full += ring->size;
-	ring->full += full;
+static uint64_t imc_data_writes(void)
+{
+	return filename_to_u64("/sys/devices/uncore_imc/events/data_writes", 0);
 }
 
-static void ring_print_header(FILE *out, struct ring *ring)
+static double imc_data_writes_scale(void)
 {
-    fprintf(out, "%.6s%%\tops\t",
-            ring->name
-          );
+	return filename_to_double("/sys/devices/uncore_imc/events/data_writes.scale");
 }
 
-static void ring_print(struct ring *ring, unsigned long samples_per_sec)
+static const char *imc_data_writes_unit(void)
 {
-	int percent_busy, len;
+	char buf[32];
+
+	if (filename_to_buf("/sys/devices/uncore_imc/events/data_writes.unit",
+			    buf, sizeof(buf)) == 0)
+		return strdup(buf);
+	else
+		return NULL;
+}
+
+#define _open_pmu(cnt, pmu, fd) \
+({ \
+	int fd__; \
+\
+	fd__ = perf_i915_open_group((pmu)->config, (fd)); \
+	if (fd__ >= 0) { \
+		if ((fd) == -1) \
+			(fd) = fd__; \
+		(pmu)->present = true; \
+		(pmu)->idx = (cnt)++; \
+	} \
+\
+	fd__; \
+})
+
+#define _open_imc(cnt, pmu, fd) \
+({ \
+	int fd__; \
+\
+	fd__ = igt_perf_open_group(imc_type_id(), (pmu)->config, (fd)); \
+	if (fd__ >= 0) { \
+		if ((fd) == -1) \
+			(fd) = fd__; \
+		(pmu)->present = true; \
+		(pmu)->idx = (cnt)++; \
+	} \
+\
+	fd__; \
+})
+
+static int pmu_init(struct engines *engines)
+{
+	unsigned int i;
+	int fd;
+
+	engines->fd = -1;
+	engines->num_counters = 0;
+
+	engines->irq.config = I915_PMU_INTERRUPTS;
+	fd = _open_pmu(engines->num_counters, &engines->irq, engines->fd);
+	if (fd < 0)
+		return -1;
 
-	if (!ring->size)
+	engines->freq_req.config = I915_PMU_REQUESTED_FREQUENCY;
+	_open_pmu(engines->num_counters, &engines->freq_req, engines->fd);
+
+	engines->freq_act.config = I915_PMU_ACTUAL_FREQUENCY;
+	_open_pmu(engines->num_counters, &engines->freq_act, engines->fd);
+
+	engines->rc6.config = I915_PMU_RC6_RESIDENCY;
+	_open_pmu(engines->num_counters, &engines->rc6, engines->fd);
+
+	for (i = 0; i < engines->num_engines; i++) {
+		struct engine *engine = engine_ptr(engines, i);
+		struct {
+			struct pmu_counter *pmu;
+			const char *counter;
+			bool optional;
+		} *cnt, counters[] = {
+			{ .pmu = &engine->busy, .counter = "busy" },
+			{ .pmu = &engine->wait, .counter = "wait" },
+			{ .pmu = &engine->sema, .counter = "sema", .optional = true },
+			{ .pmu = NULL, .counter = NULL },
+		};
+
+		for (cnt = counters; cnt->pmu; cnt++) {
+			if (!cnt->pmu->config)
+				cnt->pmu->config =
+					get_pmu_config(dirfd(engines->root),
+						       engine->name,
+						       cnt->counter);
+			fd = _open_pmu(engines->num_counters, cnt->pmu,
+				       engines->fd);
+			if (fd < 0 && !cnt->optional)
+				return -1;
+		}
+	}
+
+	engines->rapl_fd = -1;
+	if (rapl_type_id()) {
+		engines->rapl_scale = rapl_gpu_power_scale();
+		if (engines->rapl_scale != NAN)
+			engines->rapl_scale *= 1e3; /* from nano to micro */
+
+		engines->rapl.config = rapl_gpu_power();
+		if (!engines->rapl.config)
+			return -1;
+
+		engines->rapl_fd = igt_perf_open(rapl_type_id(),
+						 engines->rapl.config);
+		if (engines->rapl_fd < 0)
+			return -1;
+
+		engines->rapl.present = true;
+	}
+
+	engines->imc_fd = -1;
+	if (imc_type_id()) {
+		unsigned int num = 0;
+
+		engines->imc_reads_scale = imc_data_reads_scale();
+		engines->imc_writes_scale = imc_data_writes_scale();
+
+		engines->imc_reads_unit = imc_data_reads_unit();
+		if (!engines->imc_reads_unit)
+			return -1;
+
+		engines->imc_writes_unit = imc_data_writes_unit();
+		if (!engines->imc_writes_unit)
+			return -1;
+
+		engines->imc_reads.config = imc_data_reads();
+		if (!engines->imc_reads.config)
+			return -1;
+
+		engines->imc_writes.config = imc_data_writes();
+		if (!engines->imc_writes.config)
+			return -1;
+
+		fd = _open_imc(num, &engines->imc_reads, engines->imc_fd);
+		if (fd < 0)
+			return -1;
+		fd = _open_imc(num, &engines->imc_writes, engines->imc_fd);
+		if (fd < 0)
+			return -1;
+
+		engines->imc_reads.present = true;
+		engines->imc_writes.present = true;
+	}
+
+	return 0;
+}
+
+static uint64_t pmu_read_multi(int fd, unsigned int num, uint64_t *val)
+{
+	uint64_t buf[2 + num];
+	unsigned int i;
+	ssize_t len;
+
+	memset(buf, 0, sizeof(buf));
+
+	len = read(fd, buf, sizeof(buf));
+	assert(len == sizeof(buf));
+
+	for (i = 0; i < num; i++)
+		val[i] = buf[2 + i];
+
+	return buf[1];
+}
+
+static double __pmu_calc(struct pmu_pair *p, double d, double t, double s)
+{
+	double v;
+
+	v = p->cur - p->prev;
+	v /= d;
+	v /= t;
+	v *= s;
+
+	if (s == 100.0 && v > 100.0)
+		v = 100.0;
+
+	return v;
+}
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wformat-nonliteral"
+static void pmu_calc(struct pmu_counter *cnt,
+		     char *buf, unsigned int bufsz, const char *fmt,
+		     double d, double t, double s)
+{
+	double val;
+	int len;
+
+	if (!cnt->present) {
+		strncpy(buf, "---", bufsz);
 		return;
+	}
 
-	percent_busy = 100 - 100 * ring->idle / samples_per_sec;
+	val = __pmu_calc(&cnt->val, d, t, s);
 
-	len = printf("%25s busy: %3d%%: ", ring->name, percent_busy);
-	print_percentage_bar (percent_busy, len);
-	printf("%24s space: %d/%d\n",
-		   ring->name,
-		   (int)(ring->full / samples_per_sec),
-		   ring->size);
+	len = snprintf(buf, bufsz, fmt, val);
+	if (len < 0 || len == bufsz) {
+		strncpy(buf, "XXX", bufsz);
+		return;
+	}
 }
+#pragma GCC diagnostic pop
 
-static void ring_log(struct ring *ring, unsigned long samples_per_sec,
-		FILE *output)
+static uint64_t __pmu_read_single(int fd, uint64_t *ts)
 {
-	if (ring->size)
-		fprintf(output, "%3d\t%d\t",
-			(int)(100 - 100 * ring->idle / samples_per_sec),
-			(int)(ring->full / samples_per_sec));
-	else
-		fprintf(output, "-1\t-1\t");
+	uint64_t data[2] = { };
+	ssize_t len;
+
+	len = read(fd, data, sizeof(data));
+	assert(len == sizeof(data));
+
+	if (ts)
+		*ts = data[1];
+
+	return data[0];
 }
 
+static uint64_t pmu_read_single(int fd)
+{
+	return __pmu_read_single(fd, NULL);
+}
+
+static void __update_sample(struct pmu_counter *counter, uint64_t val)
+{
+	counter->val.prev = counter->val.cur;
+	counter->val.cur = val;
+}
+
+static void update_sample(struct pmu_counter *counter, uint64_t *val)
+{
+	if (counter->present)
+		__update_sample(counter, val[counter->idx]);
+}
+
+static void pmu_sample(struct engines *engines)
+{
+	const int num_val = engines->num_counters;
+	uint64_t val[2 + num_val];
+	unsigned int i;
+
+	engines->ts.prev = engines->ts.cur;
+
+	if (engines->rapl_fd >= 0)
+		__update_sample(&engines->rapl,
+				pmu_read_single(engines->rapl_fd));
+
+	if (engines->imc_fd >= 0) {
+		pmu_read_multi(engines->imc_fd, 2, val);
+		update_sample(&engines->imc_reads, val);
+		update_sample(&engines->imc_writes, val);
+	}
+
+	engines->ts.cur = pmu_read_multi(engines->fd, num_val, val);
+
+	update_sample(&engines->freq_req, val);
+	update_sample(&engines->freq_act, val);
+	update_sample(&engines->irq, val);
+	update_sample(&engines->rc6, val);
+
+	for (i = 0; i < engines->num_engines; i++) {
+		struct engine *engine = engine_ptr(engines, i);
+
+		update_sample(&engine->busy, val);
+		update_sample(&engine->sema, val);
+		update_sample(&engine->wait, val);
+	}
+}
+
+static const char *bars[] = { " ", "▏", "▎", "▍", "▌", "▋", "▊", "▉", "█" };
+
+static void
+print_percentage_bar(double percent, int max_len)
+{
+	int bar_len = percent * (8 * (max_len - 2)) / 100.0;
+	int i;
+
+	putchar('|');
+
+	for (i = bar_len; i >= 8; i -= 8)
+		printf("%s", bars[8]);
+	if (i)
+		printf("%s", bars[i]);
+
+	for (i = 0; i < (max_len - 2 - (bar_len + 7) / 8); i++)
+		putchar(' ');
+
+	putchar('|');
+}
+
+#define DEFAULT_PERIOD_MS (1000)
+
 static void
 usage(const char *appname)
 {
 	printf("intel_gpu_top - Display a top-like summary of Intel GPU usage\n"
-			"\n"
-			"usage: %s [parameters]\n"
-			"\n"
-			"The following parameters apply:\n"
-			"[-s <samples>]       samples per seconds (default %d)\n"
-			"[-e <command>]       command to profile\n"
-			"[-o <file>]          output statistics to file. If file is '-',"
-			"                     run in batch mode and output statistics to stdio only \n"
-			"[-h]                 show this help screen\n"
-			"\n",
-			appname,
-			SAMPLES_PER_SEC
-		  );
-	return;
+		"\n"
+		"Usage: %s [parameters]\n"
+		"\n"
+		"\tThe following parameters are optional:\n"
+		"\t[-s <samples>]       refresh period in ms (default %ums)\n"
+		"\t[-h]                 show this help text\n"
+		"\n",
+		appname, DEFAULT_PERIOD_MS);
 }
 
 int main(int argc, char **argv)
 {
-	uint32_t devid;
-	struct pci_device *pci_dev;
-	struct ring render_ring = {
-		.name = "render",
-		.mmio = 0x2030,
-	}, bsd_ring = {
-		.name = "bitstream",
-		.mmio = 0x4030,
-	}, bsd6_ring = {
-		.name = "bitstream",
-		.mmio = 0x12030,
-	}, blt_ring = {
-		.name = "blitter",
-		.mmio = 0x22030,
-	};
-	int i, ch;
-	int samples_per_sec = SAMPLES_PER_SEC;
-	FILE *output = NULL;
-	double elapsed_time=0;
-	int print_headers=1;
-	pid_t child_pid=-1;
-	int child_stat;
-	char *cmd=NULL;
-	int interactive=1;
-
-	/* Parse options? */
-	while ((ch = getopt(argc, argv, "s:o:e:h")) != -1) {
+	unsigned int period_us = DEFAULT_PERIOD_MS * 1000;
+	int con_w = -1, con_h = -1;
+	struct engines *engines;
+	unsigned int i;
+	int ret, ch;
+
+	/* Parse options */
+	while ((ch = getopt(argc, argv, "s:h")) != -1) {
 		switch (ch) {
-		case 'e': cmd = strdup(optarg);
-			break;
-		case 's': samples_per_sec = atoi(optarg);
-			if (samples_per_sec < 100) {
-				fprintf(stderr, "Error: samples per second must be >= 100\n");
-				exit(1);
-			}
-			break;
-		case 'o':
-			if (!strcmp(optarg, "-")) {
-				/* Running in non-interactive mode */
-				interactive = 0;
-				output = stdout;
-			}
-			else
-				output = fopen(optarg, "w");
-			if (!output)
-			{
-				perror("fopen");
-				exit(1);
-			}
+		case 's':
+			period_us = atoi(optarg) * 1000;
 			break;
 		case 'h':
 			usage(argv[0]);
 			exit(0);
-			break;
 		default:
-			fprintf(stderr, "Invalid flag %c!\n", (char)optopt);
+			fprintf(stderr, "Invalid option %c!\n", (char)optopt);
 			usage(argv[0]);
 			exit(1);
-			break;
 		}
 	}
 
-	pci_dev = intel_get_pci_device();
-	devid = pci_dev->device_id;
-	intel_mmio_use_pci_bar(pci_dev);
-	init_instdone_definitions(devid);
-
-	/* Do we have a command to run? */
-	if (cmd != NULL) {
-		if (output) {
-			fprintf(output, "# Profiling: %s\n", cmd);
-			fflush(output);
-		}
-		child_pid = fork();
-		if (child_pid < 0) {
-			perror("fork");
-			exit(1);
-		}
-		else if (child_pid == 0) {
-			int res;
-			res = system(cmd);
-			if (res < 0)
-				perror("running command");
-			if (output) {
-				fflush(output);
-				fprintf(output, "# %s exited with status %d\n", cmd, res);
-				fflush(output);
-			}
-			free(cmd);
-			exit(0);
-		} else {
-			free(cmd);
-		}
-	}
-
-	for (i = 0; i < num_instdone_bits; i++) {
-		top_bits[i].bit = &instdone_bits[i];
-		top_bits[i].count = 0;
-		top_bits_sorted[i] = &top_bits[i];
+	engines = discover_engines();
+	if (!engines) {
+		fprintf(stderr,
+			"Failed to detect engines! Kernel 4.16 or newer?\n");
+		return 1;
 	}
 
-	/* Grab access to the registers */
-	intel_register_access_init(pci_dev, 0, -1);
-
-	ring_init(&render_ring);
-	if (IS_GEN4(devid) || IS_GEN5(devid))
-		ring_init(&bsd_ring);
-	if (IS_GEN6(devid) || IS_GEN7(devid)) {
-		ring_init(&bsd6_ring);
-		ring_init(&blt_ring);
+	ret = pmu_init(engines);
+	if (ret) {
+		fprintf(stderr,
+			"Failed to initialize PMU! Kernel 4.16 or newer?\n");
+		return 1;
 	}
 
-	/* Initialize GPU stats */
-	if (HAS_STATS_REGS(devid)) {
-		for (i = 0; i < STATS_COUNT; i++) {
-			uint32_t stats_high, stats_low, stats_high_2;
-
-			do {
-				stats_high = INREG(stats_regs[i] + 4);
-				stats_low = INREG(stats_regs[i]);
-				stats_high_2 = INREG(stats_regs[i] + 4);
-			} while (stats_high != stats_high_2);
-
-			last_stats[i] = (uint64_t)stats_high << 32 |
-				stats_low;
-		}
-	}
+	pmu_sample(engines);
 
 	for (;;) {
-		int j;
-		unsigned long long t1, ti, tf, t2;
-		unsigned long long def_sleep = 1000000 / samples_per_sec;
-		unsigned long long last_samples_per_sec = samples_per_sec;
-		unsigned short int max_lines;
+		double t;
+#define BUFSZ 16
+		char freq[BUFSZ];
+		char fact[BUFSZ];
+		char irq[BUFSZ];
+		char rc6[BUFSZ];
+		char power[BUFSZ];
+		char reads[BUFSZ];
+		char writes[BUFSZ];
 		struct winsize ws;
-		char clear_screen[] = {0x1b, '[', 'H',
-				       0x1b, '[', 'J',
-				       0x0};
-		int percent;
-		int len;
-
-		t1 = gettime();
-
-		ring_reset(&render_ring);
-		ring_reset(&bsd_ring);
-		ring_reset(&bsd6_ring);
-		ring_reset(&blt_ring);
-
-		for (i = 0; i < samples_per_sec; i++) {
-			long long interval;
-			ti = gettime();
-			if (IS_965(devid)) {
-				instdone = INREG(INSTDONE_I965);
-				instdone1 = INREG(INSTDONE_1);
-			} else
-				instdone = INREG(INSTDONE);
-
-			for (j = 0; j < num_instdone_bits; j++)
-				update_idle_bit(&top_bits[j]);
-
-			ring_sample(&render_ring);
-			ring_sample(&bsd_ring);
-			ring_sample(&bsd6_ring);
-			ring_sample(&blt_ring);
-
-			tf = gettime();
-			if (tf - t1 >= 1000000) {
-				/* We are out of sync, bail out */
-				last_samples_per_sec = i+1;
-				break;
-			}
-			interval = def_sleep - (tf - ti);
-			if (interval > 0)
-				usleep(interval);
-		}
-
-		if (HAS_STATS_REGS(devid)) {
-			for (i = 0; i < STATS_COUNT; i++) {
-				uint32_t stats_high, stats_low, stats_high_2;
+		int lines = 0;
 
-				do {
-					stats_high = INREG(stats_regs[i] + 4);
-					stats_low = INREG(stats_regs[i]);
-					stats_high_2 = INREG(stats_regs[i] + 4);
-				} while (stats_high != stats_high_2);
-
-				stats[i] = (uint64_t)stats_high << 32 |
-					stats_low;
-			}
+		/* Update terminal size. */
+		if (ioctl(0, TIOCGWINSZ, &ws) != -1) {
+			con_w = ws.ws_col;
+			con_h = ws.ws_row;
 		}
 
-		qsort(top_bits_sorted, num_instdone_bits,
-		      sizeof(struct top_bit *), top_bits_sort);
-
-		/* Limit the number of lines printed to the terminal height so the
-		 * most important info (at the top) will stay on screen. */
-		max_lines = -1;
-		if (ioctl(0, TIOCGWINSZ, &ws) != -1)
-			max_lines = ws.ws_row - 6; /* exclude header lines */
-		if (max_lines >= num_instdone_bits)
-			max_lines = num_instdone_bits;
-
-		t2 = gettime();
-		elapsed_time += (t2 - t1) / 1000000.0;
-
-		if (interactive) {
-			printf("%s", clear_screen);
-			print_clock_info(pci_dev);
-
-			ring_print(&render_ring, last_samples_per_sec);
-			ring_print(&bsd_ring, last_samples_per_sec);
-			ring_print(&bsd6_ring, last_samples_per_sec);
-			ring_print(&blt_ring, last_samples_per_sec);
-
-			printf("\n%30s  %s\n", "task", "percent busy");
-			for (i = 0; i < max_lines; i++) {
-				if (top_bits_sorted[i]->count > 0) {
-					percent = (top_bits_sorted[i]->count * 100) /
-						last_samples_per_sec;
-					len = printf("%30s: %3d%%: ",
-							 top_bits_sorted[i]->bit->name,
-							 percent);
-					print_percentage_bar (percent, len);
-				} else {
-					printf("%*s", PERCENTAGE_BAR_END, "");
-				}
-
-				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-					printf("%13s: %llu (%lld/sec)",
-						   stats_reg_names[i],
-						   (long long)stats[i],
-						   (long long)(stats[i] - last_stats[i]));
-					last_stats[i] = stats[i];
-				} else {
-					if (!top_bits_sorted[i]->count)
-						break;
-				}
-				printf("\n");
-			}
-		}
-		if (output) {
-			/* Print headers for columns at first run */
-			if (print_headers) {
-				fprintf(output, "# time\t");
-				ring_print_header(output, &render_ring);
-				ring_print_header(output, &bsd_ring);
-				ring_print_header(output, &bsd6_ring);
-				ring_print_header(output, &blt_ring);
-				for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
-					if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-						fprintf(output, "%.6s\t",
-							   stats_reg_names[i]
-							   );
-					}
-					if (!top_bits[i].count)
-						continue;
-				}
-				fprintf(output, "\n");
-				print_headers = 0;
-			}
-
-			/* Print statistics */
-			fprintf(output, "%.2f\t", elapsed_time);
-			ring_log(&render_ring, last_samples_per_sec, output);
-			ring_log(&bsd_ring, last_samples_per_sec, output);
-			ring_log(&bsd6_ring, last_samples_per_sec, output);
-			ring_log(&blt_ring, last_samples_per_sec, output);
-
-			for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
-				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-					fprintf(output, "%"PRIu64"\t",
-						   stats[i] - last_stats[i]);
-					last_stats[i] = stats[i];
-				}
-					if (!top_bits[i].count)
-						continue;
-			}
-			fprintf(output, "\n");
-			fflush(output);
+		pmu_sample(engines);
+		t = (double)(engines->ts.cur - engines->ts.prev) / 1e9;
+
+		printf("\033[H\033[J");
+
+		pmu_calc(&engines->freq_req, freq, BUFSZ, "%4.0f", 1.0, t, 1);
+		pmu_calc(&engines->freq_act, fact, BUFSZ, "%4.0f", 1.0, t, 1);
+		pmu_calc(&engines->irq, irq, BUFSZ, "%8.0f", 1.0, t, 1);
+		pmu_calc(&engines->rc6, rc6, BUFSZ, "%3.0f", 1e9, t, 100);
+		pmu_calc(&engines->rapl, power, BUFSZ, "%6.0f", 1.0, t,
+			 engines->rapl_scale);
+		pmu_calc(&engines->imc_reads, reads, BUFSZ, "%6.0f", 1.0, t,
+			 engines->imc_reads_scale);
+		pmu_calc(&engines->imc_writes, writes, BUFSZ, "%6.0f", 1.0, t,
+			 engines->imc_writes_scale);
+
+		printf("intel-gpu-top - %s/%s MHz;  %s%% RC6; %smW; %s irqs/s\n",
+		       fact, freq, rc6, power, irq);
+		lines++;
+
+		printf("\n");
+		lines++;
+
+		printf("%16s %s %s/s\n",
+		       "IMC reads:", reads, engines->imc_reads_unit);
+		lines++;
+
+		printf("%16s %s %s/s\n",
+		       "IMC writes:", writes, engines->imc_writes_unit);
+		lines++;
+
+		printf("\n");
+		lines++;
+
+		for (i = 0; i < engines->num_engines && lines < con_h; i++) {
+			struct engine *engine = engine_ptr(engines, i);
+			unsigned int max_w = con_w - 1;
+			unsigned int len;
+			char sema[BUFSZ];
+			char wait[BUFSZ];
+			char busy[BUFSZ];
+			char buf[128];
+			double val;
+
+			pmu_calc(&engine->sema, sema, BUFSZ, "%3.0f", 1e9, t,
+				 100);
+			pmu_calc(&engine->wait, wait, BUFSZ, "%3.0f", 1e9, t,
+				 100);
+			len = snprintf(buf, sizeof(buf), "%s%% sema, %s%% wait",
+				       sema, wait);
+
+			pmu_calc(&engine->busy, busy, BUFSZ, "%6.2f", 1e9, t,
+				 100);
+			len += printf("%16s %s%% ", engine->display_name, busy);
+
+			val = __pmu_calc(&engine->busy.val, 1e9, t, 100);
+			print_percentage_bar(val, max_w - len);
+
+			printf("%s\n", buf);
+
+			lines++;
 		}
 
-		for (i = 0; i < num_instdone_bits; i++) {
-			top_bits_sorted[i]->count = 0;
-
-			if (i < STATS_COUNT)
-				last_stats[i] = stats[i];
-		}
+		printf("\n");
 
-		/* Check if child has gone */
-		if (child_pid > 0) {
-			int res;
-			if ((res = waitpid(child_pid, &child_stat, WNOHANG)) == -1) {
-				perror("waitpid");
-				exit(1);
-			}
-			if (res == 0)
-				continue;
-			if (WIFEXITED(child_stat))
-				break;
-		}
+		usleep(period_us);
 	}
 
-	fclose(output);
-
-	intel_register_access_fini();
 	return 0;
 }
diff --git a/tools/meson.build b/tools/meson.build
index bd2d313d5156..a918eeb0bef1 100644
--- a/tools/meson.build
+++ b/tools/meson.build
@@ -23,7 +23,6 @@ tools_progs = [
 	'intel_gpu_frequency',
 	'intel_firmware_decode',
 	'intel_gpu_time',
-	'intel_gpu_top',
 	'intel_gtt',
 	'intel_guc_logger',
 	'intel_infoframes',
@@ -117,6 +116,11 @@ shared_library('intel_aubdump', 'aubdump.c',
 	       name_prefix : '',
 	       install : true)
 
+executable('intel_gpu_top', 'intel_gpu_top.c',
+	   install : true,
+	   install_rpath : rpathdir,
+	   dependencies : tool_deps + [ lib_igt_perf ])
+
 conf_data = configuration_data()
 conf_data.set('prefix', prefix)
 conf_data.set('exec_prefix', '${prefix}')
-- 
2.14.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [igt-dev] [PATCH i-g-t v3] intel-gpu-top: Rewrite the tool to be safe to use
@ 2018-03-29 18:46       ` Tvrtko Ursulin
  0 siblings, 0 replies; 57+ messages in thread
From: Tvrtko Ursulin @ 2018-03-29 18:46 UTC (permalink / raw)
  To: igt-dev; +Cc: Tvrtko Ursulin, Eero Tamminen, Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

intel-gpu-top is a dangerous tool which can hang machines due to unsafe mmio
register access. This patch rewrites it to use only PMU.

Only overall command streamer busyness and GPU global data such as power
and frequencies are included in this new version.

For access to more GPU functional unit level data, an OA metric based tool
like gpu-top should be used instead.

v2:
 * Sort engines by class and instance.
 * Do not wait for one sampling period to display something on screen.
 * Move code out of the asserts. (Rinat Ibragimov)
 * Continuously adapt to terminal size. (Rinat Ibragimov)

v3:
 * Change layout and precision of some fields. (Chris Wilson)
 Eero Tamminen:
 * Use more user friendly engine names.
 * Don't error out if a counter is missing.
 * Add IMC read/write bandwidth.
 * Report minimum required kernel version.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: Petri Latvala <petri.latvala@intel.com>
Cc: Eero Tamminen <eero.t.tamminen@intel.com>
Cc: Rinat Ibragimov <ibragimovrinat@mail.ru>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> # v1
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> # v0.5
---
 lib/igt_perf.c        |    6 +
 lib/igt_perf.h        |    1 +
 tools/Makefile.am     |    2 +
 tools/intel_gpu_top.c | 1216 ++++++++++++++++++++++++++-----------------------
 tools/meson.build     |    6 +-
 5 files changed, 657 insertions(+), 574 deletions(-)

diff --git a/lib/igt_perf.c b/lib/igt_perf.c
index 99d82ea51c9b..e3dec2cc29c7 100644
--- a/lib/igt_perf.c
+++ b/lib/igt_perf.c
@@ -69,3 +69,9 @@ int igt_perf_open(uint64_t type, uint64_t config)
 	return _perf_open(type, config, -1,
 			  PERF_FORMAT_TOTAL_TIME_ENABLED);
 }
+
+int igt_perf_open_group(uint64_t type, uint64_t config, int group)
+{
+	return _perf_open(type, config, group,
+			  PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_GROUP);
+}
diff --git a/lib/igt_perf.h b/lib/igt_perf.h
index 614ea5d23fa6..e00718f4769a 100644
--- a/lib/igt_perf.h
+++ b/lib/igt_perf.h
@@ -55,5 +55,6 @@ uint64_t i915_type_id(void);
 int perf_i915_open(uint64_t config);
 int perf_i915_open_group(uint64_t config, int group);
 int igt_perf_open(uint64_t type, uint64_t config);
+int igt_perf_open_group(uint64_t type, uint64_t config, int group);
 
 #endif /* I915_PERF_H */
diff --git a/tools/Makefile.am b/tools/Makefile.am
index 09b6dbcc3ece..a0b016ddd7ff 100644
--- a/tools/Makefile.am
+++ b/tools/Makefile.am
@@ -28,6 +28,8 @@ intel_aubdump_la_LDFLAGS = -module -avoid-version -no-undefined
 intel_aubdump_la_SOURCES = aubdump.c
 intel_aubdump_la_LIBADD = $(top_builddir)/lib/libintel_tools.la -ldl
 
+intel_gpu_top_LDADD = $(top_builddir)/lib/libigt_perf.la
+
 bin_SCRIPTS = intel_aubdump
 CLEANFILES = $(bin_SCRIPTS)
 
diff --git a/tools/intel_gpu_top.c b/tools/intel_gpu_top.c
index 098e6ce3ff86..46a3db42d3d6 100644
--- a/tools/intel_gpu_top.c
+++ b/tools/intel_gpu_top.c
@@ -1,6 +1,5 @@
 /*
- * Copyright © 2007 Intel Corporation
- * Copyright © 2011 Intel Corporation
+ * Copyright © 2018 Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -18,701 +17,772 @@
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *    Eric Anholt <eric@anholt.net>
- *    Eugeni Dodonov <eugeni.dodonov@intel.com>
- *
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
  */
 
-#include "config.h"
-
-#include <inttypes.h>
-#include <unistd.h>
-#include <stdlib.h>
 #include <stdio.h>
-#include <err.h>
-#include <sys/ioctl.h>
-#include <sys/time.h>
-#include <sys/wait.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <stdint.h>
+#include <assert.h>
 #include <string.h>
-#ifdef HAVE_TERMIOS_H
-#include <termios.h>
-#endif
-#include "intel_io.h"
-#include "instdone.h"
-#include "intel_reg.h"
-#include "intel_chipset.h"
-#include "drmtest.h"
-
-#define  FORCEWAKE	    0xA18C
-#define  FORCEWAKE_ACK	    0x130090
-
-#define SAMPLES_PER_SEC             10000
-#define SAMPLES_TO_PERCENT_RATIO    (SAMPLES_PER_SEC / 100)
-
-#define MAX_NUM_TOP_BITS            100
-
-#define HAS_STATS_REGS(devid)		IS_965(devid)
-
-struct top_bit {
-	struct instdone_bit *bit;
-	int count;
-} top_bits[MAX_NUM_TOP_BITS];
-struct top_bit *top_bits_sorted[MAX_NUM_TOP_BITS];
-
-static uint32_t instdone, instdone1;
-
-static const char *bars[] = {
-	" ",
-	"▏",
-	"▎",
-	"▍",
-	"▌",
-	"▋",
-	"▊",
-	"▉",
-	"█"
-};
+#include <ctype.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <sys/ioctl.h>
+#include <errno.h>
+#include <math.h>
+#include <locale.h>
+
+#include "igt_perf.h"
 
-enum stats_counts {
-	IA_VERTICES,
-	IA_PRIMITIVES,
-	VS_INVOCATION,
-	GS_INVOCATION,
-	GS_PRIMITIVES,
-	CL_INVOCATION,
-	CL_PRIMITIVES,
-	PS_INVOCATION,
-	PS_DEPTH,
-	STATS_COUNT
+struct pmu_pair {
+	uint64_t cur;
+	uint64_t prev;
 };
 
-const uint32_t stats_regs[STATS_COUNT] = {
-	IA_VERTICES_COUNT_QW,
-	IA_PRIMITIVES_COUNT_QW,
-	VS_INVOCATION_COUNT_QW,
-	GS_INVOCATION_COUNT_QW,
-	GS_PRIMITIVES_COUNT_QW,
-	CL_INVOCATION_COUNT_QW,
-	CL_PRIMITIVES_COUNT_QW,
-	PS_INVOCATION_COUNT_QW,
-	PS_DEPTH_COUNT_QW,
+struct pmu_counter {
+	bool present;
+	uint64_t config;
+	unsigned int idx;
+	struct pmu_pair val;
 };
 
-const char *stats_reg_names[STATS_COUNT] = {
-	"vert fetch",
-	"prim fetch",
-	"VS invocations",
-	"GS invocations",
-	"GS prims",
-	"CL invocations",
-	"CL prims",
-	"PS invocations",
-	"PS depth pass",
+struct engine {
+	const char *name;
+	const char *display_name;
+
+	unsigned int class;
+	unsigned int instance;
+
+	struct pmu_counter busy;
+	struct pmu_counter wait;
+	struct pmu_counter sema;
 };
 
-uint64_t stats[STATS_COUNT];
-uint64_t last_stats[STATS_COUNT];
+struct engines {
+	unsigned int num_engines;
+	unsigned int num_counters;
+	DIR *root;
+	int fd;
+	struct pmu_pair ts;
+
+	int rapl_fd;
+	double rapl_scale;
+
+	int imc_fd;
+	double imc_reads_scale;
+	const char *imc_reads_unit;
+	double imc_writes_scale;
+	const char *imc_writes_unit;
+
+	struct pmu_counter freq_req;
+	struct pmu_counter freq_act;
+	struct pmu_counter irq;
+	struct pmu_counter rc6;
+	struct pmu_counter rapl;
+	struct pmu_counter imc_reads;
+	struct pmu_counter imc_writes;
+
+	struct engine engine;
+};
 
-static unsigned long
-gettime(void)
+static uint64_t
+get_pmu_config(int dirfd, const char *name, const char *counter)
 {
-    struct timeval t;
-    gettimeofday(&t, NULL);
-    return (t.tv_usec + (t.tv_sec * 1000000));
-}
+	char buf[128], *p;
+	int fd, ret;
 
-static int
-top_bits_sort(const void *a, const void *b)
-{
-	struct top_bit * const *bit_a = a;
-	struct top_bit * const *bit_b = b;
-	int a_count = (*bit_a)->count;
-	int b_count = (*bit_b)->count;
+	ret = snprintf(buf, sizeof(buf), "%s-%s", name, counter);
+	if (ret < 0 || ret == sizeof(buf))
+		return -1;
 
-	if (a_count < b_count)
-		return 1;
-	else if (a_count == b_count)
-		return 0;
-	else
+	fd = openat(dirfd, buf, O_RDONLY);
+	if (fd < 0)
 		return -1;
-}
 
-static void
-update_idle_bit(struct top_bit *top_bit)
-{
-	uint32_t reg_val;
+	ret = read(fd, buf, sizeof(buf));
+	close(fd);
+	if (ret <= 0)
+		return -1;
 
-	if (top_bit->bit->reg == INSTDONE_1)
-		reg_val = instdone1;
-	else
-		reg_val = instdone;
+	p = index(buf, '0');
+	if (!p)
+		return -1;
 
-	if ((reg_val & top_bit->bit->bit) == 0)
-		top_bit->count++;
+	return strtoul(p, NULL, 0);
 }
 
-static void
-print_clock(const char *name, int clock) {
-	if (clock == -1)
-		printf("%s clock: unknown", name);
+#define engine_ptr(engines, n) \
+	((struct engine *)((unsigned char *)(&engines->engine) + (n) * sizeof(struct engine)))
+
+static const char *class_display_name(unsigned int class)
+{
+	switch (class) {
+	case I915_ENGINE_CLASS_RENDER:
+		return "Render/3D";
+	case I915_ENGINE_CLASS_COPY:
+		return "Blitter";
+	case I915_ENGINE_CLASS_VIDEO:
+		return "Video";
+	case I915_ENGINE_CLASS_VIDEO_ENHANCE:
+		return "VideoEnhance";
+	default:
+		return "[unknown]";
+	}
+}
+
+static int engine_cmp(const void *__a, const void *__b)
+{
+	const struct engine *a = (struct engine *)__a;
+	const struct engine *b = (struct engine *)__b;
+
+	if (a->class != b->class)
+		return a->class - b->class;
 	else
-		printf("%s clock: %d Mhz", name, clock);
+		return a->instance - b->instance;
 }
 
-static int
-print_clock_info(struct pci_device *pci_dev)
+static struct engines *discover_engines(void)
 {
-	uint32_t devid = pci_dev->device_id;
-	uint16_t gcfgc;
+	const char *sysfs_root = "/sys/devices/i915/events";
+	struct engines *engines;
+	struct dirent *dent;
+	int ret = 0;
+	DIR *d;
 
-	if (IS_GM45(devid)) {
-		int core_clock = -1;
+	engines = malloc(sizeof(struct engines));
+	if (!engines)
+		return NULL;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+	memset(engines, 0, sizeof(*engines));
 
-		switch (gcfgc & 0xf) {
-		case 8:
-			core_clock = 266;
-			break;
-		case 9:
-			core_clock = 320;
-			break;
-		case 11:
-			core_clock = 400;
-			break;
-		case 13:
-			core_clock = 533;
-			break;
-		}
-		print_clock("core", core_clock);
-	} else if (IS_965(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, sampler_clock = -1;
+	engines->num_engines = 0;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+	d = opendir(sysfs_root);
+	if (!d)
+		return NULL;
 
-		switch (gcfgc & 0xf) {
-		case 2:
-			render_clock = 250; sampler_clock = 267;
-			break;
-		case 3:
-			render_clock = 320; sampler_clock = 333;
-			break;
-		case 4:
-			render_clock = 400; sampler_clock = 444;
-			break;
-		case 5:
-			render_clock = 500; sampler_clock = 533;
+	while ((dent = readdir(d)) != NULL) {
+		const char *endswith = "-busy";
+		const unsigned int endlen = strlen(endswith);
+		struct engine *engine =
+				engine_ptr(engines, engines->num_engines);
+		char buf[256];
+
+		if (dent->d_type != DT_REG)
+			continue;
+
+		if (strlen(dent->d_name) >= sizeof(buf)) {
+			ret = -1;
 			break;
 		}
 
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("sampler", sampler_clock);
-	} else if (IS_945(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, display_clock = -1;
+		strcpy(buf, dent->d_name);
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+		/* xxxN-busy */
+		if (strlen(buf) < (endlen + 4))
+			continue;
+		if (strcmp(&buf[strlen(buf) - endlen], endswith))
+			continue;
 
-		switch (gcfgc & 0x7) {
-		case 0:
-			render_clock = 166;
-			break;
-		case 1:
-			render_clock = 200;
-			break;
-		case 3:
-			render_clock = 250;
-			break;
-		case 5:
-			render_clock = 400;
+		memset(engine, 0, sizeof(*engine));
+
+		buf[strlen(buf) - endlen] = 0;
+		engine->name = strdup(buf);
+		if (!engine->name) {
+			ret = -1;
 			break;
 		}
 
-		switch (gcfgc & 0x70) {
-		case 0:
-			display_clock = 200;
-			break;
-		case 4:
-			display_clock = 320;
+		engine->busy.config = get_pmu_config(dirfd(d), engine->name,
+						     "busy");
+		if (engine->busy.config == -1) {
+			ret = -1;
 			break;
 		}
-		if (gcfgc & (1 << 7))
-		    display_clock = 133;
 
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("display", display_clock);
-	} else if (IS_915(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, display_clock = -1;
+		engine->class = (engine->busy.config &
+				 (__I915_PMU_OTHER(0) - 1)) >>
+				I915_PMU_CLASS_SHIFT;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+		engine->instance = (engine->busy.config >>
+				    I915_PMU_SAMPLE_BITS) &
+				    ((1 << I915_PMU_SAMPLE_INSTANCE_BITS) - 1);
 
-		switch (gcfgc & 0x7) {
-		case 0:
-			render_clock = 160;
-			break;
-		case 1:
-			render_clock = 190;
-			break;
-		case 4:
-			render_clock = 333;
+		ret = snprintf(buf, sizeof(buf), "%s/%u",
+			       class_display_name(engine->class),
+			       engine->instance);
+		if (ret < 0 || ret == sizeof(buf)) {
+			ret = -1;
 			break;
 		}
-		if (gcfgc & (1 << 13))
-		    render_clock = 133;
+		ret = 0;
 
-		switch (gcfgc & 0x70) {
-		case 0:
-			display_clock = 190;
+		engine->display_name = strdup(buf);
+		if (!engine->display_name) {
+			ret = -1;
 			break;
-		case 4:
-			display_clock = 333;
+		}
+
+		engines->num_engines++;
+		engines = realloc(engines, sizeof(struct engines) +
+				  engines->num_engines * sizeof(struct engine));
+		if (!engines) {
+			ret = -ENOMEM;
 			break;
 		}
-		if (gcfgc & (1 << 7))
-		    display_clock = 133;
+	}
 
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("display", display_clock);
+	if (ret)
+		free(engines);
+	else {
+		qsort(engine_ptr(engines, 0), engines->num_engines,
+		      sizeof(struct engine), engine_cmp);
+
+		engines->root = d;
 	}
 
+	return ret == 0 ? engines : NULL;
+}
+
+static int
+filename_to_buf(const char *filename, char *buf, unsigned int bufsize)
+{
+	int fd;
+	ssize_t ret;
+
+	fd = open(filename, O_RDONLY);
+	if (fd < 0)
+		return -1;
+
+	ret = read(fd, buf, bufsize - 1);
+	close(fd);
+	if (ret < 1)
+		return -1;
+
+	buf[ret] = '\0';
 
-	printf("\n");
-	return -1;
+	return 0;
 }
 
-#define STATS_LEN (20)
-#define PERCENTAGE_BAR_END	(79 - STATS_LEN)
+static uint64_t filename_to_u64(const char *filename, int base)
+{
+	char buf[64], *b;
 
-static void
-print_percentage_bar(float percent, int cur_line_len)
+	if (filename_to_buf(filename, buf, sizeof(buf)))
+		return 0;
+
+	/*
+	 * Handle both single integer and key=value formats by skipping
+	 * leading non-digits.
+	 */
+	b = buf;
+	while (*b && !isdigit(*b))
+		b++;
+
+	return strtoull(b, NULL, base);
+}
+
+static double filename_to_double(const char *filename)
 {
-	int bar_avail_len = (PERCENTAGE_BAR_END - cur_line_len - 1) * 8;
-	int bar_len = bar_avail_len * (percent + .5) / 100.0;
-	int i;
+	char *oldlocale;
+	char buf[80];
+	double v;
 
-	for (i = bar_len; i >= 8; i -= 8) {
-		printf("%s", bars[8]);
-		cur_line_len++;
-	}
-	if (i) {
-		printf("%s", bars[i]);
-		cur_line_len++;
-	}
+	if (filename_to_buf(filename, buf, sizeof(buf)))
+		return 0;
+
+	oldlocale = setlocale(LC_ALL, "C");
+	v = strtod(buf, NULL);
+	setlocale(LC_ALL, oldlocale);
 
-	/* NB: We can't use a field width with utf8 so we manually
-	* guarantee a field with of 45 chars for any bar. */
-	printf("%*s", PERCENTAGE_BAR_END - cur_line_len, "");
+	return v;
 }
 
-struct ring {
-	const char *name;
-	uint32_t mmio;
-	int head, tail, size;
-	uint64_t full;
-	int idle;
-};
+static uint64_t rapl_type_id(void)
+{
+	return filename_to_u64("/sys/devices/power/type", 10);
+}
 
-static uint32_t ring_read(struct ring *ring, uint32_t reg)
+static uint64_t rapl_gpu_power(void)
 {
-	return INREG(ring->mmio + reg);
+	return filename_to_u64("/sys/devices/power/events/energy-gpu", 0);
 }
 
-static void ring_init(struct ring *ring)
+static double rapl_gpu_power_scale(void)
 {
-	ring->size = (((ring_read(ring, RING_LEN) & RING_NR_PAGES) >> 12) + 1) * 4096;
+	return filename_to_double("/sys/devices/power/events/energy-gpu.scale");
 }
 
-static void ring_reset(struct ring *ring)
+static uint64_t imc_type_id(void)
 {
-	ring->idle = ring->full = 0;
+	return filename_to_u64("/sys/devices/uncore_imc/type", 10);
 }
 
-static void ring_sample(struct ring *ring)
+static uint64_t imc_data_reads(void)
 {
-	int full;
+	return filename_to_u64("/sys/devices/uncore_imc/events/data_reads", 0);
+}
 
-	if (!ring->size)
-		return;
+static double imc_data_reads_scale(void)
+{
+	return filename_to_double("/sys/devices/uncore_imc/events/data_reads.scale");
+}
 
-	ring->head = ring_read(ring, RING_HEAD) & HEAD_ADDR;
-	ring->tail = ring_read(ring, RING_TAIL) & TAIL_ADDR;
+static const char *imc_data_reads_unit(void)
+{
+	char buf[32];
 
-	if (ring->tail == ring->head)
-		ring->idle++;
+	if (filename_to_buf("/sys/devices/uncore_imc/events/data_reads.unit",
+			    buf, sizeof(buf)) == 0)
+		return strdup(buf);
+	else
+		return NULL;
+}
 
-	full = ring->tail - ring->head;
-	if (full < 0)
-		full += ring->size;
-	ring->full += full;
+static uint64_t imc_data_writes(void)
+{
+	return filename_to_u64("/sys/devices/uncore_imc/events/data_writes", 0);
 }
 
-static void ring_print_header(FILE *out, struct ring *ring)
+static double imc_data_writes_scale(void)
 {
-    fprintf(out, "%.6s%%\tops\t",
-            ring->name
-          );
+	return filename_to_double("/sys/devices/uncore_imc/events/data_writes.scale");
 }
 
-static void ring_print(struct ring *ring, unsigned long samples_per_sec)
+static const char *imc_data_writes_unit(void)
 {
-	int percent_busy, len;
+	char buf[32];
+
+	if (filename_to_buf("/sys/devices/uncore_imc/events/data_writes.unit",
+			    buf, sizeof(buf)) == 0)
+		return strdup(buf);
+	else
+		return NULL;
+}
+
+#define _open_pmu(cnt, pmu, fd) \
+({ \
+	int fd__; \
+\
+	fd__ = perf_i915_open_group((pmu)->config, (fd)); \
+	if (fd__ >= 0) { \
+		if ((fd) == -1) \
+			(fd) = fd__; \
+		(pmu)->present = true; \
+		(pmu)->idx = (cnt)++; \
+	} \
+\
+	fd__; \
+})
+
+#define _open_imc(cnt, pmu, fd) \
+({ \
+	int fd__; \
+\
+	fd__ = igt_perf_open_group(imc_type_id(), (pmu)->config, (fd)); \
+	if (fd__ >= 0) { \
+		if ((fd) == -1) \
+			(fd) = fd__; \
+		(pmu)->present = true; \
+		(pmu)->idx = (cnt)++; \
+	} \
+\
+	fd__; \
+})
+
+static int pmu_init(struct engines *engines)
+{
+	unsigned int i;
+	int fd;
+
+	engines->fd = -1;
+	engines->num_counters = 0;
+
+	engines->irq.config = I915_PMU_INTERRUPTS;
+	fd = _open_pmu(engines->num_counters, &engines->irq, engines->fd);
+	if (fd < 0)
+		return -1;
 
-	if (!ring->size)
+	engines->freq_req.config = I915_PMU_REQUESTED_FREQUENCY;
+	_open_pmu(engines->num_counters, &engines->freq_req, engines->fd);
+
+	engines->freq_act.config = I915_PMU_ACTUAL_FREQUENCY;
+	_open_pmu(engines->num_counters, &engines->freq_act, engines->fd);
+
+	engines->rc6.config = I915_PMU_RC6_RESIDENCY;
+	_open_pmu(engines->num_counters, &engines->rc6, engines->fd);
+
+	for (i = 0; i < engines->num_engines; i++) {
+		struct engine *engine = engine_ptr(engines, i);
+		struct {
+			struct pmu_counter *pmu;
+			const char *counter;
+			bool optional;
+		} *cnt, counters[] = {
+			{ .pmu = &engine->busy, .counter = "busy" },
+			{ .pmu = &engine->wait, .counter = "wait" },
+			{ .pmu = &engine->sema, .counter = "sema", .optional = true },
+			{ .pmu = NULL, .counter = NULL },
+		};
+
+		for (cnt = counters; cnt->pmu; cnt++) {
+			if (!cnt->pmu->config)
+				cnt->pmu->config =
+					get_pmu_config(dirfd(engines->root),
+						       engine->name,
+						       cnt->counter);
+			fd = _open_pmu(engines->num_counters, cnt->pmu,
+				       engines->fd);
+			if (fd < 0 && !cnt->optional)
+				return -1;
+		}
+	}
+
+	engines->rapl_fd = -1;
+	if (rapl_type_id()) {
+		engines->rapl_scale = rapl_gpu_power_scale();
+		if (engines->rapl_scale != NAN)
+			engines->rapl_scale *= 1e3; /* from nano to micro */
+
+		engines->rapl.config = rapl_gpu_power();
+		if (!engines->rapl.config)
+			return -1;
+
+		engines->rapl_fd = igt_perf_open(rapl_type_id(),
+						 engines->rapl.config);
+		if (engines->rapl_fd < 0)
+			return -1;
+
+		engines->rapl.present = true;
+	}
+
+	engines->imc_fd = -1;
+	if (imc_type_id()) {
+		unsigned int num = 0;
+
+		engines->imc_reads_scale = imc_data_reads_scale();
+		engines->imc_writes_scale = imc_data_writes_scale();
+
+		engines->imc_reads_unit = imc_data_reads_unit();
+		if (!engines->imc_reads_unit)
+			return -1;
+
+		engines->imc_writes_unit = imc_data_writes_unit();
+		if (!engines->imc_writes_unit)
+			return -1;
+
+		engines->imc_reads.config = imc_data_reads();
+		if (!engines->imc_reads.config)
+			return -1;
+
+		engines->imc_writes.config = imc_data_writes();
+		if (!engines->imc_writes.config)
+			return -1;
+
+		fd = _open_imc(num, &engines->imc_reads, engines->imc_fd);
+		if (fd < 0)
+			return -1;
+		fd = _open_imc(num, &engines->imc_writes, engines->imc_fd);
+		if (fd < 0)
+			return -1;
+
+		engines->imc_reads.present = true;
+		engines->imc_writes.present = true;
+	}
+
+	return 0;
+}
+
+static uint64_t pmu_read_multi(int fd, unsigned int num, uint64_t *val)
+{
+	uint64_t buf[2 + num];
+	unsigned int i;
+	ssize_t len;
+
+	memset(buf, 0, sizeof(buf));
+
+	len = read(fd, buf, sizeof(buf));
+	assert(len == sizeof(buf));
+
+	for (i = 0; i < num; i++)
+		val[i] = buf[2 + i];
+
+	return buf[1];
+}
+
+static double __pmu_calc(struct pmu_pair *p, double d, double t, double s)
+{
+	double v;
+
+	v = p->cur - p->prev;
+	v /= d;
+	v /= t;
+	v *= s;
+
+	if (s == 100.0 && v > 100.0)
+		v = 100.0;
+
+	return v;
+}
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wformat-nonliteral"
+static void pmu_calc(struct pmu_counter *cnt,
+		     char *buf, unsigned int bufsz, const char *fmt,
+		     double d, double t, double s)
+{
+	double val;
+	int len;
+
+	if (!cnt->present) {
+		strncpy(buf, "---", bufsz);
 		return;
+	}
 
-	percent_busy = 100 - 100 * ring->idle / samples_per_sec;
+	val = __pmu_calc(&cnt->val, d, t, s);
 
-	len = printf("%25s busy: %3d%%: ", ring->name, percent_busy);
-	print_percentage_bar (percent_busy, len);
-	printf("%24s space: %d/%d\n",
-		   ring->name,
-		   (int)(ring->full / samples_per_sec),
-		   ring->size);
+	len = snprintf(buf, bufsz, fmt, val);
+	if (len < 0 || len == bufsz) {
+		strncpy(buf, "XXX", bufsz);
+		return;
+	}
 }
+#pragma GCC diagnostic pop
 
-static void ring_log(struct ring *ring, unsigned long samples_per_sec,
-		FILE *output)
+static uint64_t __pmu_read_single(int fd, uint64_t *ts)
 {
-	if (ring->size)
-		fprintf(output, "%3d\t%d\t",
-			(int)(100 - 100 * ring->idle / samples_per_sec),
-			(int)(ring->full / samples_per_sec));
-	else
-		fprintf(output, "-1\t-1\t");
+	uint64_t data[2] = { };
+	ssize_t len;
+
+	len = read(fd, data, sizeof(data));
+	assert(len == sizeof(data));
+
+	if (ts)
+		*ts = data[1];
+
+	return data[0];
 }
 
+static uint64_t pmu_read_single(int fd)
+{
+	return __pmu_read_single(fd, NULL);
+}
+
+static void __update_sample(struct pmu_counter *counter, uint64_t val)
+{
+	counter->val.prev = counter->val.cur;
+	counter->val.cur = val;
+}
+
+static void update_sample(struct pmu_counter *counter, uint64_t *val)
+{
+	if (counter->present)
+		__update_sample(counter, val[counter->idx]);
+}
+
+static void pmu_sample(struct engines *engines)
+{
+	const int num_val = engines->num_counters;
+	uint64_t val[2 + num_val];
+	unsigned int i;
+
+	engines->ts.prev = engines->ts.cur;
+
+	if (engines->rapl_fd >= 0)
+		__update_sample(&engines->rapl,
+				pmu_read_single(engines->rapl_fd));
+
+	if (engines->imc_fd >= 0) {
+		pmu_read_multi(engines->imc_fd, 2, val);
+		update_sample(&engines->imc_reads, val);
+		update_sample(&engines->imc_writes, val);
+	}
+
+	engines->ts.cur = pmu_read_multi(engines->fd, num_val, val);
+
+	update_sample(&engines->freq_req, val);
+	update_sample(&engines->freq_act, val);
+	update_sample(&engines->irq, val);
+	update_sample(&engines->rc6, val);
+
+	for (i = 0; i < engines->num_engines; i++) {
+		struct engine *engine = engine_ptr(engines, i);
+
+		update_sample(&engine->busy, val);
+		update_sample(&engine->sema, val);
+		update_sample(&engine->wait, val);
+	}
+}
+
+static const char *bars[] = { " ", "▏", "▎", "▍", "▌", "▋", "▊", "▉", "█" };
+
+static void
+print_percentage_bar(double percent, int max_len)
+{
+	int bar_len = percent * (8 * (max_len - 2)) / 100.0;
+	int i;
+
+	putchar('|');
+
+	for (i = bar_len; i >= 8; i -= 8)
+		printf("%s", bars[8]);
+	if (i)
+		printf("%s", bars[i]);
+
+	for (i = 0; i < (max_len - 2 - (bar_len + 7) / 8); i++)
+		putchar(' ');
+
+	putchar('|');
+}
+
+#define DEFAULT_PERIOD_MS (1000)
+
 static void
 usage(const char *appname)
 {
 	printf("intel_gpu_top - Display a top-like summary of Intel GPU usage\n"
-			"\n"
-			"usage: %s [parameters]\n"
-			"\n"
-			"The following parameters apply:\n"
-			"[-s <samples>]       samples per seconds (default %d)\n"
-			"[-e <command>]       command to profile\n"
-			"[-o <file>]          output statistics to file. If file is '-',"
-			"                     run in batch mode and output statistics to stdio only \n"
-			"[-h]                 show this help screen\n"
-			"\n",
-			appname,
-			SAMPLES_PER_SEC
-		  );
-	return;
+		"\n"
+		"Usage: %s [parameters]\n"
+		"\n"
+		"\tThe following parameters are optional:\n"
+		"\t[-s <samples>]       refresh period in ms (default %ums)\n"
+		"\t[-h]                 show this help text\n"
+		"\n",
+		appname, DEFAULT_PERIOD_MS);
 }
 
 int main(int argc, char **argv)
 {
-	uint32_t devid;
-	struct pci_device *pci_dev;
-	struct ring render_ring = {
-		.name = "render",
-		.mmio = 0x2030,
-	}, bsd_ring = {
-		.name = "bitstream",
-		.mmio = 0x4030,
-	}, bsd6_ring = {
-		.name = "bitstream",
-		.mmio = 0x12030,
-	}, blt_ring = {
-		.name = "blitter",
-		.mmio = 0x22030,
-	};
-	int i, ch;
-	int samples_per_sec = SAMPLES_PER_SEC;
-	FILE *output = NULL;
-	double elapsed_time=0;
-	int print_headers=1;
-	pid_t child_pid=-1;
-	int child_stat;
-	char *cmd=NULL;
-	int interactive=1;
-
-	/* Parse options? */
-	while ((ch = getopt(argc, argv, "s:o:e:h")) != -1) {
+	unsigned int period_us = DEFAULT_PERIOD_MS * 1000;
+	int con_w = -1, con_h = -1;
+	struct engines *engines;
+	unsigned int i;
+	int ret, ch;
+
+	/* Parse options */
+	while ((ch = getopt(argc, argv, "s:h")) != -1) {
 		switch (ch) {
-		case 'e': cmd = strdup(optarg);
-			break;
-		case 's': samples_per_sec = atoi(optarg);
-			if (samples_per_sec < 100) {
-				fprintf(stderr, "Error: samples per second must be >= 100\n");
-				exit(1);
-			}
-			break;
-		case 'o':
-			if (!strcmp(optarg, "-")) {
-				/* Running in non-interactive mode */
-				interactive = 0;
-				output = stdout;
-			}
-			else
-				output = fopen(optarg, "w");
-			if (!output)
-			{
-				perror("fopen");
-				exit(1);
-			}
+		case 's':
+			period_us = atoi(optarg) * 1000;
 			break;
 		case 'h':
 			usage(argv[0]);
 			exit(0);
-			break;
 		default:
-			fprintf(stderr, "Invalid flag %c!\n", (char)optopt);
+			fprintf(stderr, "Invalid option %c!\n", (char)optopt);
 			usage(argv[0]);
 			exit(1);
-			break;
 		}
 	}
 
-	pci_dev = intel_get_pci_device();
-	devid = pci_dev->device_id;
-	intel_mmio_use_pci_bar(pci_dev);
-	init_instdone_definitions(devid);
-
-	/* Do we have a command to run? */
-	if (cmd != NULL) {
-		if (output) {
-			fprintf(output, "# Profiling: %s\n", cmd);
-			fflush(output);
-		}
-		child_pid = fork();
-		if (child_pid < 0) {
-			perror("fork");
-			exit(1);
-		}
-		else if (child_pid == 0) {
-			int res;
-			res = system(cmd);
-			if (res < 0)
-				perror("running command");
-			if (output) {
-				fflush(output);
-				fprintf(output, "# %s exited with status %d\n", cmd, res);
-				fflush(output);
-			}
-			free(cmd);
-			exit(0);
-		} else {
-			free(cmd);
-		}
-	}
-
-	for (i = 0; i < num_instdone_bits; i++) {
-		top_bits[i].bit = &instdone_bits[i];
-		top_bits[i].count = 0;
-		top_bits_sorted[i] = &top_bits[i];
+	engines = discover_engines();
+	if (!engines) {
+		fprintf(stderr,
+			"Failed to detect engines! Kernel 4.16 or newer?\n");
+		return 1;
 	}
 
-	/* Grab access to the registers */
-	intel_register_access_init(pci_dev, 0, -1);
-
-	ring_init(&render_ring);
-	if (IS_GEN4(devid) || IS_GEN5(devid))
-		ring_init(&bsd_ring);
-	if (IS_GEN6(devid) || IS_GEN7(devid)) {
-		ring_init(&bsd6_ring);
-		ring_init(&blt_ring);
+	ret = pmu_init(engines);
+	if (ret) {
+		fprintf(stderr,
+			"Failed to initialize PMU! Kernel 4.16 or newer?\n");
+		return 1;
 	}
 
-	/* Initialize GPU stats */
-	if (HAS_STATS_REGS(devid)) {
-		for (i = 0; i < STATS_COUNT; i++) {
-			uint32_t stats_high, stats_low, stats_high_2;
-
-			do {
-				stats_high = INREG(stats_regs[i] + 4);
-				stats_low = INREG(stats_regs[i]);
-				stats_high_2 = INREG(stats_regs[i] + 4);
-			} while (stats_high != stats_high_2);
-
-			last_stats[i] = (uint64_t)stats_high << 32 |
-				stats_low;
-		}
-	}
+	pmu_sample(engines);
 
 	for (;;) {
-		int j;
-		unsigned long long t1, ti, tf, t2;
-		unsigned long long def_sleep = 1000000 / samples_per_sec;
-		unsigned long long last_samples_per_sec = samples_per_sec;
-		unsigned short int max_lines;
+		double t;
+#define BUFSZ 16
+		char freq[BUFSZ];
+		char fact[BUFSZ];
+		char irq[BUFSZ];
+		char rc6[BUFSZ];
+		char power[BUFSZ];
+		char reads[BUFSZ];
+		char writes[BUFSZ];
 		struct winsize ws;
-		char clear_screen[] = {0x1b, '[', 'H',
-				       0x1b, '[', 'J',
-				       0x0};
-		int percent;
-		int len;
-
-		t1 = gettime();
-
-		ring_reset(&render_ring);
-		ring_reset(&bsd_ring);
-		ring_reset(&bsd6_ring);
-		ring_reset(&blt_ring);
-
-		for (i = 0; i < samples_per_sec; i++) {
-			long long interval;
-			ti = gettime();
-			if (IS_965(devid)) {
-				instdone = INREG(INSTDONE_I965);
-				instdone1 = INREG(INSTDONE_1);
-			} else
-				instdone = INREG(INSTDONE);
-
-			for (j = 0; j < num_instdone_bits; j++)
-				update_idle_bit(&top_bits[j]);
-
-			ring_sample(&render_ring);
-			ring_sample(&bsd_ring);
-			ring_sample(&bsd6_ring);
-			ring_sample(&blt_ring);
-
-			tf = gettime();
-			if (tf - t1 >= 1000000) {
-				/* We are out of sync, bail out */
-				last_samples_per_sec = i+1;
-				break;
-			}
-			interval = def_sleep - (tf - ti);
-			if (interval > 0)
-				usleep(interval);
-		}
-
-		if (HAS_STATS_REGS(devid)) {
-			for (i = 0; i < STATS_COUNT; i++) {
-				uint32_t stats_high, stats_low, stats_high_2;
+		int lines = 0;
 
-				do {
-					stats_high = INREG(stats_regs[i] + 4);
-					stats_low = INREG(stats_regs[i]);
-					stats_high_2 = INREG(stats_regs[i] + 4);
-				} while (stats_high != stats_high_2);
-
-				stats[i] = (uint64_t)stats_high << 32 |
-					stats_low;
-			}
+		/* Update terminal size. */
+		if (ioctl(0, TIOCGWINSZ, &ws) != -1) {
+			con_w = ws.ws_col;
+			con_h = ws.ws_row;
 		}
 
-		qsort(top_bits_sorted, num_instdone_bits,
-		      sizeof(struct top_bit *), top_bits_sort);
-
-		/* Limit the number of lines printed to the terminal height so the
-		 * most important info (at the top) will stay on screen. */
-		max_lines = -1;
-		if (ioctl(0, TIOCGWINSZ, &ws) != -1)
-			max_lines = ws.ws_row - 6; /* exclude header lines */
-		if (max_lines >= num_instdone_bits)
-			max_lines = num_instdone_bits;
-
-		t2 = gettime();
-		elapsed_time += (t2 - t1) / 1000000.0;
-
-		if (interactive) {
-			printf("%s", clear_screen);
-			print_clock_info(pci_dev);
-
-			ring_print(&render_ring, last_samples_per_sec);
-			ring_print(&bsd_ring, last_samples_per_sec);
-			ring_print(&bsd6_ring, last_samples_per_sec);
-			ring_print(&blt_ring, last_samples_per_sec);
-
-			printf("\n%30s  %s\n", "task", "percent busy");
-			for (i = 0; i < max_lines; i++) {
-				if (top_bits_sorted[i]->count > 0) {
-					percent = (top_bits_sorted[i]->count * 100) /
-						last_samples_per_sec;
-					len = printf("%30s: %3d%%: ",
-							 top_bits_sorted[i]->bit->name,
-							 percent);
-					print_percentage_bar (percent, len);
-				} else {
-					printf("%*s", PERCENTAGE_BAR_END, "");
-				}
-
-				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-					printf("%13s: %llu (%lld/sec)",
-						   stats_reg_names[i],
-						   (long long)stats[i],
-						   (long long)(stats[i] - last_stats[i]));
-					last_stats[i] = stats[i];
-				} else {
-					if (!top_bits_sorted[i]->count)
-						break;
-				}
-				printf("\n");
-			}
-		}
-		if (output) {
-			/* Print headers for columns at first run */
-			if (print_headers) {
-				fprintf(output, "# time\t");
-				ring_print_header(output, &render_ring);
-				ring_print_header(output, &bsd_ring);
-				ring_print_header(output, &bsd6_ring);
-				ring_print_header(output, &blt_ring);
-				for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
-					if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-						fprintf(output, "%.6s\t",
-							   stats_reg_names[i]
-							   );
-					}
-					if (!top_bits[i].count)
-						continue;
-				}
-				fprintf(output, "\n");
-				print_headers = 0;
-			}
-
-			/* Print statistics */
-			fprintf(output, "%.2f\t", elapsed_time);
-			ring_log(&render_ring, last_samples_per_sec, output);
-			ring_log(&bsd_ring, last_samples_per_sec, output);
-			ring_log(&bsd6_ring, last_samples_per_sec, output);
-			ring_log(&blt_ring, last_samples_per_sec, output);
-
-			for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
-				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-					fprintf(output, "%"PRIu64"\t",
-						   stats[i] - last_stats[i]);
-					last_stats[i] = stats[i];
-				}
-					if (!top_bits[i].count)
-						continue;
-			}
-			fprintf(output, "\n");
-			fflush(output);
+		pmu_sample(engines);
+		t = (double)(engines->ts.cur - engines->ts.prev) / 1e9;
+
+		printf("\033[H\033[J");
+
+		pmu_calc(&engines->freq_req, freq, BUFSZ, "%4.0f", 1.0, t, 1);
+		pmu_calc(&engines->freq_act, fact, BUFSZ, "%4.0f", 1.0, t, 1);
+		pmu_calc(&engines->irq, irq, BUFSZ, "%8.0f", 1.0, t, 1);
+		pmu_calc(&engines->rc6, rc6, BUFSZ, "%3.0f", 1e9, t, 100);
+		pmu_calc(&engines->rapl, power, BUFSZ, "%6.0f", 1.0, t,
+			 engines->rapl_scale);
+		pmu_calc(&engines->imc_reads, reads, BUFSZ, "%6.0f", 1.0, t,
+			 engines->imc_reads_scale);
+		pmu_calc(&engines->imc_writes, writes, BUFSZ, "%6.0f", 1.0, t,
+			 engines->imc_writes_scale);
+
+		printf("intel-gpu-top - %s/%s MHz;  %s%% RC6; %smW; %s irqs/s\n",
+		       fact, freq, rc6, power, irq);
+		lines++;
+
+		printf("\n");
+		lines++;
+
+		printf("%16s %s %s/s\n",
+		       "IMC reads:", reads, engines->imc_reads_unit);
+		lines++;
+
+		printf("%16s %s %s/s\n",
+		       "IMC writes:", writes, engines->imc_writes_unit);
+		lines++;
+
+		printf("\n");
+		lines++;
+
+		for (i = 0; i < engines->num_engines && lines < con_h; i++) {
+			struct engine *engine = engine_ptr(engines, i);
+			unsigned int max_w = con_w - 1;
+			unsigned int len;
+			char sema[BUFSZ];
+			char wait[BUFSZ];
+			char busy[BUFSZ];
+			char buf[128];
+			double val;
+
+			pmu_calc(&engine->sema, sema, BUFSZ, "%3.0f", 1e9, t,
+				 100);
+			pmu_calc(&engine->wait, wait, BUFSZ, "%3.0f", 1e9, t,
+				 100);
+			len = snprintf(buf, sizeof(buf), "%s%% sema, %s%% wait",
+				       sema, wait);
+
+			pmu_calc(&engine->busy, busy, BUFSZ, "%6.2f", 1e9, t,
+				 100);
+			len += printf("%16s %s%% ", engine->display_name, busy);
+
+			val = __pmu_calc(&engine->busy.val, 1e9, t, 100);
+			print_percentage_bar(val, max_w - len);
+
+			printf("%s\n", buf);
+
+			lines++;
 		}
 
-		for (i = 0; i < num_instdone_bits; i++) {
-			top_bits_sorted[i]->count = 0;
-
-			if (i < STATS_COUNT)
-				last_stats[i] = stats[i];
-		}
+		printf("\n");
 
-		/* Check if child has gone */
-		if (child_pid > 0) {
-			int res;
-			if ((res = waitpid(child_pid, &child_stat, WNOHANG)) == -1) {
-				perror("waitpid");
-				exit(1);
-			}
-			if (res == 0)
-				continue;
-			if (WIFEXITED(child_stat))
-				break;
-		}
+		usleep(period_us);
 	}
 
-	fclose(output);
-
-	intel_register_access_fini();
 	return 0;
 }
diff --git a/tools/meson.build b/tools/meson.build
index bd2d313d5156..a918eeb0bef1 100644
--- a/tools/meson.build
+++ b/tools/meson.build
@@ -23,7 +23,6 @@ tools_progs = [
 	'intel_gpu_frequency',
 	'intel_firmware_decode',
 	'intel_gpu_time',
-	'intel_gpu_top',
 	'intel_gtt',
 	'intel_guc_logger',
 	'intel_infoframes',
@@ -117,6 +116,11 @@ shared_library('intel_aubdump', 'aubdump.c',
 	       name_prefix : '',
 	       install : true)
 
+executable('intel_gpu_top', 'intel_gpu_top.c',
+	   install : true,
+	   install_rpath : rpathdir,
+	   dependencies : tool_deps + [ lib_igt_perf ])
+
 conf_data = configuration_data()
 conf_data.set('prefix', prefix)
 conf_data.set('exec_prefix', '${prefix}')
-- 
2.14.1

_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [igt-dev] ✓ Fi.CI.BAT: success for intel-gpu-top: Rewrite the tool to be safe to use (rev3)
  2018-03-29 14:30     ` [Intel-gfx] " Eero Tamminen
  (?)
  (?)
@ 2018-03-29 19:10     ` Patchwork
  -1 siblings, 0 replies; 57+ messages in thread
From: Patchwork @ 2018-03-29 19:10 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev

== Series Details ==

Series: intel-gpu-top: Rewrite the tool to be safe to use (rev3)
URL   : https://patchwork.freedesktop.org/series/40826/
State : success

== Summary ==

IGT patchset tested on top of latest successful build
2cbd4ddf11b3eaf01f33d8bc2ad46411ec6c299a lib/igt_kms: Improve connector probing in igt_display_init(), v2.

with latest DRM-Tip kernel build CI_DRM_4008
befd0b655b91 drm-tip: 2018y-03m-29d-16h-19m-32s UTC integration manifest

No testlist changes.

---- Known issues:

Test drv_module_reload:
        Subgroup basic-reload-inject:
                pass       -> INCOMPLETE (fi-cnl-y3) fdo#105777
Test gem_mmap_gtt:
        Subgroup basic-small-bo-tiledx:
                pass       -> FAIL       (fi-gdg-551) fdo#102575
Test kms_chamelium:
        Subgroup dp-crc-fast:
                pass       -> DMESG-FAIL (fi-kbl-7500u) fdo#103841
Test prime_vgem:
        Subgroup basic-fence-flip:
                fail       -> PASS       (fi-ilk-650) fdo#104008

fdo#105777 https://bugs.freedesktop.org/show_bug.cgi?id=105777
fdo#102575 https://bugs.freedesktop.org/show_bug.cgi?id=102575
fdo#103841 https://bugs.freedesktop.org/show_bug.cgi?id=103841
fdo#104008 https://bugs.freedesktop.org/show_bug.cgi?id=104008

fi-bdw-5557u     total:285  pass:264  dwarn:0   dfail:0   fail:0   skip:21  time:429s
fi-bdw-gvtdvm    total:285  pass:261  dwarn:0   dfail:0   fail:0   skip:24  time:445s
fi-blb-e6850     total:285  pass:220  dwarn:1   dfail:0   fail:0   skip:64  time:386s
fi-bsw-n3050     total:285  pass:239  dwarn:0   dfail:0   fail:0   skip:46  time:541s
fi-bwr-2160      total:285  pass:180  dwarn:0   dfail:0   fail:0   skip:105 time:298s
fi-bxt-j4205     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:518s
fi-byt-j1900     total:285  pass:250  dwarn:0   dfail:0   fail:0   skip:35  time:523s
fi-byt-n2820     total:285  pass:246  dwarn:0   dfail:0   fail:0   skip:39  time:513s
fi-cfl-8700k     total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:411s
fi-cfl-s3        total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:560s
fi-cfl-u         total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:512s
fi-cnl-y3        total:284  pass:258  dwarn:0   dfail:0   fail:0   skip:25 
fi-elk-e7500     total:285  pass:225  dwarn:1   dfail:0   fail:0   skip:59  time:416s
fi-gdg-551       total:285  pass:176  dwarn:0   dfail:0   fail:1   skip:108 time:320s
fi-glk-1         total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:539s
fi-hsw-4770      total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:405s
fi-ilk-650       total:285  pass:225  dwarn:0   dfail:0   fail:0   skip:60  time:427s
fi-ivb-3520m     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:462s
fi-ivb-3770      total:285  pass:252  dwarn:0   dfail:0   fail:0   skip:33  time:433s
fi-kbl-7500u     total:285  pass:259  dwarn:1   dfail:1   fail:0   skip:24  time:476s
fi-kbl-7567u     total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:460s
fi-kbl-r         total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:510s
fi-pnv-d510      total:285  pass:219  dwarn:1   dfail:0   fail:0   skip:65  time:658s
fi-skl-6260u     total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:449s
fi-skl-6600u     total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:535s
fi-skl-6700k2    total:285  pass:261  dwarn:0   dfail:0   fail:0   skip:24  time:503s
fi-skl-6770hq    total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:496s
fi-skl-guc       total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:428s
fi-skl-gvtdvm    total:285  pass:262  dwarn:0   dfail:0   fail:0   skip:23  time:452s
fi-snb-2520m     total:285  pass:245  dwarn:0   dfail:0   fail:0   skip:40  time:602s
fi-snb-2600      total:285  pass:245  dwarn:0   dfail:0   fail:0   skip:40  time:398s
Blacklisted hosts:
fi-cnl-psr       total:285  pass:255  dwarn:3   dfail:0   fail:1   skip:26  time:524s
fi-glk-j4005     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:489s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1210/issues.html
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [igt-dev] ✗ Fi.CI.IGT: failure for intel-gpu-top: Rewrite the tool to be safe to use (rev2)
  2018-03-28 18:29 ` [Intel-gfx] " Tvrtko Ursulin
                   ` (8 preceding siblings ...)
  (?)
@ 2018-03-29 20:23 ` Patchwork
  -1 siblings, 0 replies; 57+ messages in thread
From: Patchwork @ 2018-03-29 20:23 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev

== Series Details ==

Series: intel-gpu-top: Rewrite the tool to be safe to use (rev2)
URL   : https://patchwork.freedesktop.org/series/40826/
State : failure

== Summary ==

---- Possible new issues:

Test kms_flip:
        Subgroup flip-vs-dpms-off-vs-modeset-interruptible:
                pass       -> DMESG-WARN (shard-hsw)
Test kms_frontbuffer_tracking:
        Subgroup fbc-1p-primscrn-pri-shrfb-draw-pwrite:
                pass       -> DMESG-WARN (shard-hsw)
        Subgroup fbc-1p-primscrn-shrfb-plflip-blt:
                pass       -> DMESG-FAIL (shard-apl)
Test pm_rc6_residency:
        Subgroup rc6-accuracy:
                skip       -> PASS       (shard-snb)

---- Known issues:

Test kms_cursor_legacy:
        Subgroup 2x-long-flip-vs-cursor-atomic:
                pass       -> FAIL       (shard-hsw) fdo#104873
Test kms_flip:
        Subgroup 2x-dpms-vs-vblank-race-interruptible:
                pass       -> FAIL       (shard-hsw) fdo#103060
        Subgroup 2x-flip-vs-absolute-wf_vblank:
                pass       -> FAIL       (shard-hsw) fdo#100368 +1
Test kms_pipe_crc_basic:
        Subgroup suspend-read-crc-pipe-a:
                incomplete -> PASS       (shard-hsw) fdo#103375
Test kms_rotation_crc:
        Subgroup sprite-rotation-180:
                fail       -> PASS       (shard-snb) fdo#103925

fdo#104873 https://bugs.freedesktop.org/show_bug.cgi?id=104873
fdo#103060 https://bugs.freedesktop.org/show_bug.cgi?id=103060
fdo#100368 https://bugs.freedesktop.org/show_bug.cgi?id=100368
fdo#103375 https://bugs.freedesktop.org/show_bug.cgi?id=103375
fdo#103925 https://bugs.freedesktop.org/show_bug.cgi?id=103925

shard-apl        total:3495 pass:1830 dwarn:1   dfail:1   fail:7   skip:1655 time:12927s
shard-hsw        total:3495 pass:1777 dwarn:3   dfail:0   fail:5   skip:1709 time:11550s
shard-snb        total:3495 pass:1376 dwarn:1   dfail:0   fail:2   skip:2116 time:7064s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1208/shards.html
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [igt-dev] ✗ Fi.CI.IGT: warning for intel-gpu-top: Rewrite the tool to be safe to use (rev3)
  2018-03-29 14:30     ` [Intel-gfx] " Eero Tamminen
                       ` (2 preceding siblings ...)
  (?)
@ 2018-03-29 23:29     ` Patchwork
  -1 siblings, 0 replies; 57+ messages in thread
From: Patchwork @ 2018-03-29 23:29 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev

== Series Details ==

Series: intel-gpu-top: Rewrite the tool to be safe to use (rev3)
URL   : https://patchwork.freedesktop.org/series/40826/
State : warning

== Summary ==

---- Possible new issues:

Test kms_cursor_legacy:
        Subgroup 2x-nonblocking-modeset-vs-cursor-atomic:
                pass       -> SKIP       (shard-hsw)
Test kms_frontbuffer_tracking:
        Subgroup psr-2p-scndscrn-cur-indfb-draw-mmap-gtt:
                fail       -> SKIP       (shard-snb)

---- Known issues:

Test kms_chv_cursor_fail:
        Subgroup pipe-c-128x128-right-edge:
                pass       -> FAIL       (shard-apl) fdo#104671
Test kms_cursor_legacy:
        Subgroup flip-vs-cursor-toggle:
                fail       -> PASS       (shard-hsw) fdo#102670
Test kms_flip:
        Subgroup 2x-flip-vs-expired-vblank:
                fail       -> PASS       (shard-hsw) fdo#102887 +1
        Subgroup plain-flip-fb-recreate-interruptible:
                pass       -> FAIL       (shard-hsw) fdo#100368
Test kms_plane_multiple:
        Subgroup atomic-pipe-a-tiling-x:
                pass       -> FAIL       (shard-snb) fdo#103166
Test kms_rotation_crc:
        Subgroup sprite-rotation-180:
                fail       -> PASS       (shard-snb) fdo#103925
Test kms_sysfs_edid_timing:
                warn       -> PASS       (shard-apl) fdo#100047
Test perf:
        Subgroup blocking:
                fail       -> PASS       (shard-hsw) fdo#102252

fdo#104671 https://bugs.freedesktop.org/show_bug.cgi?id=104671
fdo#102670 https://bugs.freedesktop.org/show_bug.cgi?id=102670
fdo#102887 https://bugs.freedesktop.org/show_bug.cgi?id=102887
fdo#100368 https://bugs.freedesktop.org/show_bug.cgi?id=100368
fdo#103166 https://bugs.freedesktop.org/show_bug.cgi?id=103166
fdo#103925 https://bugs.freedesktop.org/show_bug.cgi?id=103925
fdo#100047 https://bugs.freedesktop.org/show_bug.cgi?id=100047
fdo#102252 https://bugs.freedesktop.org/show_bug.cgi?id=102252

shard-apl        total:3495 pass:1831 dwarn:1   dfail:0   fail:8   skip:1655 time:12885s
shard-hsw        total:3495 pass:1780 dwarn:1   dfail:0   fail:3   skip:1710 time:11521s
shard-snb        total:3495 pass:1374 dwarn:1   dfail:0   fail:3   skip:2117 time:7032s
Blacklisted hosts:
shard-kbl        total:3401 pass:1895 dwarn:1   dfail:0   fail:7   skip:1495 time:8437s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1210/shards.html
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH i-g-t v3] intel-gpu-top: Rewrite the tool to be safe to use
  2018-03-29 18:46       ` [igt-dev] " Tvrtko Ursulin
  (?)
@ 2018-03-30 19:15       ` Rinat Ibragimov
  2018-04-03  9:14           ` [Intel-gfx] " Tvrtko Ursulin
  2018-04-03  9:38           ` Tvrtko Ursulin
  -1 siblings, 2 replies; 57+ messages in thread
From: Rinat Ibragimov @ 2018-03-30 19:15 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: Eero Tamminen, igt-dev, Intel-gfx



>Четверг, 29 марта 2018, 21:46 +03:00 от Tvrtko Ursulin <tursulin@ursulin.net>:
>

>+#define engine_ptr(engines, n) \
>+((struct engine *)((unsigned char *)(&engines->engine) + (n) * sizeof(struct engine)))

I think (&engines->engine + (n)) is easier to read.

>+if (fd < 0 && !cnt->optional)
>+return -1;

I've tried to run it on Skylake on Linux 4.16, and intel_gpu_top works as long as
I remove these lines. Otherwise it fails while trying "vcs1". The error message
mentions Linux 4.16, which is a bit confusing.

There is code that sets and tests the "present" field of struct pmu_counter. So,
I guess it's fine to remove this code, and thus make all counters optional?

>+
>+if (!cnt->present) {
>+strncpy(buf, "---", bufsz);
> return;
>+}

If you decide to make all counters optional, this will be used for "busy" numbers
too. But "busy" is 6 characters wide, unlike "sema" and "wait", which are 3 each.


>-- 
>2.14.1
>

---
Rinat
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [PATCH i-g-t v4] intel-gpu-top: Rewrite the tool to be safe to use
  2018-03-30 19:15       ` Rinat Ibragimov
@ 2018-04-03  9:14           ` Tvrtko Ursulin
  2018-04-03  9:38           ` Tvrtko Ursulin
  1 sibling, 0 replies; 57+ messages in thread
From: Tvrtko Ursulin @ 2018-04-03  9:14 UTC (permalink / raw)
  To: igt-dev; +Cc: Rinat Ibragimov, Eero Tamminen, Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

intel-gpu-top is a dangerous tool which can hang machines due to unsafe mmio
register access. This patch rewrites it to use only PMU.

Only overall command streamer busyness and GPU global data such as power
and frequencies are included in this new version.

For access to more GPU functional unit level data, an OA metric based tool
like gpu-top should be used instead.

v2:
 * Sort engines by class and instance.
 * Do not wait for one sampling period to display something on screen.
 * Move code out of the asserts. (Rinat Ibragimov)
 * Continuously adapt to terminal size. (Rinat Ibragimov)

v3:
 * Change layout and precision of some fields. (Chris Wilson)
 Eero Tamminen:
 * Use more user friendly engine names.
 * Don't error out if a counter is missing.
 * Add IMC read/write bandwidth.
 * Report minimum required kernel version.

v4:
 * Really support 4.16 by skipping missing engines.
 * Simpler and less hacky float printing.
 * Preserve copyright header. (Antonio Argenziano)
 * Simplify engines_ptr macro. (Rinat Ibragimov)

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: Petri Latvala <petri.latvala@intel.com>
Cc: Eero Tamminen <eero.t.tamminen@intel.com>
Cc: Rinat Ibragimov <ibragimovrinat@mail.ru>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> # v1
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> # v0.5
---
 lib/igt_perf.c        |    6 +
 lib/igt_perf.h        |    1 +
 tools/Makefile.am     |    2 +
 tools/intel_gpu_top.c | 1213 ++++++++++++++++++++++++++-----------------------
 tools/meson.build     |    6 +-
 5 files changed, 664 insertions(+), 564 deletions(-)

diff --git a/lib/igt_perf.c b/lib/igt_perf.c
index 99d82ea51c9b..e3dec2cc29c7 100644
--- a/lib/igt_perf.c
+++ b/lib/igt_perf.c
@@ -69,3 +69,9 @@ int igt_perf_open(uint64_t type, uint64_t config)
 	return _perf_open(type, config, -1,
 			  PERF_FORMAT_TOTAL_TIME_ENABLED);
 }
+
+int igt_perf_open_group(uint64_t type, uint64_t config, int group)
+{
+	return _perf_open(type, config, group,
+			  PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_GROUP);
+}
diff --git a/lib/igt_perf.h b/lib/igt_perf.h
index 614ea5d23fa6..e00718f4769a 100644
--- a/lib/igt_perf.h
+++ b/lib/igt_perf.h
@@ -55,5 +55,6 @@ uint64_t i915_type_id(void);
 int perf_i915_open(uint64_t config);
 int perf_i915_open_group(uint64_t config, int group);
 int igt_perf_open(uint64_t type, uint64_t config);
+int igt_perf_open_group(uint64_t type, uint64_t config, int group);
 
 #endif /* I915_PERF_H */
diff --git a/tools/Makefile.am b/tools/Makefile.am
index 09b6dbcc3ece..a0b016ddd7ff 100644
--- a/tools/Makefile.am
+++ b/tools/Makefile.am
@@ -28,6 +28,8 @@ intel_aubdump_la_LDFLAGS = -module -avoid-version -no-undefined
 intel_aubdump_la_SOURCES = aubdump.c
 intel_aubdump_la_LIBADD = $(top_builddir)/lib/libintel_tools.la -ldl
 
+intel_gpu_top_LDADD = $(top_builddir)/lib/libigt_perf.la
+
 bin_SCRIPTS = intel_aubdump
 CLEANFILES = $(bin_SCRIPTS)
 
diff --git a/tools/intel_gpu_top.c b/tools/intel_gpu_top.c
index 098e6ce3ff86..2f5b7badbbad 100644
--- a/tools/intel_gpu_top.c
+++ b/tools/intel_gpu_top.c
@@ -1,6 +1,5 @@
 /*
- * Copyright © 2007 Intel Corporation
- * Copyright © 2011 Intel Corporation
+ * Copyright © 2007-2018 Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -24,695 +23,783 @@
  * Authors:
  *    Eric Anholt <eric@anholt.net>
  *    Eugeni Dodonov <eugeni.dodonov@intel.com>
- *
  */
 
-#include "config.h"
-
-#include <inttypes.h>
-#include <unistd.h>
-#include <stdlib.h>
 #include <stdio.h>
-#include <err.h>
-#include <sys/ioctl.h>
-#include <sys/time.h>
-#include <sys/wait.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <stdint.h>
+#include <assert.h>
 #include <string.h>
-#ifdef HAVE_TERMIOS_H
-#include <termios.h>
-#endif
-#include "intel_io.h"
-#include "instdone.h"
-#include "intel_reg.h"
-#include "intel_chipset.h"
-#include "drmtest.h"
-
-#define  FORCEWAKE	    0xA18C
-#define  FORCEWAKE_ACK	    0x130090
-
-#define SAMPLES_PER_SEC             10000
-#define SAMPLES_TO_PERCENT_RATIO    (SAMPLES_PER_SEC / 100)
-
-#define MAX_NUM_TOP_BITS            100
-
-#define HAS_STATS_REGS(devid)		IS_965(devid)
-
-struct top_bit {
-	struct instdone_bit *bit;
-	int count;
-} top_bits[MAX_NUM_TOP_BITS];
-struct top_bit *top_bits_sorted[MAX_NUM_TOP_BITS];
-
-static uint32_t instdone, instdone1;
-
-static const char *bars[] = {
-	" ",
-	"▏",
-	"▎",
-	"▍",
-	"▌",
-	"▋",
-	"▊",
-	"▉",
-	"█"
-};
+#include <ctype.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <sys/ioctl.h>
+#include <errno.h>
+#include <math.h>
+#include <locale.h>
 
-enum stats_counts {
-	IA_VERTICES,
-	IA_PRIMITIVES,
-	VS_INVOCATION,
-	GS_INVOCATION,
-	GS_PRIMITIVES,
-	CL_INVOCATION,
-	CL_PRIMITIVES,
-	PS_INVOCATION,
-	PS_DEPTH,
-	STATS_COUNT
+#include "igt_perf.h"
+
+struct pmu_pair {
+	uint64_t cur;
+	uint64_t prev;
 };
 
-const uint32_t stats_regs[STATS_COUNT] = {
-	IA_VERTICES_COUNT_QW,
-	IA_PRIMITIVES_COUNT_QW,
-	VS_INVOCATION_COUNT_QW,
-	GS_INVOCATION_COUNT_QW,
-	GS_PRIMITIVES_COUNT_QW,
-	CL_INVOCATION_COUNT_QW,
-	CL_PRIMITIVES_COUNT_QW,
-	PS_INVOCATION_COUNT_QW,
-	PS_DEPTH_COUNT_QW,
+struct pmu_counter {
+	bool present;
+	uint64_t config;
+	unsigned int idx;
+	struct pmu_pair val;
 };
 
-const char *stats_reg_names[STATS_COUNT] = {
-	"vert fetch",
-	"prim fetch",
-	"VS invocations",
-	"GS invocations",
-	"GS prims",
-	"CL invocations",
-	"CL prims",
-	"PS invocations",
-	"PS depth pass",
+struct engine {
+	const char *name;
+	const char *display_name;
+
+	unsigned int class;
+	unsigned int instance;
+
+	unsigned int num_counters;
+
+	struct pmu_counter busy;
+	struct pmu_counter wait;
+	struct pmu_counter sema;
 };
 
-uint64_t stats[STATS_COUNT];
-uint64_t last_stats[STATS_COUNT];
+struct engines {
+	unsigned int num_engines;
+	unsigned int num_counters;
+	DIR *root;
+	int fd;
+	struct pmu_pair ts;
+
+	int rapl_fd;
+	double rapl_scale;
+
+	int imc_fd;
+	double imc_reads_scale;
+	const char *imc_reads_unit;
+	double imc_writes_scale;
+	const char *imc_writes_unit;
+
+	struct pmu_counter freq_req;
+	struct pmu_counter freq_act;
+	struct pmu_counter irq;
+	struct pmu_counter rc6;
+	struct pmu_counter rapl;
+	struct pmu_counter imc_reads;
+	struct pmu_counter imc_writes;
+
+	struct engine engine;
+};
 
-static unsigned long
-gettime(void)
+static uint64_t
+get_pmu_config(int dirfd, const char *name, const char *counter)
 {
-    struct timeval t;
-    gettimeofday(&t, NULL);
-    return (t.tv_usec + (t.tv_sec * 1000000));
-}
+	char buf[128], *p;
+	int fd, ret;
 
-static int
-top_bits_sort(const void *a, const void *b)
-{
-	struct top_bit * const *bit_a = a;
-	struct top_bit * const *bit_b = b;
-	int a_count = (*bit_a)->count;
-	int b_count = (*bit_b)->count;
+	ret = snprintf(buf, sizeof(buf), "%s-%s", name, counter);
+	if (ret < 0 || ret == sizeof(buf))
+		return -1;
 
-	if (a_count < b_count)
-		return 1;
-	else if (a_count == b_count)
-		return 0;
-	else
+	fd = openat(dirfd, buf, O_RDONLY);
+	if (fd < 0)
 		return -1;
-}
 
-static void
-update_idle_bit(struct top_bit *top_bit)
-{
-	uint32_t reg_val;
+	ret = read(fd, buf, sizeof(buf));
+	close(fd);
+	if (ret <= 0)
+		return -1;
 
-	if (top_bit->bit->reg == INSTDONE_1)
-		reg_val = instdone1;
-	else
-		reg_val = instdone;
+	p = index(buf, '0');
+	if (!p)
+		return -1;
 
-	if ((reg_val & top_bit->bit->bit) == 0)
-		top_bit->count++;
+	return strtoul(p, NULL, 0);
 }
 
-static void
-print_clock(const char *name, int clock) {
-	if (clock == -1)
-		printf("%s clock: unknown", name);
+#define engine_ptr(engines, n) (&engines->engine + (n))
+
+static const char *class_display_name(unsigned int class)
+{
+	switch (class) {
+	case I915_ENGINE_CLASS_RENDER:
+		return "Render/3D";
+	case I915_ENGINE_CLASS_COPY:
+		return "Blitter";
+	case I915_ENGINE_CLASS_VIDEO:
+		return "Video";
+	case I915_ENGINE_CLASS_VIDEO_ENHANCE:
+		return "VideoEnhance";
+	default:
+		return "[unknown]";
+	}
+}
+
+static int engine_cmp(const void *__a, const void *__b)
+{
+	const struct engine *a = (struct engine *)__a;
+	const struct engine *b = (struct engine *)__b;
+
+	if (a->class != b->class)
+		return a->class - b->class;
 	else
-		printf("%s clock: %d Mhz", name, clock);
+		return a->instance - b->instance;
 }
 
-static int
-print_clock_info(struct pci_device *pci_dev)
+static struct engines *discover_engines(void)
 {
-	uint32_t devid = pci_dev->device_id;
-	uint16_t gcfgc;
+	const char *sysfs_root = "/sys/devices/i915/events";
+	struct engines *engines;
+	struct dirent *dent;
+	int ret = 0;
+	DIR *d;
 
-	if (IS_GM45(devid)) {
-		int core_clock = -1;
+	engines = malloc(sizeof(struct engines));
+	if (!engines)
+		return NULL;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+	memset(engines, 0, sizeof(*engines));
 
-		switch (gcfgc & 0xf) {
-		case 8:
-			core_clock = 266;
-			break;
-		case 9:
-			core_clock = 320;
-			break;
-		case 11:
-			core_clock = 400;
-			break;
-		case 13:
-			core_clock = 533;
-			break;
-		}
-		print_clock("core", core_clock);
-	} else if (IS_965(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, sampler_clock = -1;
+	engines->num_engines = 0;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+	d = opendir(sysfs_root);
+	if (!d)
+		return NULL;
 
-		switch (gcfgc & 0xf) {
-		case 2:
-			render_clock = 250; sampler_clock = 267;
-			break;
-		case 3:
-			render_clock = 320; sampler_clock = 333;
-			break;
-		case 4:
-			render_clock = 400; sampler_clock = 444;
-			break;
-		case 5:
-			render_clock = 500; sampler_clock = 533;
+	while ((dent = readdir(d)) != NULL) {
+		const char *endswith = "-busy";
+		const unsigned int endlen = strlen(endswith);
+		struct engine *engine =
+				engine_ptr(engines, engines->num_engines);
+		char buf[256];
+
+		if (dent->d_type != DT_REG)
+			continue;
+
+		if (strlen(dent->d_name) >= sizeof(buf)) {
+			ret = -1;
 			break;
 		}
 
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("sampler", sampler_clock);
-	} else if (IS_945(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, display_clock = -1;
+		strcpy(buf, dent->d_name);
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+		/* xxxN-busy */
+		if (strlen(buf) < (endlen + 4))
+			continue;
+		if (strcmp(&buf[strlen(buf) - endlen], endswith))
+			continue;
 
-		switch (gcfgc & 0x7) {
-		case 0:
-			render_clock = 166;
-			break;
-		case 1:
-			render_clock = 200;
-			break;
-		case 3:
-			render_clock = 250;
-			break;
-		case 5:
-			render_clock = 400;
+		memset(engine, 0, sizeof(*engine));
+
+		buf[strlen(buf) - endlen] = 0;
+		engine->name = strdup(buf);
+		if (!engine->name) {
+			ret = -1;
 			break;
 		}
 
-		switch (gcfgc & 0x70) {
-		case 0:
-			display_clock = 200;
-			break;
-		case 4:
-			display_clock = 320;
+		engine->busy.config = get_pmu_config(dirfd(d), engine->name,
+						     "busy");
+		if (engine->busy.config == -1) {
+			ret = -1;
 			break;
 		}
-		if (gcfgc & (1 << 7))
-		    display_clock = 133;
 
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("display", display_clock);
-	} else if (IS_915(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, display_clock = -1;
+		engine->class = (engine->busy.config &
+				 (__I915_PMU_OTHER(0) - 1)) >>
+				I915_PMU_CLASS_SHIFT;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+		engine->instance = (engine->busy.config >>
+				    I915_PMU_SAMPLE_BITS) &
+				    ((1 << I915_PMU_SAMPLE_INSTANCE_BITS) - 1);
 
-		switch (gcfgc & 0x7) {
-		case 0:
-			render_clock = 160;
-			break;
-		case 1:
-			render_clock = 190;
-			break;
-		case 4:
-			render_clock = 333;
+		ret = snprintf(buf, sizeof(buf), "%s/%u",
+			       class_display_name(engine->class),
+			       engine->instance);
+		if (ret < 0 || ret == sizeof(buf)) {
+			ret = -1;
 			break;
 		}
-		if (gcfgc & (1 << 13))
-		    render_clock = 133;
+		ret = 0;
 
-		switch (gcfgc & 0x70) {
-		case 0:
-			display_clock = 190;
+		engine->display_name = strdup(buf);
+		if (!engine->display_name) {
+			ret = -1;
 			break;
-		case 4:
-			display_clock = 333;
+		}
+
+		engines->num_engines++;
+		engines = realloc(engines, sizeof(struct engines) +
+				  engines->num_engines * sizeof(struct engine));
+		if (!engines) {
+			ret = -ENOMEM;
 			break;
 		}
-		if (gcfgc & (1 << 7))
-		    display_clock = 133;
+	}
+
+	if (ret)
+		free(engines);
+	else {
+		qsort(engine_ptr(engines, 0), engines->num_engines,
+		      sizeof(struct engine), engine_cmp);
 
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("display", display_clock);
+		engines->root = d;
 	}
 
+	return ret == 0 ? engines : NULL;
+}
+
+static int
+filename_to_buf(const char *filename, char *buf, unsigned int bufsize)
+{
+	int fd;
+	ssize_t ret;
 
-	printf("\n");
-	return -1;
+	fd = open(filename, O_RDONLY);
+	if (fd < 0)
+		return -1;
+
+	ret = read(fd, buf, bufsize - 1);
+	close(fd);
+	if (ret < 1)
+		return -1;
+
+	buf[ret] = '\0';
+
+	return 0;
 }
 
-#define STATS_LEN (20)
-#define PERCENTAGE_BAR_END	(79 - STATS_LEN)
+static uint64_t filename_to_u64(const char *filename, int base)
+{
+	char buf[64], *b;
 
-static void
-print_percentage_bar(float percent, int cur_line_len)
+	if (filename_to_buf(filename, buf, sizeof(buf)))
+		return 0;
+
+	/*
+	 * Handle both single integer and key=value formats by skipping
+	 * leading non-digits.
+	 */
+	b = buf;
+	while (*b && !isdigit(*b))
+		b++;
+
+	return strtoull(b, NULL, base);
+}
+
+static double filename_to_double(const char *filename)
 {
-	int bar_avail_len = (PERCENTAGE_BAR_END - cur_line_len - 1) * 8;
-	int bar_len = bar_avail_len * (percent + .5) / 100.0;
-	int i;
+	char *oldlocale;
+	char buf[80];
+	double v;
 
-	for (i = bar_len; i >= 8; i -= 8) {
-		printf("%s", bars[8]);
-		cur_line_len++;
+	if (filename_to_buf(filename, buf, sizeof(buf)))
+		return 0;
+
+	oldlocale = setlocale(LC_ALL, "C");
+	v = strtod(buf, NULL);
+	setlocale(LC_ALL, oldlocale);
+
+	return v;
+}
+
+static uint64_t rapl_type_id(void)
+{
+	return filename_to_u64("/sys/devices/power/type", 10);
+}
+
+static uint64_t rapl_gpu_power(void)
+{
+	return filename_to_u64("/sys/devices/power/events/energy-gpu", 0);
+}
+
+static double rapl_gpu_power_scale(void)
+{
+	return filename_to_double("/sys/devices/power/events/energy-gpu.scale");
+}
+
+static uint64_t imc_type_id(void)
+{
+	return filename_to_u64("/sys/devices/uncore_imc/type", 10);
+}
+
+static uint64_t imc_data_reads(void)
+{
+	return filename_to_u64("/sys/devices/uncore_imc/events/data_reads", 0);
+}
+
+static double imc_data_reads_scale(void)
+{
+	return filename_to_double("/sys/devices/uncore_imc/events/data_reads.scale");
+}
+
+static const char *imc_data_reads_unit(void)
+{
+	char buf[32];
+
+	if (filename_to_buf("/sys/devices/uncore_imc/events/data_reads.unit",
+			    buf, sizeof(buf)) == 0)
+		return strdup(buf);
+	else
+		return NULL;
+}
+
+static uint64_t imc_data_writes(void)
+{
+	return filename_to_u64("/sys/devices/uncore_imc/events/data_writes", 0);
+}
+
+static double imc_data_writes_scale(void)
+{
+	return filename_to_double("/sys/devices/uncore_imc/events/data_writes.scale");
+}
+
+static const char *imc_data_writes_unit(void)
+{
+	char buf[32];
+
+	if (filename_to_buf("/sys/devices/uncore_imc/events/data_writes.unit",
+			    buf, sizeof(buf)) == 0)
+		return strdup(buf);
+	else
+		return NULL;
+}
+
+#define _open_pmu(cnt, pmu, fd) \
+({ \
+	int fd__; \
+\
+	fd__ = perf_i915_open_group((pmu)->config, (fd)); \
+	if (fd__ >= 0) { \
+		if ((fd) == -1) \
+			(fd) = fd__; \
+		(pmu)->present = true; \
+		(pmu)->idx = (cnt)++; \
+	} \
+\
+	fd__; \
+})
+
+#define _open_imc(cnt, pmu, fd) \
+({ \
+	int fd__; \
+\
+	fd__ = igt_perf_open_group(imc_type_id(), (pmu)->config, (fd)); \
+	if (fd__ >= 0) { \
+		if ((fd) == -1) \
+			(fd) = fd__; \
+		(pmu)->present = true; \
+		(pmu)->idx = (cnt)++; \
+	} \
+\
+	fd__; \
+})
+
+static int pmu_init(struct engines *engines)
+{
+	unsigned int i;
+	int fd;
+
+	engines->fd = -1;
+	engines->num_counters = 0;
+
+	engines->irq.config = I915_PMU_INTERRUPTS;
+	fd = _open_pmu(engines->num_counters, &engines->irq, engines->fd);
+	if (fd < 0)
+		return -1;
+
+	engines->freq_req.config = I915_PMU_REQUESTED_FREQUENCY;
+	_open_pmu(engines->num_counters, &engines->freq_req, engines->fd);
+
+	engines->freq_act.config = I915_PMU_ACTUAL_FREQUENCY;
+	_open_pmu(engines->num_counters, &engines->freq_act, engines->fd);
+
+	engines->rc6.config = I915_PMU_RC6_RESIDENCY;
+	_open_pmu(engines->num_counters, &engines->rc6, engines->fd);
+
+	for (i = 0; i < engines->num_engines; i++) {
+		struct engine *engine = engine_ptr(engines, i);
+		struct {
+			struct pmu_counter *pmu;
+			const char *counter;
+		} *cnt, counters[] = {
+			{ .pmu = &engine->busy, .counter = "busy" },
+			{ .pmu = &engine->wait, .counter = "wait" },
+			{ .pmu = &engine->sema, .counter = "sema" },
+			{ .pmu = NULL, .counter = NULL },
+		};
+
+		for (cnt = counters; cnt->pmu; cnt++) {
+			if (!cnt->pmu->config)
+				cnt->pmu->config =
+					get_pmu_config(dirfd(engines->root),
+						       engine->name,
+						       cnt->counter);
+			fd = _open_pmu(engines->num_counters, cnt->pmu,
+				       engines->fd);
+			if (fd >= 0)
+				engine->num_counters++;
+		}
 	}
-	if (i) {
-		printf("%s", bars[i]);
-		cur_line_len++;
+
+	engines->rapl_fd = -1;
+	if (rapl_type_id()) {
+		engines->rapl_scale = rapl_gpu_power_scale();
+		if (engines->rapl_scale != NAN)
+			engines->rapl_scale *= 1e3; /* from nano to micro */
+
+		engines->rapl.config = rapl_gpu_power();
+		if (!engines->rapl.config)
+			return -1;
+
+		engines->rapl_fd = igt_perf_open(rapl_type_id(),
+						 engines->rapl.config);
+		if (engines->rapl_fd < 0)
+			return -1;
+
+		engines->rapl.present = true;
 	}
 
-	/* NB: We can't use a field width with utf8 so we manually
-	* guarantee a field with of 45 chars for any bar. */
-	printf("%*s", PERCENTAGE_BAR_END - cur_line_len, "");
-}
+	engines->imc_fd = -1;
+	if (imc_type_id()) {
+		unsigned int num = 0;
 
-struct ring {
-	const char *name;
-	uint32_t mmio;
-	int head, tail, size;
-	uint64_t full;
-	int idle;
-};
+		engines->imc_reads_scale = imc_data_reads_scale();
+		engines->imc_writes_scale = imc_data_writes_scale();
+
+		engines->imc_reads_unit = imc_data_reads_unit();
+		if (!engines->imc_reads_unit)
+			return -1;
 
-static uint32_t ring_read(struct ring *ring, uint32_t reg)
+		engines->imc_writes_unit = imc_data_writes_unit();
+		if (!engines->imc_writes_unit)
+			return -1;
+
+		engines->imc_reads.config = imc_data_reads();
+		if (!engines->imc_reads.config)
+			return -1;
+
+		engines->imc_writes.config = imc_data_writes();
+		if (!engines->imc_writes.config)
+			return -1;
+
+		fd = _open_imc(num, &engines->imc_reads, engines->imc_fd);
+		if (fd < 0)
+			return -1;
+		fd = _open_imc(num, &engines->imc_writes, engines->imc_fd);
+		if (fd < 0)
+			return -1;
+
+		engines->imc_reads.present = true;
+		engines->imc_writes.present = true;
+	}
+
+	return 0;
+}
+
+static uint64_t pmu_read_multi(int fd, unsigned int num, uint64_t *val)
 {
-	return INREG(ring->mmio + reg);
+	uint64_t buf[2 + num];
+	unsigned int i;
+	ssize_t len;
+
+	memset(buf, 0, sizeof(buf));
+
+	len = read(fd, buf, sizeof(buf));
+	assert(len == sizeof(buf));
+
+	for (i = 0; i < num; i++)
+		val[i] = buf[2 + i];
+
+	return buf[1];
 }
 
-static void ring_init(struct ring *ring)
+static double __pmu_calc(struct pmu_pair *p, double d, double t, double s)
 {
-	ring->size = (((ring_read(ring, RING_LEN) & RING_NR_PAGES) >> 12) + 1) * 4096;
+	double v;
+
+	v = p->cur - p->prev;
+	v /= d;
+	v /= t;
+	v *= s;
+
+	if (s == 100.0 && v > 100.0)
+		v = 100.0;
+
+	return v;
 }
 
-static void ring_reset(struct ring *ring)
+static void fill_str(char *buf, unsigned int bufsz, char c, unsigned int num)
 {
-	ring->idle = ring->full = 0;
+	unsigned int i;
+
+	for (i = 0; i < num && i < (bufsz - 1); i++)
+		*buf++ = c;
+
+	*buf = 0;
 }
 
-static void ring_sample(struct ring *ring)
+static void pmu_calc(struct pmu_counter *cnt,
+		     char *buf, unsigned int bufsz,
+		     unsigned int width, unsigned width_dec,
+		     double d, double t, double s)
 {
-	int full;
+	double val;
+	int len;
+
+	assert(bufsz >= (width + width_dec + 1));
 
-	if (!ring->size)
+	if (!cnt->present) {
+		fill_str(buf, bufsz, '-', width + width_dec);
 		return;
+	}
+
+	val = __pmu_calc(&cnt->val, d, t, s);
+
+	len = snprintf(buf, bufsz, "%*.*f", width + width_dec, width_dec, val);
+	if (len < 0 || len == bufsz) {
+		fill_str(buf, bufsz, 'X', width + width_dec);
+		return;
+	}
+}
+
+static uint64_t __pmu_read_single(int fd, uint64_t *ts)
+{
+	uint64_t data[2] = { };
+	ssize_t len;
 
-	ring->head = ring_read(ring, RING_HEAD) & HEAD_ADDR;
-	ring->tail = ring_read(ring, RING_TAIL) & TAIL_ADDR;
+	len = read(fd, data, sizeof(data));
+	assert(len == sizeof(data));
 
-	if (ring->tail == ring->head)
-		ring->idle++;
+	if (ts)
+		*ts = data[1];
 
-	full = ring->tail - ring->head;
-	if (full < 0)
-		full += ring->size;
-	ring->full += full;
+	return data[0];
 }
 
-static void ring_print_header(FILE *out, struct ring *ring)
+static uint64_t pmu_read_single(int fd)
 {
-    fprintf(out, "%.6s%%\tops\t",
-            ring->name
-          );
+	return __pmu_read_single(fd, NULL);
 }
 
-static void ring_print(struct ring *ring, unsigned long samples_per_sec)
+static void __update_sample(struct pmu_counter *counter, uint64_t val)
 {
-	int percent_busy, len;
+	counter->val.prev = counter->val.cur;
+	counter->val.cur = val;
+}
 
-	if (!ring->size)
-		return;
+static void update_sample(struct pmu_counter *counter, uint64_t *val)
+{
+	if (counter->present)
+		__update_sample(counter, val[counter->idx]);
+}
+
+static void pmu_sample(struct engines *engines)
+{
+	const int num_val = engines->num_counters;
+	uint64_t val[2 + num_val];
+	unsigned int i;
 
-	percent_busy = 100 - 100 * ring->idle / samples_per_sec;
+	engines->ts.prev = engines->ts.cur;
 
-	len = printf("%25s busy: %3d%%: ", ring->name, percent_busy);
-	print_percentage_bar (percent_busy, len);
-	printf("%24s space: %d/%d\n",
-		   ring->name,
-		   (int)(ring->full / samples_per_sec),
-		   ring->size);
+	if (engines->rapl_fd >= 0)
+		__update_sample(&engines->rapl,
+				pmu_read_single(engines->rapl_fd));
+
+	if (engines->imc_fd >= 0) {
+		pmu_read_multi(engines->imc_fd, 2, val);
+		update_sample(&engines->imc_reads, val);
+		update_sample(&engines->imc_writes, val);
+	}
+
+	engines->ts.cur = pmu_read_multi(engines->fd, num_val, val);
+
+	update_sample(&engines->freq_req, val);
+	update_sample(&engines->freq_act, val);
+	update_sample(&engines->irq, val);
+	update_sample(&engines->rc6, val);
+
+	for (i = 0; i < engines->num_engines; i++) {
+		struct engine *engine = engine_ptr(engines, i);
+
+		update_sample(&engine->busy, val);
+		update_sample(&engine->sema, val);
+		update_sample(&engine->wait, val);
+	}
 }
 
-static void ring_log(struct ring *ring, unsigned long samples_per_sec,
-		FILE *output)
+static const char *bars[] = { " ", "▏", "▎", "▍", "▌", "▋", "▊", "▉", "█" };
+
+static void
+print_percentage_bar(double percent, int max_len)
 {
-	if (ring->size)
-		fprintf(output, "%3d\t%d\t",
-			(int)(100 - 100 * ring->idle / samples_per_sec),
-			(int)(ring->full / samples_per_sec));
-	else
-		fprintf(output, "-1\t-1\t");
+	int bar_len = percent * (8 * (max_len - 2)) / 100.0;
+	int i;
+
+	putchar('|');
+
+	for (i = bar_len; i >= 8; i -= 8)
+		printf("%s", bars[8]);
+	if (i)
+		printf("%s", bars[i]);
+
+	for (i = 0; i < (max_len - 2 - (bar_len + 7) / 8); i++)
+		putchar(' ');
+
+	putchar('|');
 }
 
+#define DEFAULT_PERIOD_MS (1000)
+
 static void
 usage(const char *appname)
 {
 	printf("intel_gpu_top - Display a top-like summary of Intel GPU usage\n"
-			"\n"
-			"usage: %s [parameters]\n"
-			"\n"
-			"The following parameters apply:\n"
-			"[-s <samples>]       samples per seconds (default %d)\n"
-			"[-e <command>]       command to profile\n"
-			"[-o <file>]          output statistics to file. If file is '-',"
-			"                     run in batch mode and output statistics to stdio only \n"
-			"[-h]                 show this help screen\n"
-			"\n",
-			appname,
-			SAMPLES_PER_SEC
-		  );
-	return;
+		"\n"
+		"Usage: %s [parameters]\n"
+		"\n"
+		"\tThe following parameters are optional:\n"
+		"\t[-s <samples>]       refresh period in ms (default %ums)\n"
+		"\t[-h]                 show this help text\n"
+		"\n",
+		appname, DEFAULT_PERIOD_MS);
 }
 
 int main(int argc, char **argv)
 {
-	uint32_t devid;
-	struct pci_device *pci_dev;
-	struct ring render_ring = {
-		.name = "render",
-		.mmio = 0x2030,
-	}, bsd_ring = {
-		.name = "bitstream",
-		.mmio = 0x4030,
-	}, bsd6_ring = {
-		.name = "bitstream",
-		.mmio = 0x12030,
-	}, blt_ring = {
-		.name = "blitter",
-		.mmio = 0x22030,
-	};
-	int i, ch;
-	int samples_per_sec = SAMPLES_PER_SEC;
-	FILE *output = NULL;
-	double elapsed_time=0;
-	int print_headers=1;
-	pid_t child_pid=-1;
-	int child_stat;
-	char *cmd=NULL;
-	int interactive=1;
-
-	/* Parse options? */
-	while ((ch = getopt(argc, argv, "s:o:e:h")) != -1) {
+	unsigned int period_us = DEFAULT_PERIOD_MS * 1000;
+	int con_w = -1, con_h = -1;
+	struct engines *engines;
+	unsigned int i;
+	int ret, ch;
+
+	/* Parse options */
+	while ((ch = getopt(argc, argv, "s:h")) != -1) {
 		switch (ch) {
-		case 'e': cmd = strdup(optarg);
-			break;
-		case 's': samples_per_sec = atoi(optarg);
-			if (samples_per_sec < 100) {
-				fprintf(stderr, "Error: samples per second must be >= 100\n");
-				exit(1);
-			}
-			break;
-		case 'o':
-			if (!strcmp(optarg, "-")) {
-				/* Running in non-interactive mode */
-				interactive = 0;
-				output = stdout;
-			}
-			else
-				output = fopen(optarg, "w");
-			if (!output)
-			{
-				perror("fopen");
-				exit(1);
-			}
+		case 's':
+			period_us = atoi(optarg) * 1000;
 			break;
 		case 'h':
 			usage(argv[0]);
 			exit(0);
-			break;
 		default:
-			fprintf(stderr, "Invalid flag %c!\n", (char)optopt);
+			fprintf(stderr, "Invalid option %c!\n", (char)optopt);
 			usage(argv[0]);
 			exit(1);
-			break;
 		}
 	}
 
-	pci_dev = intel_get_pci_device();
-	devid = pci_dev->device_id;
-	intel_mmio_use_pci_bar(pci_dev);
-	init_instdone_definitions(devid);
-
-	/* Do we have a command to run? */
-	if (cmd != NULL) {
-		if (output) {
-			fprintf(output, "# Profiling: %s\n", cmd);
-			fflush(output);
-		}
-		child_pid = fork();
-		if (child_pid < 0) {
-			perror("fork");
-			exit(1);
-		}
-		else if (child_pid == 0) {
-			int res;
-			res = system(cmd);
-			if (res < 0)
-				perror("running command");
-			if (output) {
-				fflush(output);
-				fprintf(output, "# %s exited with status %d\n", cmd, res);
-				fflush(output);
-			}
-			free(cmd);
-			exit(0);
-		} else {
-			free(cmd);
-		}
-	}
-
-	for (i = 0; i < num_instdone_bits; i++) {
-		top_bits[i].bit = &instdone_bits[i];
-		top_bits[i].count = 0;
-		top_bits_sorted[i] = &top_bits[i];
+	engines = discover_engines();
+	if (!engines) {
+		fprintf(stderr,
+			"Failed to detect engines! Kernel 4.16 or newer?\n");
+		return 1;
 	}
 
-	/* Grab access to the registers */
-	intel_register_access_init(pci_dev, 0, -1);
-
-	ring_init(&render_ring);
-	if (IS_GEN4(devid) || IS_GEN5(devid))
-		ring_init(&bsd_ring);
-	if (IS_GEN6(devid) || IS_GEN7(devid)) {
-		ring_init(&bsd6_ring);
-		ring_init(&blt_ring);
+	ret = pmu_init(engines);
+	if (ret) {
+		fprintf(stderr,
+			"Failed to initialize PMU! Kernel 4.16 or newer?\n");
+		return 1;
 	}
 
-	/* Initialize GPU stats */
-	if (HAS_STATS_REGS(devid)) {
-		for (i = 0; i < STATS_COUNT; i++) {
-			uint32_t stats_high, stats_low, stats_high_2;
-
-			do {
-				stats_high = INREG(stats_regs[i] + 4);
-				stats_low = INREG(stats_regs[i]);
-				stats_high_2 = INREG(stats_regs[i] + 4);
-			} while (stats_high != stats_high_2);
-
-			last_stats[i] = (uint64_t)stats_high << 32 |
-				stats_low;
-		}
-	}
+	pmu_sample(engines);
 
 	for (;;) {
-		int j;
-		unsigned long long t1, ti, tf, t2;
-		unsigned long long def_sleep = 1000000 / samples_per_sec;
-		unsigned long long last_samples_per_sec = samples_per_sec;
-		unsigned short int max_lines;
+		double t;
+#define BUFSZ 16
+		char freq[BUFSZ];
+		char fact[BUFSZ];
+		char irq[BUFSZ];
+		char rc6[BUFSZ];
+		char power[BUFSZ];
+		char reads[BUFSZ];
+		char writes[BUFSZ];
 		struct winsize ws;
-		char clear_screen[] = {0x1b, '[', 'H',
-				       0x1b, '[', 'J',
-				       0x0};
-		int percent;
-		int len;
-
-		t1 = gettime();
-
-		ring_reset(&render_ring);
-		ring_reset(&bsd_ring);
-		ring_reset(&bsd6_ring);
-		ring_reset(&blt_ring);
-
-		for (i = 0; i < samples_per_sec; i++) {
-			long long interval;
-			ti = gettime();
-			if (IS_965(devid)) {
-				instdone = INREG(INSTDONE_I965);
-				instdone1 = INREG(INSTDONE_1);
-			} else
-				instdone = INREG(INSTDONE);
-
-			for (j = 0; j < num_instdone_bits; j++)
-				update_idle_bit(&top_bits[j]);
-
-			ring_sample(&render_ring);
-			ring_sample(&bsd_ring);
-			ring_sample(&bsd6_ring);
-			ring_sample(&blt_ring);
-
-			tf = gettime();
-			if (tf - t1 >= 1000000) {
-				/* We are out of sync, bail out */
-				last_samples_per_sec = i+1;
-				break;
-			}
-			interval = def_sleep - (tf - ti);
-			if (interval > 0)
-				usleep(interval);
+		int lines = 0;
+
+		/* Update terminal size. */
+		if (ioctl(0, TIOCGWINSZ, &ws) != -1) {
+			con_w = ws.ws_col;
+			con_h = ws.ws_row;
 		}
 
-		if (HAS_STATS_REGS(devid)) {
-			for (i = 0; i < STATS_COUNT; i++) {
-				uint32_t stats_high, stats_low, stats_high_2;
+		pmu_sample(engines);
+		t = (double)(engines->ts.cur - engines->ts.prev) / 1e9;
+
+		printf("\033[H\033[J");
+
+		pmu_calc(&engines->freq_req, freq, BUFSZ, 4, 0, 1.0, t, 1);
+		pmu_calc(&engines->freq_act, fact, BUFSZ, 4, 0, 1.0, t, 1);
+		pmu_calc(&engines->irq, irq, BUFSZ, 8, 0, 1.0, t, 1);
+		pmu_calc(&engines->rc6, rc6, BUFSZ, 3, 0, 1e9, t, 100);
+		pmu_calc(&engines->rapl, power, BUFSZ, 6, 0, 1.0, t,
+			 engines->rapl_scale);
+		pmu_calc(&engines->imc_reads, reads, BUFSZ, 6, 0, 1.0, t,
+			 engines->imc_reads_scale);
+		pmu_calc(&engines->imc_writes, writes, BUFSZ, 6, 0, 1.0, t,
+			 engines->imc_writes_scale);
+
+		printf("intel-gpu-top - %s/%s MHz;  %s%% RC6; %smW; %s irqs/s\n",
+		       fact, freq, rc6, power, irq);
+		lines++;
+
+		printf("\n");
+		lines++;
+
+		printf("%16s %s %s/s\n",
+		       "IMC reads:", reads, engines->imc_reads_unit);
+		lines++;
+
+		printf("%16s %s %s/s\n",
+		       "IMC writes:", writes, engines->imc_writes_unit);
+		lines++;
+
+		printf("\n");
+		lines++;
+
+		for (i = 0; i < engines->num_engines && lines < con_h; i++) {
+			struct engine *engine = engine_ptr(engines, i);
+			unsigned int max_w = con_w - 1;
+			unsigned int len;
+			char sema[BUFSZ];
+			char wait[BUFSZ];
+			char busy[BUFSZ];
+			char buf[128];
+			double val;
+
+			if (!engine->num_counters)
+				continue;
 
-				do {
-					stats_high = INREG(stats_regs[i] + 4);
-					stats_low = INREG(stats_regs[i]);
-					stats_high_2 = INREG(stats_regs[i] + 4);
-				} while (stats_high != stats_high_2);
+			pmu_calc(&engine->sema, sema, BUFSZ, 3, 0, 1e9, t,
+				 100);
+			pmu_calc(&engine->wait, wait, BUFSZ, 3, 0, 1e9, t,
+				 100);
+			len = snprintf(buf, sizeof(buf), "%s%% sema, %s%% wait",
+				       sema, wait);
 
-				stats[i] = (uint64_t)stats_high << 32 |
-					stats_low;
-			}
-		}
+			pmu_calc(&engine->busy, busy, BUFSZ, 6, 2, 1e9, t,
+				 100);
+			len += printf("%16s %s%% ", engine->display_name, busy);
 
-		qsort(top_bits_sorted, num_instdone_bits,
-		      sizeof(struct top_bit *), top_bits_sort);
-
-		/* Limit the number of lines printed to the terminal height so the
-		 * most important info (at the top) will stay on screen. */
-		max_lines = -1;
-		if (ioctl(0, TIOCGWINSZ, &ws) != -1)
-			max_lines = ws.ws_row - 6; /* exclude header lines */
-		if (max_lines >= num_instdone_bits)
-			max_lines = num_instdone_bits;
-
-		t2 = gettime();
-		elapsed_time += (t2 - t1) / 1000000.0;
-
-		if (interactive) {
-			printf("%s", clear_screen);
-			print_clock_info(pci_dev);
-
-			ring_print(&render_ring, last_samples_per_sec);
-			ring_print(&bsd_ring, last_samples_per_sec);
-			ring_print(&bsd6_ring, last_samples_per_sec);
-			ring_print(&blt_ring, last_samples_per_sec);
-
-			printf("\n%30s  %s\n", "task", "percent busy");
-			for (i = 0; i < max_lines; i++) {
-				if (top_bits_sorted[i]->count > 0) {
-					percent = (top_bits_sorted[i]->count * 100) /
-						last_samples_per_sec;
-					len = printf("%30s: %3d%%: ",
-							 top_bits_sorted[i]->bit->name,
-							 percent);
-					print_percentage_bar (percent, len);
-				} else {
-					printf("%*s", PERCENTAGE_BAR_END, "");
-				}
-
-				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-					printf("%13s: %llu (%lld/sec)",
-						   stats_reg_names[i],
-						   (long long)stats[i],
-						   (long long)(stats[i] - last_stats[i]));
-					last_stats[i] = stats[i];
-				} else {
-					if (!top_bits_sorted[i]->count)
-						break;
-				}
-				printf("\n");
-			}
-		}
-		if (output) {
-			/* Print headers for columns at first run */
-			if (print_headers) {
-				fprintf(output, "# time\t");
-				ring_print_header(output, &render_ring);
-				ring_print_header(output, &bsd_ring);
-				ring_print_header(output, &bsd6_ring);
-				ring_print_header(output, &blt_ring);
-				for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
-					if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-						fprintf(output, "%.6s\t",
-							   stats_reg_names[i]
-							   );
-					}
-					if (!top_bits[i].count)
-						continue;
-				}
-				fprintf(output, "\n");
-				print_headers = 0;
-			}
-
-			/* Print statistics */
-			fprintf(output, "%.2f\t", elapsed_time);
-			ring_log(&render_ring, last_samples_per_sec, output);
-			ring_log(&bsd_ring, last_samples_per_sec, output);
-			ring_log(&bsd6_ring, last_samples_per_sec, output);
-			ring_log(&blt_ring, last_samples_per_sec, output);
-
-			for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
-				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-					fprintf(output, "%"PRIu64"\t",
-						   stats[i] - last_stats[i]);
-					last_stats[i] = stats[i];
-				}
-					if (!top_bits[i].count)
-						continue;
-			}
-			fprintf(output, "\n");
-			fflush(output);
-		}
+			val = __pmu_calc(&engine->busy.val, 1e9, t, 100);
+			print_percentage_bar(val, max_w - len);
 
-		for (i = 0; i < num_instdone_bits; i++) {
-			top_bits_sorted[i]->count = 0;
+			printf("%s\n", buf);
 
-			if (i < STATS_COUNT)
-				last_stats[i] = stats[i];
+			lines++;
 		}
 
-		/* Check if child has gone */
-		if (child_pid > 0) {
-			int res;
-			if ((res = waitpid(child_pid, &child_stat, WNOHANG)) == -1) {
-				perror("waitpid");
-				exit(1);
-			}
-			if (res == 0)
-				continue;
-			if (WIFEXITED(child_stat))
-				break;
-		}
-	}
+		printf("\n");
 
-	fclose(output);
+		usleep(period_us);
+	}
 
-	intel_register_access_fini();
 	return 0;
 }
diff --git a/tools/meson.build b/tools/meson.build
index bd2d313d5156..a918eeb0bef1 100644
--- a/tools/meson.build
+++ b/tools/meson.build
@@ -23,7 +23,6 @@ tools_progs = [
 	'intel_gpu_frequency',
 	'intel_firmware_decode',
 	'intel_gpu_time',
-	'intel_gpu_top',
 	'intel_gtt',
 	'intel_guc_logger',
 	'intel_infoframes',
@@ -117,6 +116,11 @@ shared_library('intel_aubdump', 'aubdump.c',
 	       name_prefix : '',
 	       install : true)
 
+executable('intel_gpu_top', 'intel_gpu_top.c',
+	   install : true,
+	   install_rpath : rpathdir,
+	   dependencies : tool_deps + [ lib_igt_perf ])
+
 conf_data = configuration_data()
 conf_data.set('prefix', prefix)
 conf_data.set('exec_prefix', '${prefix}')
-- 
2.14.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [Intel-gfx] [PATCH i-g-t v4] intel-gpu-top: Rewrite the tool to be safe to use
@ 2018-04-03  9:14           ` Tvrtko Ursulin
  0 siblings, 0 replies; 57+ messages in thread
From: Tvrtko Ursulin @ 2018-04-03  9:14 UTC (permalink / raw)
  To: igt-dev; +Cc: Rinat Ibragimov, Eero Tamminen, Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

intel-gpu-top is a dangerous tool which can hang machines due to unsafe mmio
register access. This patch rewrites it to use only PMU.

Only overall command streamer busyness and GPU global data such as power
and frequencies are included in this new version.

For access to more GPU functional unit level data, an OA metric based tool
like gpu-top should be used instead.

v2:
 * Sort engines by class and instance.
 * Do not wait for one sampling period to display something on screen.
 * Move code out of the asserts. (Rinat Ibragimov)
 * Continuously adapt to terminal size. (Rinat Ibragimov)

v3:
 * Change layout and precision of some fields. (Chris Wilson)
 Eero Tamminen:
 * Use more user friendly engine names.
 * Don't error out if a counter is missing.
 * Add IMC read/write bandwidth.
 * Report minimum required kernel version.

v4:
 * Really support 4.16 by skipping missing engines.
 * Simpler and less hacky float printing.
 * Preserve copyright header. (Antonio Argenziano)
 * Simplify engines_ptr macro. (Rinat Ibragimov)

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: Petri Latvala <petri.latvala@intel.com>
Cc: Eero Tamminen <eero.t.tamminen@intel.com>
Cc: Rinat Ibragimov <ibragimovrinat@mail.ru>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> # v1
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> # v0.5
---
 lib/igt_perf.c        |    6 +
 lib/igt_perf.h        |    1 +
 tools/Makefile.am     |    2 +
 tools/intel_gpu_top.c | 1213 ++++++++++++++++++++++++++-----------------------
 tools/meson.build     |    6 +-
 5 files changed, 664 insertions(+), 564 deletions(-)

diff --git a/lib/igt_perf.c b/lib/igt_perf.c
index 99d82ea51c9b..e3dec2cc29c7 100644
--- a/lib/igt_perf.c
+++ b/lib/igt_perf.c
@@ -69,3 +69,9 @@ int igt_perf_open(uint64_t type, uint64_t config)
 	return _perf_open(type, config, -1,
 			  PERF_FORMAT_TOTAL_TIME_ENABLED);
 }
+
+int igt_perf_open_group(uint64_t type, uint64_t config, int group)
+{
+	return _perf_open(type, config, group,
+			  PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_GROUP);
+}
diff --git a/lib/igt_perf.h b/lib/igt_perf.h
index 614ea5d23fa6..e00718f4769a 100644
--- a/lib/igt_perf.h
+++ b/lib/igt_perf.h
@@ -55,5 +55,6 @@ uint64_t i915_type_id(void);
 int perf_i915_open(uint64_t config);
 int perf_i915_open_group(uint64_t config, int group);
 int igt_perf_open(uint64_t type, uint64_t config);
+int igt_perf_open_group(uint64_t type, uint64_t config, int group);
 
 #endif /* I915_PERF_H */
diff --git a/tools/Makefile.am b/tools/Makefile.am
index 09b6dbcc3ece..a0b016ddd7ff 100644
--- a/tools/Makefile.am
+++ b/tools/Makefile.am
@@ -28,6 +28,8 @@ intel_aubdump_la_LDFLAGS = -module -avoid-version -no-undefined
 intel_aubdump_la_SOURCES = aubdump.c
 intel_aubdump_la_LIBADD = $(top_builddir)/lib/libintel_tools.la -ldl
 
+intel_gpu_top_LDADD = $(top_builddir)/lib/libigt_perf.la
+
 bin_SCRIPTS = intel_aubdump
 CLEANFILES = $(bin_SCRIPTS)
 
diff --git a/tools/intel_gpu_top.c b/tools/intel_gpu_top.c
index 098e6ce3ff86..2f5b7badbbad 100644
--- a/tools/intel_gpu_top.c
+++ b/tools/intel_gpu_top.c
@@ -1,6 +1,5 @@
 /*
- * Copyright © 2007 Intel Corporation
- * Copyright © 2011 Intel Corporation
+ * Copyright © 2007-2018 Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -24,695 +23,783 @@
  * Authors:
  *    Eric Anholt <eric@anholt.net>
  *    Eugeni Dodonov <eugeni.dodonov@intel.com>
- *
  */
 
-#include "config.h"
-
-#include <inttypes.h>
-#include <unistd.h>
-#include <stdlib.h>
 #include <stdio.h>
-#include <err.h>
-#include <sys/ioctl.h>
-#include <sys/time.h>
-#include <sys/wait.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <stdint.h>
+#include <assert.h>
 #include <string.h>
-#ifdef HAVE_TERMIOS_H
-#include <termios.h>
-#endif
-#include "intel_io.h"
-#include "instdone.h"
-#include "intel_reg.h"
-#include "intel_chipset.h"
-#include "drmtest.h"
-
-#define  FORCEWAKE	    0xA18C
-#define  FORCEWAKE_ACK	    0x130090
-
-#define SAMPLES_PER_SEC             10000
-#define SAMPLES_TO_PERCENT_RATIO    (SAMPLES_PER_SEC / 100)
-
-#define MAX_NUM_TOP_BITS            100
-
-#define HAS_STATS_REGS(devid)		IS_965(devid)
-
-struct top_bit {
-	struct instdone_bit *bit;
-	int count;
-} top_bits[MAX_NUM_TOP_BITS];
-struct top_bit *top_bits_sorted[MAX_NUM_TOP_BITS];
-
-static uint32_t instdone, instdone1;
-
-static const char *bars[] = {
-	" ",
-	"▏",
-	"▎",
-	"▍",
-	"▌",
-	"▋",
-	"▊",
-	"▉",
-	"█"
-};
+#include <ctype.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <sys/ioctl.h>
+#include <errno.h>
+#include <math.h>
+#include <locale.h>
 
-enum stats_counts {
-	IA_VERTICES,
-	IA_PRIMITIVES,
-	VS_INVOCATION,
-	GS_INVOCATION,
-	GS_PRIMITIVES,
-	CL_INVOCATION,
-	CL_PRIMITIVES,
-	PS_INVOCATION,
-	PS_DEPTH,
-	STATS_COUNT
+#include "igt_perf.h"
+
+struct pmu_pair {
+	uint64_t cur;
+	uint64_t prev;
 };
 
-const uint32_t stats_regs[STATS_COUNT] = {
-	IA_VERTICES_COUNT_QW,
-	IA_PRIMITIVES_COUNT_QW,
-	VS_INVOCATION_COUNT_QW,
-	GS_INVOCATION_COUNT_QW,
-	GS_PRIMITIVES_COUNT_QW,
-	CL_INVOCATION_COUNT_QW,
-	CL_PRIMITIVES_COUNT_QW,
-	PS_INVOCATION_COUNT_QW,
-	PS_DEPTH_COUNT_QW,
+struct pmu_counter {
+	bool present;
+	uint64_t config;
+	unsigned int idx;
+	struct pmu_pair val;
 };
 
-const char *stats_reg_names[STATS_COUNT] = {
-	"vert fetch",
-	"prim fetch",
-	"VS invocations",
-	"GS invocations",
-	"GS prims",
-	"CL invocations",
-	"CL prims",
-	"PS invocations",
-	"PS depth pass",
+struct engine {
+	const char *name;
+	const char *display_name;
+
+	unsigned int class;
+	unsigned int instance;
+
+	unsigned int num_counters;
+
+	struct pmu_counter busy;
+	struct pmu_counter wait;
+	struct pmu_counter sema;
 };
 
-uint64_t stats[STATS_COUNT];
-uint64_t last_stats[STATS_COUNT];
+struct engines {
+	unsigned int num_engines;
+	unsigned int num_counters;
+	DIR *root;
+	int fd;
+	struct pmu_pair ts;
+
+	int rapl_fd;
+	double rapl_scale;
+
+	int imc_fd;
+	double imc_reads_scale;
+	const char *imc_reads_unit;
+	double imc_writes_scale;
+	const char *imc_writes_unit;
+
+	struct pmu_counter freq_req;
+	struct pmu_counter freq_act;
+	struct pmu_counter irq;
+	struct pmu_counter rc6;
+	struct pmu_counter rapl;
+	struct pmu_counter imc_reads;
+	struct pmu_counter imc_writes;
+
+	struct engine engine;
+};
 
-static unsigned long
-gettime(void)
+static uint64_t
+get_pmu_config(int dirfd, const char *name, const char *counter)
 {
-    struct timeval t;
-    gettimeofday(&t, NULL);
-    return (t.tv_usec + (t.tv_sec * 1000000));
-}
+	char buf[128], *p;
+	int fd, ret;
 
-static int
-top_bits_sort(const void *a, const void *b)
-{
-	struct top_bit * const *bit_a = a;
-	struct top_bit * const *bit_b = b;
-	int a_count = (*bit_a)->count;
-	int b_count = (*bit_b)->count;
+	ret = snprintf(buf, sizeof(buf), "%s-%s", name, counter);
+	if (ret < 0 || ret == sizeof(buf))
+		return -1;
 
-	if (a_count < b_count)
-		return 1;
-	else if (a_count == b_count)
-		return 0;
-	else
+	fd = openat(dirfd, buf, O_RDONLY);
+	if (fd < 0)
 		return -1;
-}
 
-static void
-update_idle_bit(struct top_bit *top_bit)
-{
-	uint32_t reg_val;
+	ret = read(fd, buf, sizeof(buf));
+	close(fd);
+	if (ret <= 0)
+		return -1;
 
-	if (top_bit->bit->reg == INSTDONE_1)
-		reg_val = instdone1;
-	else
-		reg_val = instdone;
+	p = index(buf, '0');
+	if (!p)
+		return -1;
 
-	if ((reg_val & top_bit->bit->bit) == 0)
-		top_bit->count++;
+	return strtoul(p, NULL, 0);
 }
 
-static void
-print_clock(const char *name, int clock) {
-	if (clock == -1)
-		printf("%s clock: unknown", name);
+#define engine_ptr(engines, n) (&engines->engine + (n))
+
+static const char *class_display_name(unsigned int class)
+{
+	switch (class) {
+	case I915_ENGINE_CLASS_RENDER:
+		return "Render/3D";
+	case I915_ENGINE_CLASS_COPY:
+		return "Blitter";
+	case I915_ENGINE_CLASS_VIDEO:
+		return "Video";
+	case I915_ENGINE_CLASS_VIDEO_ENHANCE:
+		return "VideoEnhance";
+	default:
+		return "[unknown]";
+	}
+}
+
+static int engine_cmp(const void *__a, const void *__b)
+{
+	const struct engine *a = (struct engine *)__a;
+	const struct engine *b = (struct engine *)__b;
+
+	if (a->class != b->class)
+		return a->class - b->class;
 	else
-		printf("%s clock: %d Mhz", name, clock);
+		return a->instance - b->instance;
 }
 
-static int
-print_clock_info(struct pci_device *pci_dev)
+static struct engines *discover_engines(void)
 {
-	uint32_t devid = pci_dev->device_id;
-	uint16_t gcfgc;
+	const char *sysfs_root = "/sys/devices/i915/events";
+	struct engines *engines;
+	struct dirent *dent;
+	int ret = 0;
+	DIR *d;
 
-	if (IS_GM45(devid)) {
-		int core_clock = -1;
+	engines = malloc(sizeof(struct engines));
+	if (!engines)
+		return NULL;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+	memset(engines, 0, sizeof(*engines));
 
-		switch (gcfgc & 0xf) {
-		case 8:
-			core_clock = 266;
-			break;
-		case 9:
-			core_clock = 320;
-			break;
-		case 11:
-			core_clock = 400;
-			break;
-		case 13:
-			core_clock = 533;
-			break;
-		}
-		print_clock("core", core_clock);
-	} else if (IS_965(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, sampler_clock = -1;
+	engines->num_engines = 0;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+	d = opendir(sysfs_root);
+	if (!d)
+		return NULL;
 
-		switch (gcfgc & 0xf) {
-		case 2:
-			render_clock = 250; sampler_clock = 267;
-			break;
-		case 3:
-			render_clock = 320; sampler_clock = 333;
-			break;
-		case 4:
-			render_clock = 400; sampler_clock = 444;
-			break;
-		case 5:
-			render_clock = 500; sampler_clock = 533;
+	while ((dent = readdir(d)) != NULL) {
+		const char *endswith = "-busy";
+		const unsigned int endlen = strlen(endswith);
+		struct engine *engine =
+				engine_ptr(engines, engines->num_engines);
+		char buf[256];
+
+		if (dent->d_type != DT_REG)
+			continue;
+
+		if (strlen(dent->d_name) >= sizeof(buf)) {
+			ret = -1;
 			break;
 		}
 
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("sampler", sampler_clock);
-	} else if (IS_945(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, display_clock = -1;
+		strcpy(buf, dent->d_name);
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+		/* xxxN-busy */
+		if (strlen(buf) < (endlen + 4))
+			continue;
+		if (strcmp(&buf[strlen(buf) - endlen], endswith))
+			continue;
 
-		switch (gcfgc & 0x7) {
-		case 0:
-			render_clock = 166;
-			break;
-		case 1:
-			render_clock = 200;
-			break;
-		case 3:
-			render_clock = 250;
-			break;
-		case 5:
-			render_clock = 400;
+		memset(engine, 0, sizeof(*engine));
+
+		buf[strlen(buf) - endlen] = 0;
+		engine->name = strdup(buf);
+		if (!engine->name) {
+			ret = -1;
 			break;
 		}
 
-		switch (gcfgc & 0x70) {
-		case 0:
-			display_clock = 200;
-			break;
-		case 4:
-			display_clock = 320;
+		engine->busy.config = get_pmu_config(dirfd(d), engine->name,
+						     "busy");
+		if (engine->busy.config == -1) {
+			ret = -1;
 			break;
 		}
-		if (gcfgc & (1 << 7))
-		    display_clock = 133;
 
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("display", display_clock);
-	} else if (IS_915(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, display_clock = -1;
+		engine->class = (engine->busy.config &
+				 (__I915_PMU_OTHER(0) - 1)) >>
+				I915_PMU_CLASS_SHIFT;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+		engine->instance = (engine->busy.config >>
+				    I915_PMU_SAMPLE_BITS) &
+				    ((1 << I915_PMU_SAMPLE_INSTANCE_BITS) - 1);
 
-		switch (gcfgc & 0x7) {
-		case 0:
-			render_clock = 160;
-			break;
-		case 1:
-			render_clock = 190;
-			break;
-		case 4:
-			render_clock = 333;
+		ret = snprintf(buf, sizeof(buf), "%s/%u",
+			       class_display_name(engine->class),
+			       engine->instance);
+		if (ret < 0 || ret == sizeof(buf)) {
+			ret = -1;
 			break;
 		}
-		if (gcfgc & (1 << 13))
-		    render_clock = 133;
+		ret = 0;
 
-		switch (gcfgc & 0x70) {
-		case 0:
-			display_clock = 190;
+		engine->display_name = strdup(buf);
+		if (!engine->display_name) {
+			ret = -1;
 			break;
-		case 4:
-			display_clock = 333;
+		}
+
+		engines->num_engines++;
+		engines = realloc(engines, sizeof(struct engines) +
+				  engines->num_engines * sizeof(struct engine));
+		if (!engines) {
+			ret = -ENOMEM;
 			break;
 		}
-		if (gcfgc & (1 << 7))
-		    display_clock = 133;
+	}
+
+	if (ret)
+		free(engines);
+	else {
+		qsort(engine_ptr(engines, 0), engines->num_engines,
+		      sizeof(struct engine), engine_cmp);
 
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("display", display_clock);
+		engines->root = d;
 	}
 
+	return ret == 0 ? engines : NULL;
+}
+
+static int
+filename_to_buf(const char *filename, char *buf, unsigned int bufsize)
+{
+	int fd;
+	ssize_t ret;
 
-	printf("\n");
-	return -1;
+	fd = open(filename, O_RDONLY);
+	if (fd < 0)
+		return -1;
+
+	ret = read(fd, buf, bufsize - 1);
+	close(fd);
+	if (ret < 1)
+		return -1;
+
+	buf[ret] = '\0';
+
+	return 0;
 }
 
-#define STATS_LEN (20)
-#define PERCENTAGE_BAR_END	(79 - STATS_LEN)
+static uint64_t filename_to_u64(const char *filename, int base)
+{
+	char buf[64], *b;
 
-static void
-print_percentage_bar(float percent, int cur_line_len)
+	if (filename_to_buf(filename, buf, sizeof(buf)))
+		return 0;
+
+	/*
+	 * Handle both single integer and key=value formats by skipping
+	 * leading non-digits.
+	 */
+	b = buf;
+	while (*b && !isdigit(*b))
+		b++;
+
+	return strtoull(b, NULL, base);
+}
+
+static double filename_to_double(const char *filename)
 {
-	int bar_avail_len = (PERCENTAGE_BAR_END - cur_line_len - 1) * 8;
-	int bar_len = bar_avail_len * (percent + .5) / 100.0;
-	int i;
+	char *oldlocale;
+	char buf[80];
+	double v;
 
-	for (i = bar_len; i >= 8; i -= 8) {
-		printf("%s", bars[8]);
-		cur_line_len++;
+	if (filename_to_buf(filename, buf, sizeof(buf)))
+		return 0;
+
+	oldlocale = setlocale(LC_ALL, "C");
+	v = strtod(buf, NULL);
+	setlocale(LC_ALL, oldlocale);
+
+	return v;
+}
+
+static uint64_t rapl_type_id(void)
+{
+	return filename_to_u64("/sys/devices/power/type", 10);
+}
+
+static uint64_t rapl_gpu_power(void)
+{
+	return filename_to_u64("/sys/devices/power/events/energy-gpu", 0);
+}
+
+static double rapl_gpu_power_scale(void)
+{
+	return filename_to_double("/sys/devices/power/events/energy-gpu.scale");
+}
+
+static uint64_t imc_type_id(void)
+{
+	return filename_to_u64("/sys/devices/uncore_imc/type", 10);
+}
+
+static uint64_t imc_data_reads(void)
+{
+	return filename_to_u64("/sys/devices/uncore_imc/events/data_reads", 0);
+}
+
+static double imc_data_reads_scale(void)
+{
+	return filename_to_double("/sys/devices/uncore_imc/events/data_reads.scale");
+}
+
+static const char *imc_data_reads_unit(void)
+{
+	char buf[32];
+
+	if (filename_to_buf("/sys/devices/uncore_imc/events/data_reads.unit",
+			    buf, sizeof(buf)) == 0)
+		return strdup(buf);
+	else
+		return NULL;
+}
+
+static uint64_t imc_data_writes(void)
+{
+	return filename_to_u64("/sys/devices/uncore_imc/events/data_writes", 0);
+}
+
+static double imc_data_writes_scale(void)
+{
+	return filename_to_double("/sys/devices/uncore_imc/events/data_writes.scale");
+}
+
+static const char *imc_data_writes_unit(void)
+{
+	char buf[32];
+
+	if (filename_to_buf("/sys/devices/uncore_imc/events/data_writes.unit",
+			    buf, sizeof(buf)) == 0)
+		return strdup(buf);
+	else
+		return NULL;
+}
+
+#define _open_pmu(cnt, pmu, fd) \
+({ \
+	int fd__; \
+\
+	fd__ = perf_i915_open_group((pmu)->config, (fd)); \
+	if (fd__ >= 0) { \
+		if ((fd) == -1) \
+			(fd) = fd__; \
+		(pmu)->present = true; \
+		(pmu)->idx = (cnt)++; \
+	} \
+\
+	fd__; \
+})
+
+#define _open_imc(cnt, pmu, fd) \
+({ \
+	int fd__; \
+\
+	fd__ = igt_perf_open_group(imc_type_id(), (pmu)->config, (fd)); \
+	if (fd__ >= 0) { \
+		if ((fd) == -1) \
+			(fd) = fd__; \
+		(pmu)->present = true; \
+		(pmu)->idx = (cnt)++; \
+	} \
+\
+	fd__; \
+})
+
+static int pmu_init(struct engines *engines)
+{
+	unsigned int i;
+	int fd;
+
+	engines->fd = -1;
+	engines->num_counters = 0;
+
+	engines->irq.config = I915_PMU_INTERRUPTS;
+	fd = _open_pmu(engines->num_counters, &engines->irq, engines->fd);
+	if (fd < 0)
+		return -1;
+
+	engines->freq_req.config = I915_PMU_REQUESTED_FREQUENCY;
+	_open_pmu(engines->num_counters, &engines->freq_req, engines->fd);
+
+	engines->freq_act.config = I915_PMU_ACTUAL_FREQUENCY;
+	_open_pmu(engines->num_counters, &engines->freq_act, engines->fd);
+
+	engines->rc6.config = I915_PMU_RC6_RESIDENCY;
+	_open_pmu(engines->num_counters, &engines->rc6, engines->fd);
+
+	for (i = 0; i < engines->num_engines; i++) {
+		struct engine *engine = engine_ptr(engines, i);
+		struct {
+			struct pmu_counter *pmu;
+			const char *counter;
+		} *cnt, counters[] = {
+			{ .pmu = &engine->busy, .counter = "busy" },
+			{ .pmu = &engine->wait, .counter = "wait" },
+			{ .pmu = &engine->sema, .counter = "sema" },
+			{ .pmu = NULL, .counter = NULL },
+		};
+
+		for (cnt = counters; cnt->pmu; cnt++) {
+			if (!cnt->pmu->config)
+				cnt->pmu->config =
+					get_pmu_config(dirfd(engines->root),
+						       engine->name,
+						       cnt->counter);
+			fd = _open_pmu(engines->num_counters, cnt->pmu,
+				       engines->fd);
+			if (fd >= 0)
+				engine->num_counters++;
+		}
 	}
-	if (i) {
-		printf("%s", bars[i]);
-		cur_line_len++;
+
+	engines->rapl_fd = -1;
+	if (rapl_type_id()) {
+		engines->rapl_scale = rapl_gpu_power_scale();
+		if (engines->rapl_scale != NAN)
+			engines->rapl_scale *= 1e3; /* from nano to micro */
+
+		engines->rapl.config = rapl_gpu_power();
+		if (!engines->rapl.config)
+			return -1;
+
+		engines->rapl_fd = igt_perf_open(rapl_type_id(),
+						 engines->rapl.config);
+		if (engines->rapl_fd < 0)
+			return -1;
+
+		engines->rapl.present = true;
 	}
 
-	/* NB: We can't use a field width with utf8 so we manually
-	* guarantee a field with of 45 chars for any bar. */
-	printf("%*s", PERCENTAGE_BAR_END - cur_line_len, "");
-}
+	engines->imc_fd = -1;
+	if (imc_type_id()) {
+		unsigned int num = 0;
 
-struct ring {
-	const char *name;
-	uint32_t mmio;
-	int head, tail, size;
-	uint64_t full;
-	int idle;
-};
+		engines->imc_reads_scale = imc_data_reads_scale();
+		engines->imc_writes_scale = imc_data_writes_scale();
+
+		engines->imc_reads_unit = imc_data_reads_unit();
+		if (!engines->imc_reads_unit)
+			return -1;
 
-static uint32_t ring_read(struct ring *ring, uint32_t reg)
+		engines->imc_writes_unit = imc_data_writes_unit();
+		if (!engines->imc_writes_unit)
+			return -1;
+
+		engines->imc_reads.config = imc_data_reads();
+		if (!engines->imc_reads.config)
+			return -1;
+
+		engines->imc_writes.config = imc_data_writes();
+		if (!engines->imc_writes.config)
+			return -1;
+
+		fd = _open_imc(num, &engines->imc_reads, engines->imc_fd);
+		if (fd < 0)
+			return -1;
+		fd = _open_imc(num, &engines->imc_writes, engines->imc_fd);
+		if (fd < 0)
+			return -1;
+
+		engines->imc_reads.present = true;
+		engines->imc_writes.present = true;
+	}
+
+	return 0;
+}
+
+static uint64_t pmu_read_multi(int fd, unsigned int num, uint64_t *val)
 {
-	return INREG(ring->mmio + reg);
+	uint64_t buf[2 + num];
+	unsigned int i;
+	ssize_t len;
+
+	memset(buf, 0, sizeof(buf));
+
+	len = read(fd, buf, sizeof(buf));
+	assert(len == sizeof(buf));
+
+	for (i = 0; i < num; i++)
+		val[i] = buf[2 + i];
+
+	return buf[1];
 }
 
-static void ring_init(struct ring *ring)
+static double __pmu_calc(struct pmu_pair *p, double d, double t, double s)
 {
-	ring->size = (((ring_read(ring, RING_LEN) & RING_NR_PAGES) >> 12) + 1) * 4096;
+	double v;
+
+	v = p->cur - p->prev;
+	v /= d;
+	v /= t;
+	v *= s;
+
+	if (s == 100.0 && v > 100.0)
+		v = 100.0;
+
+	return v;
 }
 
-static void ring_reset(struct ring *ring)
+static void fill_str(char *buf, unsigned int bufsz, char c, unsigned int num)
 {
-	ring->idle = ring->full = 0;
+	unsigned int i;
+
+	for (i = 0; i < num && i < (bufsz - 1); i++)
+		*buf++ = c;
+
+	*buf = 0;
 }
 
-static void ring_sample(struct ring *ring)
+static void pmu_calc(struct pmu_counter *cnt,
+		     char *buf, unsigned int bufsz,
+		     unsigned int width, unsigned width_dec,
+		     double d, double t, double s)
 {
-	int full;
+	double val;
+	int len;
+
+	assert(bufsz >= (width + width_dec + 1));
 
-	if (!ring->size)
+	if (!cnt->present) {
+		fill_str(buf, bufsz, '-', width + width_dec);
 		return;
+	}
+
+	val = __pmu_calc(&cnt->val, d, t, s);
+
+	len = snprintf(buf, bufsz, "%*.*f", width + width_dec, width_dec, val);
+	if (len < 0 || len == bufsz) {
+		fill_str(buf, bufsz, 'X', width + width_dec);
+		return;
+	}
+}
+
+static uint64_t __pmu_read_single(int fd, uint64_t *ts)
+{
+	uint64_t data[2] = { };
+	ssize_t len;
 
-	ring->head = ring_read(ring, RING_HEAD) & HEAD_ADDR;
-	ring->tail = ring_read(ring, RING_TAIL) & TAIL_ADDR;
+	len = read(fd, data, sizeof(data));
+	assert(len == sizeof(data));
 
-	if (ring->tail == ring->head)
-		ring->idle++;
+	if (ts)
+		*ts = data[1];
 
-	full = ring->tail - ring->head;
-	if (full < 0)
-		full += ring->size;
-	ring->full += full;
+	return data[0];
 }
 
-static void ring_print_header(FILE *out, struct ring *ring)
+static uint64_t pmu_read_single(int fd)
 {
-    fprintf(out, "%.6s%%\tops\t",
-            ring->name
-          );
+	return __pmu_read_single(fd, NULL);
 }
 
-static void ring_print(struct ring *ring, unsigned long samples_per_sec)
+static void __update_sample(struct pmu_counter *counter, uint64_t val)
 {
-	int percent_busy, len;
+	counter->val.prev = counter->val.cur;
+	counter->val.cur = val;
+}
 
-	if (!ring->size)
-		return;
+static void update_sample(struct pmu_counter *counter, uint64_t *val)
+{
+	if (counter->present)
+		__update_sample(counter, val[counter->idx]);
+}
+
+static void pmu_sample(struct engines *engines)
+{
+	const int num_val = engines->num_counters;
+	uint64_t val[2 + num_val];
+	unsigned int i;
 
-	percent_busy = 100 - 100 * ring->idle / samples_per_sec;
+	engines->ts.prev = engines->ts.cur;
 
-	len = printf("%25s busy: %3d%%: ", ring->name, percent_busy);
-	print_percentage_bar (percent_busy, len);
-	printf("%24s space: %d/%d\n",
-		   ring->name,
-		   (int)(ring->full / samples_per_sec),
-		   ring->size);
+	if (engines->rapl_fd >= 0)
+		__update_sample(&engines->rapl,
+				pmu_read_single(engines->rapl_fd));
+
+	if (engines->imc_fd >= 0) {
+		pmu_read_multi(engines->imc_fd, 2, val);
+		update_sample(&engines->imc_reads, val);
+		update_sample(&engines->imc_writes, val);
+	}
+
+	engines->ts.cur = pmu_read_multi(engines->fd, num_val, val);
+
+	update_sample(&engines->freq_req, val);
+	update_sample(&engines->freq_act, val);
+	update_sample(&engines->irq, val);
+	update_sample(&engines->rc6, val);
+
+	for (i = 0; i < engines->num_engines; i++) {
+		struct engine *engine = engine_ptr(engines, i);
+
+		update_sample(&engine->busy, val);
+		update_sample(&engine->sema, val);
+		update_sample(&engine->wait, val);
+	}
 }
 
-static void ring_log(struct ring *ring, unsigned long samples_per_sec,
-		FILE *output)
+static const char *bars[] = { " ", "▏", "▎", "▍", "▌", "▋", "▊", "▉", "█" };
+
+static void
+print_percentage_bar(double percent, int max_len)
 {
-	if (ring->size)
-		fprintf(output, "%3d\t%d\t",
-			(int)(100 - 100 * ring->idle / samples_per_sec),
-			(int)(ring->full / samples_per_sec));
-	else
-		fprintf(output, "-1\t-1\t");
+	int bar_len = percent * (8 * (max_len - 2)) / 100.0;
+	int i;
+
+	putchar('|');
+
+	for (i = bar_len; i >= 8; i -= 8)
+		printf("%s", bars[8]);
+	if (i)
+		printf("%s", bars[i]);
+
+	for (i = 0; i < (max_len - 2 - (bar_len + 7) / 8); i++)
+		putchar(' ');
+
+	putchar('|');
 }
 
+#define DEFAULT_PERIOD_MS (1000)
+
 static void
 usage(const char *appname)
 {
 	printf("intel_gpu_top - Display a top-like summary of Intel GPU usage\n"
-			"\n"
-			"usage: %s [parameters]\n"
-			"\n"
-			"The following parameters apply:\n"
-			"[-s <samples>]       samples per seconds (default %d)\n"
-			"[-e <command>]       command to profile\n"
-			"[-o <file>]          output statistics to file. If file is '-',"
-			"                     run in batch mode and output statistics to stdio only \n"
-			"[-h]                 show this help screen\n"
-			"\n",
-			appname,
-			SAMPLES_PER_SEC
-		  );
-	return;
+		"\n"
+		"Usage: %s [parameters]\n"
+		"\n"
+		"\tThe following parameters are optional:\n"
+		"\t[-s <samples>]       refresh period in ms (default %ums)\n"
+		"\t[-h]                 show this help text\n"
+		"\n",
+		appname, DEFAULT_PERIOD_MS);
 }
 
 int main(int argc, char **argv)
 {
-	uint32_t devid;
-	struct pci_device *pci_dev;
-	struct ring render_ring = {
-		.name = "render",
-		.mmio = 0x2030,
-	}, bsd_ring = {
-		.name = "bitstream",
-		.mmio = 0x4030,
-	}, bsd6_ring = {
-		.name = "bitstream",
-		.mmio = 0x12030,
-	}, blt_ring = {
-		.name = "blitter",
-		.mmio = 0x22030,
-	};
-	int i, ch;
-	int samples_per_sec = SAMPLES_PER_SEC;
-	FILE *output = NULL;
-	double elapsed_time=0;
-	int print_headers=1;
-	pid_t child_pid=-1;
-	int child_stat;
-	char *cmd=NULL;
-	int interactive=1;
-
-	/* Parse options? */
-	while ((ch = getopt(argc, argv, "s:o:e:h")) != -1) {
+	unsigned int period_us = DEFAULT_PERIOD_MS * 1000;
+	int con_w = -1, con_h = -1;
+	struct engines *engines;
+	unsigned int i;
+	int ret, ch;
+
+	/* Parse options */
+	while ((ch = getopt(argc, argv, "s:h")) != -1) {
 		switch (ch) {
-		case 'e': cmd = strdup(optarg);
-			break;
-		case 's': samples_per_sec = atoi(optarg);
-			if (samples_per_sec < 100) {
-				fprintf(stderr, "Error: samples per second must be >= 100\n");
-				exit(1);
-			}
-			break;
-		case 'o':
-			if (!strcmp(optarg, "-")) {
-				/* Running in non-interactive mode */
-				interactive = 0;
-				output = stdout;
-			}
-			else
-				output = fopen(optarg, "w");
-			if (!output)
-			{
-				perror("fopen");
-				exit(1);
-			}
+		case 's':
+			period_us = atoi(optarg) * 1000;
 			break;
 		case 'h':
 			usage(argv[0]);
 			exit(0);
-			break;
 		default:
-			fprintf(stderr, "Invalid flag %c!\n", (char)optopt);
+			fprintf(stderr, "Invalid option %c!\n", (char)optopt);
 			usage(argv[0]);
 			exit(1);
-			break;
 		}
 	}
 
-	pci_dev = intel_get_pci_device();
-	devid = pci_dev->device_id;
-	intel_mmio_use_pci_bar(pci_dev);
-	init_instdone_definitions(devid);
-
-	/* Do we have a command to run? */
-	if (cmd != NULL) {
-		if (output) {
-			fprintf(output, "# Profiling: %s\n", cmd);
-			fflush(output);
-		}
-		child_pid = fork();
-		if (child_pid < 0) {
-			perror("fork");
-			exit(1);
-		}
-		else if (child_pid == 0) {
-			int res;
-			res = system(cmd);
-			if (res < 0)
-				perror("running command");
-			if (output) {
-				fflush(output);
-				fprintf(output, "# %s exited with status %d\n", cmd, res);
-				fflush(output);
-			}
-			free(cmd);
-			exit(0);
-		} else {
-			free(cmd);
-		}
-	}
-
-	for (i = 0; i < num_instdone_bits; i++) {
-		top_bits[i].bit = &instdone_bits[i];
-		top_bits[i].count = 0;
-		top_bits_sorted[i] = &top_bits[i];
+	engines = discover_engines();
+	if (!engines) {
+		fprintf(stderr,
+			"Failed to detect engines! Kernel 4.16 or newer?\n");
+		return 1;
 	}
 
-	/* Grab access to the registers */
-	intel_register_access_init(pci_dev, 0, -1);
-
-	ring_init(&render_ring);
-	if (IS_GEN4(devid) || IS_GEN5(devid))
-		ring_init(&bsd_ring);
-	if (IS_GEN6(devid) || IS_GEN7(devid)) {
-		ring_init(&bsd6_ring);
-		ring_init(&blt_ring);
+	ret = pmu_init(engines);
+	if (ret) {
+		fprintf(stderr,
+			"Failed to initialize PMU! Kernel 4.16 or newer?\n");
+		return 1;
 	}
 
-	/* Initialize GPU stats */
-	if (HAS_STATS_REGS(devid)) {
-		for (i = 0; i < STATS_COUNT; i++) {
-			uint32_t stats_high, stats_low, stats_high_2;
-
-			do {
-				stats_high = INREG(stats_regs[i] + 4);
-				stats_low = INREG(stats_regs[i]);
-				stats_high_2 = INREG(stats_regs[i] + 4);
-			} while (stats_high != stats_high_2);
-
-			last_stats[i] = (uint64_t)stats_high << 32 |
-				stats_low;
-		}
-	}
+	pmu_sample(engines);
 
 	for (;;) {
-		int j;
-		unsigned long long t1, ti, tf, t2;
-		unsigned long long def_sleep = 1000000 / samples_per_sec;
-		unsigned long long last_samples_per_sec = samples_per_sec;
-		unsigned short int max_lines;
+		double t;
+#define BUFSZ 16
+		char freq[BUFSZ];
+		char fact[BUFSZ];
+		char irq[BUFSZ];
+		char rc6[BUFSZ];
+		char power[BUFSZ];
+		char reads[BUFSZ];
+		char writes[BUFSZ];
 		struct winsize ws;
-		char clear_screen[] = {0x1b, '[', 'H',
-				       0x1b, '[', 'J',
-				       0x0};
-		int percent;
-		int len;
-
-		t1 = gettime();
-
-		ring_reset(&render_ring);
-		ring_reset(&bsd_ring);
-		ring_reset(&bsd6_ring);
-		ring_reset(&blt_ring);
-
-		for (i = 0; i < samples_per_sec; i++) {
-			long long interval;
-			ti = gettime();
-			if (IS_965(devid)) {
-				instdone = INREG(INSTDONE_I965);
-				instdone1 = INREG(INSTDONE_1);
-			} else
-				instdone = INREG(INSTDONE);
-
-			for (j = 0; j < num_instdone_bits; j++)
-				update_idle_bit(&top_bits[j]);
-
-			ring_sample(&render_ring);
-			ring_sample(&bsd_ring);
-			ring_sample(&bsd6_ring);
-			ring_sample(&blt_ring);
-
-			tf = gettime();
-			if (tf - t1 >= 1000000) {
-				/* We are out of sync, bail out */
-				last_samples_per_sec = i+1;
-				break;
-			}
-			interval = def_sleep - (tf - ti);
-			if (interval > 0)
-				usleep(interval);
+		int lines = 0;
+
+		/* Update terminal size. */
+		if (ioctl(0, TIOCGWINSZ, &ws) != -1) {
+			con_w = ws.ws_col;
+			con_h = ws.ws_row;
 		}
 
-		if (HAS_STATS_REGS(devid)) {
-			for (i = 0; i < STATS_COUNT; i++) {
-				uint32_t stats_high, stats_low, stats_high_2;
+		pmu_sample(engines);
+		t = (double)(engines->ts.cur - engines->ts.prev) / 1e9;
+
+		printf("\033[H\033[J");
+
+		pmu_calc(&engines->freq_req, freq, BUFSZ, 4, 0, 1.0, t, 1);
+		pmu_calc(&engines->freq_act, fact, BUFSZ, 4, 0, 1.0, t, 1);
+		pmu_calc(&engines->irq, irq, BUFSZ, 8, 0, 1.0, t, 1);
+		pmu_calc(&engines->rc6, rc6, BUFSZ, 3, 0, 1e9, t, 100);
+		pmu_calc(&engines->rapl, power, BUFSZ, 6, 0, 1.0, t,
+			 engines->rapl_scale);
+		pmu_calc(&engines->imc_reads, reads, BUFSZ, 6, 0, 1.0, t,
+			 engines->imc_reads_scale);
+		pmu_calc(&engines->imc_writes, writes, BUFSZ, 6, 0, 1.0, t,
+			 engines->imc_writes_scale);
+
+		printf("intel-gpu-top - %s/%s MHz;  %s%% RC6; %smW; %s irqs/s\n",
+		       fact, freq, rc6, power, irq);
+		lines++;
+
+		printf("\n");
+		lines++;
+
+		printf("%16s %s %s/s\n",
+		       "IMC reads:", reads, engines->imc_reads_unit);
+		lines++;
+
+		printf("%16s %s %s/s\n",
+		       "IMC writes:", writes, engines->imc_writes_unit);
+		lines++;
+
+		printf("\n");
+		lines++;
+
+		for (i = 0; i < engines->num_engines && lines < con_h; i++) {
+			struct engine *engine = engine_ptr(engines, i);
+			unsigned int max_w = con_w - 1;
+			unsigned int len;
+			char sema[BUFSZ];
+			char wait[BUFSZ];
+			char busy[BUFSZ];
+			char buf[128];
+			double val;
+
+			if (!engine->num_counters)
+				continue;
 
-				do {
-					stats_high = INREG(stats_regs[i] + 4);
-					stats_low = INREG(stats_regs[i]);
-					stats_high_2 = INREG(stats_regs[i] + 4);
-				} while (stats_high != stats_high_2);
+			pmu_calc(&engine->sema, sema, BUFSZ, 3, 0, 1e9, t,
+				 100);
+			pmu_calc(&engine->wait, wait, BUFSZ, 3, 0, 1e9, t,
+				 100);
+			len = snprintf(buf, sizeof(buf), "%s%% sema, %s%% wait",
+				       sema, wait);
 
-				stats[i] = (uint64_t)stats_high << 32 |
-					stats_low;
-			}
-		}
+			pmu_calc(&engine->busy, busy, BUFSZ, 6, 2, 1e9, t,
+				 100);
+			len += printf("%16s %s%% ", engine->display_name, busy);
 
-		qsort(top_bits_sorted, num_instdone_bits,
-		      sizeof(struct top_bit *), top_bits_sort);
-
-		/* Limit the number of lines printed to the terminal height so the
-		 * most important info (at the top) will stay on screen. */
-		max_lines = -1;
-		if (ioctl(0, TIOCGWINSZ, &ws) != -1)
-			max_lines = ws.ws_row - 6; /* exclude header lines */
-		if (max_lines >= num_instdone_bits)
-			max_lines = num_instdone_bits;
-
-		t2 = gettime();
-		elapsed_time += (t2 - t1) / 1000000.0;
-
-		if (interactive) {
-			printf("%s", clear_screen);
-			print_clock_info(pci_dev);
-
-			ring_print(&render_ring, last_samples_per_sec);
-			ring_print(&bsd_ring, last_samples_per_sec);
-			ring_print(&bsd6_ring, last_samples_per_sec);
-			ring_print(&blt_ring, last_samples_per_sec);
-
-			printf("\n%30s  %s\n", "task", "percent busy");
-			for (i = 0; i < max_lines; i++) {
-				if (top_bits_sorted[i]->count > 0) {
-					percent = (top_bits_sorted[i]->count * 100) /
-						last_samples_per_sec;
-					len = printf("%30s: %3d%%: ",
-							 top_bits_sorted[i]->bit->name,
-							 percent);
-					print_percentage_bar (percent, len);
-				} else {
-					printf("%*s", PERCENTAGE_BAR_END, "");
-				}
-
-				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-					printf("%13s: %llu (%lld/sec)",
-						   stats_reg_names[i],
-						   (long long)stats[i],
-						   (long long)(stats[i] - last_stats[i]));
-					last_stats[i] = stats[i];
-				} else {
-					if (!top_bits_sorted[i]->count)
-						break;
-				}
-				printf("\n");
-			}
-		}
-		if (output) {
-			/* Print headers for columns at first run */
-			if (print_headers) {
-				fprintf(output, "# time\t");
-				ring_print_header(output, &render_ring);
-				ring_print_header(output, &bsd_ring);
-				ring_print_header(output, &bsd6_ring);
-				ring_print_header(output, &blt_ring);
-				for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
-					if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-						fprintf(output, "%.6s\t",
-							   stats_reg_names[i]
-							   );
-					}
-					if (!top_bits[i].count)
-						continue;
-				}
-				fprintf(output, "\n");
-				print_headers = 0;
-			}
-
-			/* Print statistics */
-			fprintf(output, "%.2f\t", elapsed_time);
-			ring_log(&render_ring, last_samples_per_sec, output);
-			ring_log(&bsd_ring, last_samples_per_sec, output);
-			ring_log(&bsd6_ring, last_samples_per_sec, output);
-			ring_log(&blt_ring, last_samples_per_sec, output);
-
-			for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
-				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-					fprintf(output, "%"PRIu64"\t",
-						   stats[i] - last_stats[i]);
-					last_stats[i] = stats[i];
-				}
-					if (!top_bits[i].count)
-						continue;
-			}
-			fprintf(output, "\n");
-			fflush(output);
-		}
+			val = __pmu_calc(&engine->busy.val, 1e9, t, 100);
+			print_percentage_bar(val, max_w - len);
 
-		for (i = 0; i < num_instdone_bits; i++) {
-			top_bits_sorted[i]->count = 0;
+			printf("%s\n", buf);
 
-			if (i < STATS_COUNT)
-				last_stats[i] = stats[i];
+			lines++;
 		}
 
-		/* Check if child has gone */
-		if (child_pid > 0) {
-			int res;
-			if ((res = waitpid(child_pid, &child_stat, WNOHANG)) == -1) {
-				perror("waitpid");
-				exit(1);
-			}
-			if (res == 0)
-				continue;
-			if (WIFEXITED(child_stat))
-				break;
-		}
-	}
+		printf("\n");
 
-	fclose(output);
+		usleep(period_us);
+	}
 
-	intel_register_access_fini();
 	return 0;
 }
diff --git a/tools/meson.build b/tools/meson.build
index bd2d313d5156..a918eeb0bef1 100644
--- a/tools/meson.build
+++ b/tools/meson.build
@@ -23,7 +23,6 @@ tools_progs = [
 	'intel_gpu_frequency',
 	'intel_firmware_decode',
 	'intel_gpu_time',
-	'intel_gpu_top',
 	'intel_gtt',
 	'intel_guc_logger',
 	'intel_infoframes',
@@ -117,6 +116,11 @@ shared_library('intel_aubdump', 'aubdump.c',
 	       name_prefix : '',
 	       install : true)
 
+executable('intel_gpu_top', 'intel_gpu_top.c',
+	   install : true,
+	   install_rpath : rpathdir,
+	   dependencies : tool_deps + [ lib_igt_perf ])
+
 conf_data = configuration_data()
 conf_data.set('prefix', prefix)
 conf_data.set('exec_prefix', '${prefix}')
-- 
2.14.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [igt-dev] [PATCH i-g-t v2] intel-gpu-top: Rewrite the tool to be safe to use
  2018-03-29 14:30     ` [Intel-gfx] " Eero Tamminen
@ 2018-04-03  9:36       ` Tvrtko Ursulin
  -1 siblings, 0 replies; 57+ messages in thread
From: Tvrtko Ursulin @ 2018-04-03  9:36 UTC (permalink / raw)
  To: Eero Tamminen, Tvrtko Ursulin, igt-dev; +Cc: Intel-gfx


On 29/03/2018 15:30, Eero Tamminen wrote:
> Hi,
> 
> I tested this on HSW GT2, BYT, BDW GT3, SKL GT2 and KBL GT3e,
> with Ubuntu 16.04 and 17.10, using Ubuntu default kernels (4.4 to 4.13)
> and latest drm-tip build (4.16.0-rc7).
> 
> 
> General comments
> ----------------
> 
> This will be used by our customers and people who aren't necessarily
> familiar with i915 internal details.  Therefore it should use
> common terminology in the field and in similar tools, instead of
> I3As (Intel 3-letter Acronyms).
> 
> For example:
>   - rcs -> 3D render
>   - bcs -> blitter
>   - vecs -> video
>   - vcs -> video decode
> etc.

Done. And I am open to bike-shedding of the names and display format for 
instance reporting.

> 
> Old tool showed also GPU system memory interface (GAM) busyness.
> That was valuable info, and reasonably accurate for stable loads.
> 
> Could this tool show also either that information (preferred), or
> bandwidth utilized by GPU/CPU/display?
> 
> (Latest kernels offer GPU memory bandwidth usage through perf
> "uncore_imc" "data_reads" & "data_writes" counters.)

Excellent suggestion and I've added IMC data_reads and data_writes to 
the tool.

> 
> 
> Is "wait" value supposed to be IO-wait for given engine interface?
>
> I never saw that change from 0%, although IO-wait in top jumped
> from 0 to 20-30% with my test GPU load.

No, that is time spent in MI_WAIT_FOR_EVENT. I think it is not used much 
in the current codebase.

> 
> HW specific test results
> ------------------------
> 
> BYT:
> * Reports "Failed to initialize PMU!" although old intel_gpu_top
>    works fine.
> 
> HSW GT2, BDW GT3, SKL GT2 and KBL GT3e seem to work fine except
> for the "wait" value.
> 
> I never saw the blitter engine do anything, but that's because
> modesetting uses just the 3D pipeline, and because I couldn't get
> Intel DDX to work with the rest of the latest git version of X / 3D stack.

Thank you for testing this so thoroughly - this was really invaluable 
since I don't have access to such a number of platforms. I've tried to 
fix all this in the latest version.

> 
> 
> 
> Kernel version support
> ----------------------
> 
> My HW specific testing above was with drm-tip kernel, but I did one test
> also with Ubuntu 16.04 v4.4 kernel (which includes v4.6 or v4.8 i915 
> backport) on KBL.  For that, the tool reported:
> "Failed to detect engines!"
> 
> Although the previous intel_gpu_top works fine with that kernel version.
> 
> Same happens also with Ubuntu 17.04 v4.13 kernel.
> 
> 
> -> If new version needs a certain kernel version, it should tell
>     which version is required.

Yep, at least 4.16 is needed so I have added this info to the error message.

Thanks again for testing it, and when you find the time, if you could do 
it once more with the latest version (on the problematic platforms), that 
would be much appreciated.

Regards,

Tvrtko

> 
> 
>      - Eero
> 
> On 29.03.2018 13:33, Tvrtko Ursulin wrote:
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> intel-gpu-top is a dangerous tool which can hang machines due to unsafe
>> mmio register access. This patch rewrites it to use only PMU.
>>
>> Only overall command streamer busyness and GPU global data such as power
>> and frequencies are included in this new version.
>>
>> For access to more GPU functional unit level data, an OA metric based 
>> tool
>> like gpu-top should be used instead.
>>
>> v2:
>>   * Sort engines by class and instance.
>>   * Do not wait for one sampling period to display something on screen.
>>   * Move code out of the asserts. (Rinat Ibragimov)
>>   * Continuously adapt to terminal size. (Rinat Ibragimov)
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>> Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
>> Cc: Petri Latvala <petri.latvala@intel.com>
>> Cc: Eero Tamminen <eero.t.tamminen@intel.com>
>> Cc: Rinat Ibragimov <ibragimovrinat@mail.ru>
>> Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> # v1
>> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> # v0.5
>> ---
>>   tools/Makefile.am     |    2 +
>>   tools/intel_gpu_top.c | 1009 
>> +++++++++++++++++++++----------------------------
>>   tools/meson.build     |    6 +-
>>   3 files changed, 441 insertions(+), 576 deletions(-)
>>
>> diff --git a/tools/Makefile.am b/tools/Makefile.am
>> index 09b6dbcc3ece..a0b016ddd7ff 100644
>> --- a/tools/Makefile.am
>> +++ b/tools/Makefile.am
>> @@ -28,6 +28,8 @@ intel_aubdump_la_LDFLAGS = -module -avoid-version 
>> -no-undefined
>>   intel_aubdump_la_SOURCES = aubdump.c
>>   intel_aubdump_la_LIBADD = $(top_builddir)/lib/libintel_tools.la -ldl
>> +intel_gpu_top_LDADD = $(top_builddir)/lib/libigt_perf.la
>> +
>>   bin_SCRIPTS = intel_aubdump
>>   CLEANFILES = $(bin_SCRIPTS)
>> diff --git a/tools/intel_gpu_top.c b/tools/intel_gpu_top.c
>> index 098e6ce3ff86..94091d97c4a3 100644
>> --- a/tools/intel_gpu_top.c
>> +++ b/tools/intel_gpu_top.c
>> @@ -1,6 +1,5 @@
>>   /*
>> - * Copyright © 2007 Intel Corporation
>> - * Copyright © 2011 Intel Corporation
>> + * Copyright © 2018 Intel Corporation
>>    *
>>    * Permission is hereby granted, free of charge, to any person 
>> obtaining a
>>    * copy of this software and associated documentation files (the 
>> "Software"),
>> @@ -18,701 +17,561 @@
>>    * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO 
>> EVENT SHALL
>>    * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES 
>> OR OTHER
>>    * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 
>> ARISING
>> - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
>> - * DEALINGS IN THE SOFTWARE.
>> - *
>> - * Authors:
>> - *    Eric Anholt <eric@anholt.net>
>> - *    Eugeni Dodonov <eugeni.dodonov@intel.com>
>> - *
>> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
>> OTHER DEALINGS
>> + * IN THE SOFTWARE.
>>    */
>> -#include "config.h"
>> -
>> -#include <inttypes.h>
>> -#include <unistd.h>
>> -#include <stdlib.h>
>>   #include <stdio.h>
>> -#include <err.h>
>> -#include <sys/ioctl.h>
>> -#include <sys/time.h>
>> -#include <sys/wait.h>
>> +#include <sys/types.h>
>> +#include <dirent.h>
>> +#include <stdint.h>
>> +#include <assert.h>
>>   #include <string.h>
>> -#ifdef HAVE_TERMIOS_H
>> -#include <termios.h>
>> -#endif
>> -#include "intel_io.h"
>> -#include "instdone.h"
>> -#include "intel_reg.h"
>> -#include "intel_chipset.h"
>> -#include "drmtest.h"
>> -
>> -#define  FORCEWAKE        0xA18C
>> -#define  FORCEWAKE_ACK        0x130090
>> -
>> -#define SAMPLES_PER_SEC             10000
>> -#define SAMPLES_TO_PERCENT_RATIO    (SAMPLES_PER_SEC / 100)
>> -
>> -#define MAX_NUM_TOP_BITS            100
>> -
>> -#define HAS_STATS_REGS(devid)        IS_965(devid)
>> -
>> -struct top_bit {
>> -    struct instdone_bit *bit;
>> -    int count;
>> -} top_bits[MAX_NUM_TOP_BITS];
>> -struct top_bit *top_bits_sorted[MAX_NUM_TOP_BITS];
>> -
>> -static uint32_t instdone, instdone1;
>> -
>> -static const char *bars[] = {
>> -    " ",
>> -    "▏",
>> -    "▎",
>> -    "▍",
>> -    "▌",
>> -    "▋",
>> -    "▊",
>> -    "▉",
>> -    "█"
>> -};
>> +#include <ctype.h>
>> +#include <stdlib.h>
>> +#include <unistd.h>
>> +#include <sys/stat.h>
>> +#include <fcntl.h>
>> +#include <inttypes.h>
>> +#include <sys/ioctl.h>
>> +#include <errno.h>
>> +#include <math.h>
>> +#include <locale.h>
>> +
>> +#include "igt_perf.h"
>> -enum stats_counts {
>> -    IA_VERTICES,
>> -    IA_PRIMITIVES,
>> -    VS_INVOCATION,
>> -    GS_INVOCATION,
>> -    GS_PRIMITIVES,
>> -    CL_INVOCATION,
>> -    CL_PRIMITIVES,
>> -    PS_INVOCATION,
>> -    PS_DEPTH,
>> -    STATS_COUNT
>> +struct pmu_pair {
>> +    uint64_t cur;
>> +    uint64_t prev;
>>   };
>> -const uint32_t stats_regs[STATS_COUNT] = {
>> -    IA_VERTICES_COUNT_QW,
>> -    IA_PRIMITIVES_COUNT_QW,
>> -    VS_INVOCATION_COUNT_QW,
>> -    GS_INVOCATION_COUNT_QW,
>> -    GS_PRIMITIVES_COUNT_QW,
>> -    CL_INVOCATION_COUNT_QW,
>> -    CL_PRIMITIVES_COUNT_QW,
>> -    PS_INVOCATION_COUNT_QW,
>> -    PS_DEPTH_COUNT_QW,
>> +struct pmu_counter {
>> +    uint64_t config;
>> +    unsigned int idx;
>> +    struct pmu_pair val;
>>   };
>> -const char *stats_reg_names[STATS_COUNT] = {
>> -    "vert fetch",
>> -    "prim fetch",
>> -    "VS invocations",
>> -    "GS invocations",
>> -    "GS prims",
>> -    "CL invocations",
>> -    "CL prims",
>> -    "PS invocations",
>> -    "PS depth pass",
>> +struct engine {
>> +    const char *name;
>> +    struct pmu_counter busy;
>> +    struct pmu_counter wait;
>> +    struct pmu_counter sema;
>>   };
>> -uint64_t stats[STATS_COUNT];
>> -uint64_t last_stats[STATS_COUNT];
>> +struct engines {
>> +    unsigned int num_engines;
>> +    unsigned int num_counters;
>> +    DIR *root;
>> +    int fd;
>> +    struct pmu_pair ts;
>> -static unsigned long
>> -gettime(void)
>> -{
>> -    struct timeval t;
>> -    gettimeofday(&t, NULL);
>> -    return (t.tv_usec + (t.tv_sec * 1000000));
>> -}
>> +    int rapl_fd;
>> +    double rapl_scale;
>> -static int
>> -top_bits_sort(const void *a, const void *b)
>> +    struct pmu_counter freq_req;
>> +    struct pmu_counter freq_act;
>> +    struct pmu_counter irq;
>> +    struct pmu_counter rc6;
>> +    struct pmu_counter rapl;
>> +
>> +    struct engine engine;
>> +};
>> +
>> +static uint64_t
>> +get_pmu_config(int dirfd, const char *name, const char *counter)
>>   {
>> -    struct top_bit * const *bit_a = a;
>> -    struct top_bit * const *bit_b = b;
>> -    int a_count = (*bit_a)->count;
>> -    int b_count = (*bit_b)->count;
>> +    char buf[128], *p;
>> +    int fd, ret;
>> -    if (a_count < b_count)
>> -        return 1;
>> -    else if (a_count == b_count)
>> -        return 0;
>> -    else
>> +    ret = snprintf(buf, sizeof(buf), "%s-%s", name, counter);
>> +    if (ret < 0 || ret == sizeof(buf))
>>           return -1;
>> -}
>> -static void
>> -update_idle_bit(struct top_bit *top_bit)
>> -{
>> -    uint32_t reg_val;
>> +    fd = openat(dirfd, buf, O_RDONLY);
>> +    if (fd < 0)
>> +        return -1;
>> -    if (top_bit->bit->reg == INSTDONE_1)
>> -        reg_val = instdone1;
>> -    else
>> -        reg_val = instdone;
>> +    ret = read(fd, buf, sizeof(buf));
>> +    close(fd);
>> +    if (ret <= 0)
>> +        return -1;
>> +
>> +    p = index(buf, '0');
>> +    if (!p)
>> +        return -1;
>> -    if ((reg_val & top_bit->bit->bit) == 0)
>> -        top_bit->count++;
>> +    return strtoul(p, NULL, 0);
>>   }
>> -static void
>> -print_clock(const char *name, int clock) {
>> -    if (clock == -1)
>> -        printf("%s clock: unknown", name);
>> +#define engine_ptr(engines, n) \
>> +    ((struct engine *)((unsigned char *)(&engines->engine) + (n) * 
>> sizeof(struct engine)))
>> +
>> +static int engine_cmp(const void *__a, const void *__b)
>> +{
>> +    const struct engine *a = (struct engine *)__a;
>> +    const struct engine *b = (struct engine *)__b;
>> +    int class_a = (a->busy.config & (__I915_PMU_OTHER(0) - 1)) >>
>> +              I915_PMU_CLASS_SHIFT;
>> +    int class_b = (b->busy.config & (__I915_PMU_OTHER(0) - 1)) >>
>> +              I915_PMU_CLASS_SHIFT;
>> +    int instance_a = (a->busy.config >> I915_PMU_SAMPLE_BITS) &
>> +             ((1 << I915_PMU_SAMPLE_INSTANCE_BITS) - 1);
>> +    int instance_b = (b->busy.config >> I915_PMU_SAMPLE_BITS) &
>> +             ((1 << I915_PMU_SAMPLE_INSTANCE_BITS) - 1);
>> +
>> +    if (class_a != class_b)
>> +        return class_a - class_b;
>>       else
>> -        printf("%s clock: %d Mhz", name, clock);
>> +        return instance_a - instance_b;
>>   }
>> -static int
>> -print_clock_info(struct pci_device *pci_dev)
>> +static struct engines *discover_engines(void)
>>   {
>> -    uint32_t devid = pci_dev->device_id;
>> -    uint16_t gcfgc;
>> +    const char *sysfs_root = "/sys/devices/i915/events";
>> +    struct engines *engines;
>> +    struct dirent *dent;
>> +    int ret = 0;
>> +    DIR *d;
>> -    if (IS_GM45(devid)) {
>> -        int core_clock = -1;
>> +    engines = malloc(sizeof(struct engines));
>> +    if (!engines)
>> +        return NULL;
>> -        pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
>> +    memset(engines, 0, sizeof(*engines));
>> -        switch (gcfgc & 0xf) {
>> -        case 8:
>> -            core_clock = 266;
>> -            break;
>> -        case 9:
>> -            core_clock = 320;
>> -            break;
>> -        case 11:
>> -            core_clock = 400;
>> -            break;
>> -        case 13:
>> -            core_clock = 533;
>> -            break;
>> -        }
>> -        print_clock("core", core_clock);
>> -    } else if (IS_965(devid) && IS_MOBILE(devid)) {
>> -        int render_clock = -1, sampler_clock = -1;
>> +    engines->num_engines = 0;
>> -        pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
>> +    d = opendir(sysfs_root);
>> +    if (!d)
>> +        return NULL;
>> -        switch (gcfgc & 0xf) {
>> -        case 2:
>> -            render_clock = 250; sampler_clock = 267;
>> -            break;
>> -        case 3:
>> -            render_clock = 320; sampler_clock = 333;
>> -            break;
>> -        case 4:
>> -            render_clock = 400; sampler_clock = 444;
>> -            break;
>> -        case 5:
>> -            render_clock = 500; sampler_clock = 533;
>> -            break;
>> -        }
>> -
>> -        print_clock("render", render_clock);
>> -        printf("  ");
>> -        print_clock("sampler", sampler_clock);
>> -    } else if (IS_945(devid) && IS_MOBILE(devid)) {
>> -        int render_clock = -1, display_clock = -1;
>> +    while ((dent = readdir(d)) != NULL) {
>> +        const char *endswith = "-busy";
>> +        const unsigned int endlen = strlen(endswith);
>> +        struct engine *engine =
>> +                engine_ptr(engines, engines->num_engines);
>> +        char buf[256];
>> -        pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
>> +        if (dent->d_type != DT_REG)
>> +            continue;
>> -        switch (gcfgc & 0x7) {
>> -        case 0:
>> -            render_clock = 166;
>> -            break;
>> -        case 1:
>> -            render_clock = 200;
>> -            break;
>> -        case 3:
>> -            render_clock = 250;
>> -            break;
>> -        case 5:
>> -            render_clock = 400;
>> +        if (strlen(dent->d_name) >= sizeof(buf)) {
>> +            ret = -1;
>>               break;
>>           }
>> -        switch (gcfgc & 0x70) {
>> -        case 0:
>> -            display_clock = 200;
>> -            break;
>> -        case 4:
>> -            display_clock = 320;
>> -            break;
>> -        }
>> -        if (gcfgc & (1 << 7))
>> -            display_clock = 133;
>> +        strcpy(buf, dent->d_name);
>> -        print_clock("render", render_clock);
>> -        printf("  ");
>> -        print_clock("display", display_clock);
>> -    } else if (IS_915(devid) && IS_MOBILE(devid)) {
>> -        int render_clock = -1, display_clock = -1;
>> +        /* xxxN-busy */
>> +        if (strlen(buf) < (endlen + 4))
>> +            continue;
>> +        if (strcmp(&buf[strlen(buf) - endlen], endswith))
>> +            continue;
>> -        pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
>> +        memset(engine, 0, sizeof(*engine));
>> -        switch (gcfgc & 0x7) {
>> -        case 0:
>> -            render_clock = 160;
>> -            break;
>> -        case 1:
>> -            render_clock = 190;
>> -            break;
>> -        case 4:
>> -            render_clock = 333;
>> +        buf[strlen(buf) - endlen] = 0;
>> +        engine->name = strdup(buf);
>> +        if (!engine->name) {
>> +            ret = -1;
>>               break;
>>           }
>> -        if (gcfgc & (1 << 13))
>> -            render_clock = 133;
>> -        switch (gcfgc & 0x70) {
>> -        case 0:
>> -            display_clock = 190;
>> +        engine->busy.config = get_pmu_config(dirfd(d), engine->name,
>> +                             "busy");
>> +        if (engine->busy.config == -1) {
>> +            ret = -1;
>>               break;
>> -        case 4:
>> -            display_clock = 333;
>> +        }
>> +
>> +        engines->num_engines++;
>> +        engines = realloc(engines, sizeof(struct engines) +
>> +                  engines->num_engines * sizeof(struct engine));
>> +        if (!engines) {
>> +            ret = -ENOMEM;
>>               break;
>>           }
>> -        if (gcfgc & (1 << 7))
>> -            display_clock = 133;
>> +    }
>> +
>> +    if (ret)
>> +        free(engines);
>> +    else {
>> +        qsort(engine_ptr(engines, 0), engines->num_engines,
>> +              sizeof(struct engine), engine_cmp);
>> -        print_clock("render", render_clock);
>> -        printf("  ");
>> -        print_clock("display", display_clock);
>> +        engines->root = d;
>>       }
>> +    return ret == 0 ? engines : NULL;
>> +}
>> +
>> +static int
>> +filename_to_buf(const char *filename, char *buf, unsigned int bufsize)
>> +{
>> +    int fd;
>> +    ssize_t ret;
>> +
>> +    fd = open(filename, O_RDONLY);
>> +    if (fd < 0)
>> +        return -1;
>> +
>> +    ret = read(fd, buf, bufsize - 1);
>> +    close(fd);
>> +    if (ret < 1)
>> +        return -1;
>> +
>> +    buf[ret] = '\0';
>> -    printf("\n");
>> -    return -1;
>> +    return 0;
>>   }
>> -#define STATS_LEN (20)
>> -#define PERCENTAGE_BAR_END    (79 - STATS_LEN)
>> +static uint64_t filename_to_u64(const char *filename, int base)
>> +{
>> +    char buf[64], *b;
>> -static void
>> -print_percentage_bar(float percent, int cur_line_len)
>> +    if (filename_to_buf(filename, buf, sizeof(buf)))
>> +        return 0;
>> +
>> +    /*
>> +     * Handle both single integer and key=value formats by skipping
>> +     * leading non-digits.
>> +     */
>> +    b = buf;
>> +    while (*b && !isdigit(*b))
>> +        b++;
>> +
>> +    return strtoull(b, NULL, base);
>> +}
>> +
>> +static uint64_t rapl_type_id(void)
>>   {
>> -    int bar_avail_len = (PERCENTAGE_BAR_END - cur_line_len - 1) * 8;
>> -    int bar_len = bar_avail_len * (percent + .5) / 100.0;
>> -    int i;
>> +    return filename_to_u64("/sys/devices/power/type", 10);
>> +}
>> -    for (i = bar_len; i >= 8; i -= 8) {
>> -        printf("%s", bars[8]);
>> -        cur_line_len++;
>> -    }
>> -    if (i) {
>> -        printf("%s", bars[i]);
>> -        cur_line_len++;
>> -    }
>> +static uint64_t rapl_gpu_power(void)
>> +{
>> +    return filename_to_u64("/sys/devices/power/events/energy-gpu", 0);
>> +}
>> +
>> +static double filename_to_double(const char *filename)
>> +{
>> +    char *oldlocale;
>> +    char buf[80];
>> +    double v;
>> +
>> +    if (filename_to_buf(filename, buf, sizeof(buf)))
>> +        return 0;
>> -    /* NB: We can't use a field width with utf8 so we manually
>> -    * guarantee a field with of 45 chars for any bar. */
>> -    printf("%*s", PERCENTAGE_BAR_END - cur_line_len, "");
>> +    oldlocale = setlocale(LC_ALL, "C");
>> +    v = strtod(buf, NULL);
>> +    setlocale(LC_ALL, oldlocale);
>> +
>> +    return v;
>>   }
>> -struct ring {
>> -    const char *name;
>> -    uint32_t mmio;
>> -    int head, tail, size;
>> -    uint64_t full;
>> -    int idle;
>> -};
>> +static double rapl_gpu_power_scale(void)
>> +{
>> +    return 
>> filename_to_double("/sys/devices/power/events/energy-gpu.scale");
>> +}
>> -static uint32_t ring_read(struct ring *ring, uint32_t reg)
>> +#define __open_pmu(engines, pmu, idx) \
>> +({ \
>> +    int fd__; \
>> +\
>> +    fd__ = perf_i915_open_group((pmu)->config, (engines)->fd); \
>> +    if (fd__ >= 0) { \
>> +        if ((engines)->fd == -1) \
>> +            (engines)->fd = fd__; \
>> +        (pmu)->idx = (idx)++; \
>> +        (engines)->num_counters++; \
>> +    } \
>> +\
>> +    fd__; \
>> +})
>> +
>> +static int pmu_init(struct engines *engines)
>>   {
>> -    return INREG(ring->mmio + reg);
>> +    unsigned int idx = 0;
>> +    unsigned int i;
>> +    int fd;
>> +
>> +    engines->fd = -1;
>> +    engines->num_counters = 0;
>> +
>> +    engines->freq_req.config = I915_PMU_REQUESTED_FREQUENCY;
>> +    fd = __open_pmu(engines, &engines->freq_req, idx);
>> +    if (fd < 0)
>> +        return -1;
>> +
>> +    engines->freq_act.config = I915_PMU_ACTUAL_FREQUENCY;
>> +    fd = __open_pmu(engines, &engines->freq_act, idx);
>> +    if (fd < 0)
>> +        return -1;
>> +
>> +    engines->irq.config = I915_PMU_INTERRUPTS;
>> +    fd = __open_pmu(engines, &engines->irq, idx);
>> +    if (fd < 0)
>> +        return -1;
>> +
>> +    engines->rc6.config = I915_PMU_RC6_RESIDENCY;
>> +    fd = __open_pmu(engines, &engines->rc6, idx);
>> +    if (fd < 0)
>> +        return -1;
>> +
>> +    for (i = 0; i < engines->num_engines; i++) {
>> +        struct engine *engine = engine_ptr(engines, i);
>> +        struct {
>> +            struct pmu_counter *pmu;
>> +            const char *counter;
>> +        } *cnt, counters[] = {
>> +            { .pmu = &engine->busy, .counter = "busy" },
>> +            { .pmu = &engine->wait, .counter = "wait" },
>> +            { .pmu = &engine->sema, .counter = "sema" },
>> +            { .pmu = NULL, .counter = NULL },
>> +        };
>> +
>> +        for (cnt = counters; cnt->pmu; cnt++) {
>> +            if (!cnt->pmu->config)
>> +                cnt->pmu->config =
>> +                    get_pmu_config(dirfd(engines->root),
>> +                               engine->name,
>> +                               cnt->counter);
>> +            fd = __open_pmu(engines, cnt->pmu, idx);
>> +            if (fd < 0)
>> +                return -1;
>> +        }
>> +    }
>> +
>> +    engines->rapl_scale = rapl_gpu_power_scale();
>> +    if (engines->rapl_scale != NAN)
>> +        engines->rapl_scale *= 1e3; /* from nano to micro */
>> +    engines->rapl.config = rapl_gpu_power();
>> +    engines->rapl_fd = igt_perf_open(rapl_type_id(), 
>> engines->rapl.config);
>> +    if (engines->rapl_fd < 0)
>> +        return -1;
>> +
>> +    return 0;
>>   }
>> -static void ring_init(struct ring *ring)
>> +static uint64_t pmu_read_multi(int fd, unsigned int num, uint64_t *val)
>>   {
>> -    ring->size = (((ring_read(ring, RING_LEN) & RING_NR_PAGES) >> 12) 
>> + 1) * 4096;
>> +    uint64_t buf[2 + num];
>> +    unsigned int i;
>> +    ssize_t len;
>> +
>> +    memset(buf, 0, sizeof(buf));
>> +
>> +    len = read(fd, buf, sizeof(buf));
>> +    assert(len == sizeof(buf));
>> +
>> +    for (i = 0; i < num; i++)
>> +        val[i] = buf[2 + i];
>> +
>> +    return buf[1];
>>   }
>> -static void ring_reset(struct ring *ring)
>> +static double pmu_calc(struct pmu_pair *p, double d, double t, double s)
>>   {
>> -    ring->idle = ring->full = 0;
>> +    double pct;
>> +
>> +    pct = p->cur - p->prev;
>> +    pct /= d;
>> +    pct /= t;
>> +    pct *= s;
>> +
>> +    if (s == 100.0 && pct > 100.0)
>> +        pct = 100.0;
>> +
>> +    return pct;
>>   }
>> -static void ring_sample(struct ring *ring)
>> +static uint64_t __pmu_read_single(int fd, uint64_t *ts)
>>   {
>> -    int full;
>> +    uint64_t data[2] = { };
>> +    ssize_t len;
>> -    if (!ring->size)
>> -        return;
>> +    len = read(fd, data, sizeof(data));
>> +    assert(len == sizeof(data));
>> -    ring->head = ring_read(ring, RING_HEAD) & HEAD_ADDR;
>> -    ring->tail = ring_read(ring, RING_TAIL) & TAIL_ADDR;
>> +    if (ts)
>> +        *ts = data[1];
>> +
>> +    return data[0];
>> +}
>> -    if (ring->tail == ring->head)
>> -        ring->idle++;
>> +static uint64_t pmu_read_single(int fd)
>> +{
>> +    return __pmu_read_single(fd, NULL);
>> +}
>> -    full = ring->tail - ring->head;
>> -    if (full < 0)
>> -        full += ring->size;
>> -    ring->full += full;
>> +static void __update_sample(struct pmu_counter *counter, uint64_t val)
>> +{
>> +    counter->val.prev = counter->val.cur;
>> +    counter->val.cur = val;
>>   }
>> -static void ring_print_header(FILE *out, struct ring *ring)
>> +static void update_sample(struct pmu_counter *counter, uint64_t *val)
>>   {
>> -    fprintf(out, "%.6s%%\tops\t",
>> -            ring->name
>> -          );
>> +    __update_sample(counter, val[counter->idx]);
>>   }
>> -static void ring_print(struct ring *ring, unsigned long samples_per_sec)
>> +static void pmu_sample(struct engines *engines)
>>   {
>> -    int percent_busy, len;
>> +    const int num_val = engines->num_counters;
>> +    uint64_t val[num_val];
>> +    unsigned int i;
>> +
>> +    engines->ts.prev = engines->ts.cur;
>> +    engines->ts.cur = pmu_read_multi(engines->fd, num_val, val);
>> +
>> +    __update_sample(&engines->rapl, pmu_read_single(engines->rapl_fd));
>> -    if (!ring->size)
>> -        return;
>> +    update_sample(&engines->freq_req, val);
>> +    update_sample(&engines->freq_act, val);
>> +    update_sample(&engines->irq, val);
>> +    update_sample(&engines->rc6, val);
>> -    percent_busy = 100 - 100 * ring->idle / samples_per_sec;
>> +    for (i = 0; i < engines->num_engines; i++) {
>> +        struct engine *engine = engine_ptr(engines, i);
>> -    len = printf("%25s busy: %3d%%: ", ring->name, percent_busy);
>> -    print_percentage_bar (percent_busy, len);
>> -    printf("%24s space: %d/%d\n",
>> -           ring->name,
>> -           (int)(ring->full / samples_per_sec),
>> -           ring->size);
>> +        update_sample(&engine->busy, val);
>> +        update_sample(&engine->sema, val);
>> +        update_sample(&engine->wait, val);
>> +    }
>>   }
>> -static void ring_log(struct ring *ring, unsigned long samples_per_sec,
>> -        FILE *output)
>> +static const char *bars[] = { " ", "▏", "▎", "▍", "▌", "▋", "▊", "▉", 
>> "█" };
>> +
>> +static void
>> +print_percentage_bar(double percent, int max_len)
>>   {
>> -    if (ring->size)
>> -        fprintf(output, "%3d\t%d\t",
>> -            (int)(100 - 100 * ring->idle / samples_per_sec),
>> -            (int)(ring->full / samples_per_sec));
>> -    else
>> -        fprintf(output, "-1\t-1\t");
>> +    int bar_len = percent * (8 * (max_len - 2)) / 100.0;
>> +    int i;
>> +
>> +    putchar('|');
>> +
>> +    for (i = bar_len; i >= 8; i -= 8)
>> +        printf("%s", bars[8]);
>> +    if (i)
>> +        printf("%s", bars[i]);
>> +
>> +    for (i = 0; i < (max_len - 2 - (bar_len + 7) / 8); i++)
>> +        putchar(' ');
>> +
>> +    putchar('|');
>>   }
>> +#define DEFAULT_PERIOD_MS (1000)
>> +
>>   static void
>>   usage(const char *appname)
>>   {
>>       printf("intel_gpu_top - Display a top-like summary of Intel GPU 
>> usage\n"
>> -            "\n"
>> -            "usage: %s [parameters]\n"
>> -            "\n"
>> -            "The following parameters apply:\n"
>> -            "[-s <samples>]       samples per seconds (default %d)\n"
>> -            "[-e <command>]       command to profile\n"
>> -            "[-o <file>]          output statistics to file. If file 
>> is '-',"
>> -            "                     run in batch mode and output 
>> statistics to stdio only \n"
>> -            "[-h]                 show this help screen\n"
>> -            "\n",
>> -            appname,
>> -            SAMPLES_PER_SEC
>> -          );
>> -    return;
>> +        "\n"
>> +        "Usage: %s [parameters]\n"
>> +        "\n"
>> +        "\tThe following parameters are optional:\n"
>> +        "\t[-s <samples>]       refresh period in ms (default %ums)\n"
>> +        "\t[-h]                 show this help text\n"
>> +        "\n",
>> +        appname, DEFAULT_PERIOD_MS);
>>   }
>>   int main(int argc, char **argv)
>>   {
>> -    uint32_t devid;
>> -    struct pci_device *pci_dev;
>> -    struct ring render_ring = {
>> -        .name = "render",
>> -        .mmio = 0x2030,
>> -    }, bsd_ring = {
>> -        .name = "bitstream",
>> -        .mmio = 0x4030,
>> -    }, bsd6_ring = {
>> -        .name = "bitstream",
>> -        .mmio = 0x12030,
>> -    }, blt_ring = {
>> -        .name = "blitter",
>> -        .mmio = 0x22030,
>> -    };
>> -    int i, ch;
>> -    int samples_per_sec = SAMPLES_PER_SEC;
>> -    FILE *output = NULL;
>> -    double elapsed_time=0;
>> -    int print_headers=1;
>> -    pid_t child_pid=-1;
>> -    int child_stat;
>> -    char *cmd=NULL;
>> -    int interactive=1;
>> -
>> -    /* Parse options? */
>> -    while ((ch = getopt(argc, argv, "s:o:e:h")) != -1) {
>> +    unsigned int period_us = DEFAULT_PERIOD_MS * 1000;
>> +    int con_w = -1, con_h = -1;
>> +    struct engines *engines;
>> +    unsigned int i;
>> +    int ret, ch;
>> +
>> +    /* Parse options */
>> +    while ((ch = getopt(argc, argv, "s:h")) != -1) {
>>           switch (ch) {
>> -        case 'e': cmd = strdup(optarg);
>> -            break;
>> -        case 's': samples_per_sec = atoi(optarg);
>> -            if (samples_per_sec < 100) {
>> -                fprintf(stderr, "Error: samples per second must be >= 
>> 100\n");
>> -                exit(1);
>> -            }
>> -            break;
>> -        case 'o':
>> -            if (!strcmp(optarg, "-")) {
>> -                /* Running in non-interactive mode */
>> -                interactive = 0;
>> -                output = stdout;
>> -            }
>> -            else
>> -                output = fopen(optarg, "w");
>> -            if (!output)
>> -            {
>> -                perror("fopen");
>> -                exit(1);
>> -            }
>> +        case 's':
>> +            period_us = atoi(optarg) * 1000;
>>               break;
>>           case 'h':
>>               usage(argv[0]);
>>               exit(0);
>> -            break;
>>           default:
>> -            fprintf(stderr, "Invalid flag %c!\n", (char)optopt);
>> +            fprintf(stderr, "Invalid option %c!\n", (char)optopt);
>>               usage(argv[0]);
>>               exit(1);
>> -            break;
>>           }
>>       }
>> -    pci_dev = intel_get_pci_device();
>> -    devid = pci_dev->device_id;
>> -    intel_mmio_use_pci_bar(pci_dev);
>> -    init_instdone_definitions(devid);
>> -
>> -    /* Do we have a command to run? */
>> -    if (cmd != NULL) {
>> -        if (output) {
>> -            fprintf(output, "# Profiling: %s\n", cmd);
>> -            fflush(output);
>> -        }
>> -        child_pid = fork();
>> -        if (child_pid < 0) {
>> -            perror("fork");
>> -            exit(1);
>> -        }
>> -        else if (child_pid == 0) {
>> -            int res;
>> -            res = system(cmd);
>> -            if (res < 0)
>> -                perror("running command");
>> -            if (output) {
>> -                fflush(output);
>> -                fprintf(output, "# %s exited with status %d\n", cmd, 
>> res);
>> -                fflush(output);
>> -            }
>> -            free(cmd);
>> -            exit(0);
>> -        } else {
>> -            free(cmd);
>> -        }
>> +    engines = discover_engines();
>> +    if (!engines) {
>> +        fprintf(stderr, "Failed to detect engines!\n");
>> +        return 1;
>>       }
>> -    for (i = 0; i < num_instdone_bits; i++) {
>> -        top_bits[i].bit = &instdone_bits[i];
>> -        top_bits[i].count = 0;
>> -        top_bits_sorted[i] = &top_bits[i];
>> +    ret = pmu_init(engines);
>> +    if (ret) {
>> +        fprintf(stderr, "Failed to initialize PMU!\n");
>> +        return 1;
>>       }
>> -    /* Grab access to the registers */
>> -    intel_register_access_init(pci_dev, 0, -1);
>> +    pmu_sample(engines);
>> -    ring_init(&render_ring);
>> -    if (IS_GEN4(devid) || IS_GEN5(devid))
>> -        ring_init(&bsd_ring);
>> -    if (IS_GEN6(devid) || IS_GEN7(devid)) {
>> -        ring_init(&bsd6_ring);
>> -        ring_init(&blt_ring);
>> -    }
>> +    for (;;) {
>> +        double t, freq[2], irq, rc6, power;
>> +        struct winsize ws;
>> +        int lines = 0;
>> -    /* Initialize GPU stats */
>> -    if (HAS_STATS_REGS(devid)) {
>> -        for (i = 0; i < STATS_COUNT; i++) {
>> -            uint32_t stats_high, stats_low, stats_high_2;
>> +        /* Update terminal size. */
>> +        if (ioctl(0, TIOCGWINSZ, &ws) != -1) {
>> +            con_w = ws.ws_col;
>> +            con_h = ws.ws_row;
>> +        }
>> -            do {
>> -                stats_high = INREG(stats_regs[i] + 4);
>> -                stats_low = INREG(stats_regs[i]);
>> -                stats_high_2 = INREG(stats_regs[i] + 4);
>> -            } while (stats_high != stats_high_2);
>> +        pmu_sample(engines);
>> +        t = (double)(engines->ts.cur - engines->ts.prev) / 1e9;
>> -            last_stats[i] = (uint64_t)stats_high << 32 |
>> -                stats_low;
>> -        }
>> -    }
>> +        printf("\033[H\033[J");
>> -    for (;;) {
>> -        int j;
>> -        unsigned long long t1, ti, tf, t2;
>> -        unsigned long long def_sleep = 1000000 / samples_per_sec;
>> -        unsigned long long last_samples_per_sec = samples_per_sec;
>> -        unsigned short int max_lines;
>> -        struct winsize ws;
>> -        char clear_screen[] = {0x1b, '[', 'H',
>> -                       0x1b, '[', 'J',
>> -                       0x0};
>> -        int percent;
>> -        int len;
>> -
>> -        t1 = gettime();
>> -
>> -        ring_reset(&render_ring);
>> -        ring_reset(&bsd_ring);
>> -        ring_reset(&bsd6_ring);
>> -        ring_reset(&blt_ring);
>> -
>> -        for (i = 0; i < samples_per_sec; i++) {
>> -            long long interval;
>> -            ti = gettime();
>> -            if (IS_965(devid)) {
>> -                instdone = INREG(INSTDONE_I965);
>> -                instdone1 = INREG(INSTDONE_1);
>> -            } else
>> -                instdone = INREG(INSTDONE);
>> -
>> -            for (j = 0; j < num_instdone_bits; j++)
>> -                update_idle_bit(&top_bits[j]);
>> -
>> -            ring_sample(&render_ring);
>> -            ring_sample(&bsd_ring);
>> -            ring_sample(&bsd6_ring);
>> -            ring_sample(&blt_ring);
>> -
>> -            tf = gettime();
>> -            if (tf - t1 >= 1000000) {
>> -                /* We are out of sync, bail out */
>> -                last_samples_per_sec = i+1;
>> -                break;
>> -            }
>> -            interval = def_sleep - (tf - ti);
>> -            if (interval > 0)
>> -                usleep(interval);
>> -        }
>> +        freq[0] = pmu_calc(&engines->freq_req.val, 1.0, t, 1);
>> +        freq[1] = pmu_calc(&engines->freq_act.val, 1.0, t, 1);
>> +        irq = pmu_calc(&engines->irq.val, 1.0, t, 1);
>> +        rc6 = pmu_calc(&engines->rc6.val, 1e9, t, 100);
>> +        power = pmu_calc(&engines->rapl.val, 1.0, t,
>> +                 engines->rapl_scale);
>> -        if (HAS_STATS_REGS(devid)) {
>> -            for (i = 0; i < STATS_COUNT; i++) {
>> -                uint32_t stats_high, stats_low, stats_high_2;
>> +        printf("intel-gpu-top - %4.0f/%4.0f MHz;  %3.0f%% RC6; 
>> %6.0fmW; %8.0f irqs/s\n",
>> +               freq[0], freq[1], rc6, power, irq);
>> +        lines++;
>> -                do {
>> -                    stats_high = INREG(stats_regs[i] + 4);
>> -                    stats_low = INREG(stats_regs[i]);
>> -                    stats_high_2 = INREG(stats_regs[i] + 4);
>> -                } while (stats_high != stats_high_2);
>> +        printf("\n");
>> +        lines++;
>> -                stats[i] = (uint64_t)stats_high << 32 |
>> -                    stats_low;
>> -            }
>> -        }
>> +        for (i = 0; i < engines->num_engines && lines < con_h; i++) {
>> +            struct engine *engine = engine_ptr(engines, i);
>> +            unsigned int max_w = con_w - 1;
>> +            unsigned int len;
>> +            double val[2];
>> +            char buf[128];
>> -        qsort(top_bits_sorted, num_instdone_bits,
>> -              sizeof(struct top_bit *), top_bits_sort);
>> -
>> -        /* Limit the number of lines printed to the terminal height 
>> so the
>> -         * most important info (at the top) will stay on screen. */
>> -        max_lines = -1;
>> -        if (ioctl(0, TIOCGWINSZ, &ws) != -1)
>> -            max_lines = ws.ws_row - 6; /* exclude header lines */
>> -        if (max_lines >= num_instdone_bits)
>> -            max_lines = num_instdone_bits;
>> -
>> -        t2 = gettime();
>> -        elapsed_time += (t2 - t1) / 1000000.0;
>> -
>> -        if (interactive) {
>> -            printf("%s", clear_screen);
>> -            print_clock_info(pci_dev);
>> -
>> -            ring_print(&render_ring, last_samples_per_sec);
>> -            ring_print(&bsd_ring, last_samples_per_sec);
>> -            ring_print(&bsd6_ring, last_samples_per_sec);
>> -            ring_print(&blt_ring, last_samples_per_sec);
>> -
>> -            printf("\n%30s  %s\n", "task", "percent busy");
>> -            for (i = 0; i < max_lines; i++) {
>> -                if (top_bits_sorted[i]->count > 0) {
>> -                    percent = (top_bits_sorted[i]->count * 100) /
>> -                        last_samples_per_sec;
>> -                    len = printf("%30s: %3d%%: ",
>> -                             top_bits_sorted[i]->bit->name,
>> -                             percent);
>> -                    print_percentage_bar (percent, len);
>> -                } else {
>> -                    printf("%*s", PERCENTAGE_BAR_END, "");
>> -                }
>> -
>> -                if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
>> -                    printf("%13s: %llu (%lld/sec)",
>> -                           stats_reg_names[i],
>> -                           (long long)stats[i],
>> -                           (long long)(stats[i] - last_stats[i]));
>> -                    last_stats[i] = stats[i];
>> -                } else {
>> -                    if (!top_bits_sorted[i]->count)
>> -                        break;
>> -                }
>> -                printf("\n");
>> -            }
>> -        }
>> -        if (output) {
>> -            /* Print headers for columns at first run */
>> -            if (print_headers) {
>> -                fprintf(output, "# time\t");
>> -                ring_print_header(output, &render_ring);
>> -                ring_print_header(output, &bsd_ring);
>> -                ring_print_header(output, &bsd6_ring);
>> -                ring_print_header(output, &blt_ring);
>> -                for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
>> -                    if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
>> -                        fprintf(output, "%.6s\t",
>> -                               stats_reg_names[i]
>> -                               );
>> -                    }
>> -                    if (!top_bits[i].count)
>> -                        continue;
>> -                }
>> -                fprintf(output, "\n");
>> -                print_headers = 0;
>> -            }
>> -
>> -            /* Print statistics */
>> -            fprintf(output, "%.2f\t", elapsed_time);
>> -            ring_log(&render_ring, last_samples_per_sec, output);
>> -            ring_log(&bsd_ring, last_samples_per_sec, output);
>> -            ring_log(&bsd6_ring, last_samples_per_sec, output);
>> -            ring_log(&blt_ring, last_samples_per_sec, output);
>> -
>> -            for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
>> -                if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
>> -                    fprintf(output, "%"PRIu64"\t",
>> -                           stats[i] - last_stats[i]);
>> -                    last_stats[i] = stats[i];
>> -                }
>> -                    if (!top_bits[i].count)
>> -                        continue;
>> -            }
>> -            fprintf(output, "\n");
>> -            fflush(output);
>> -        }
>> +            val[0] = pmu_calc(&engine->wait.val, 1e9, t, 100);
>> +            val[1] = pmu_calc(&engine->sema.val, 1e9, t, 100);
>> +            len = snprintf(buf, sizeof(buf),
>> +                       "%6.2f%% wait, %6.2f%% sema",
>> +                       val[0], val[1]);
>> -        for (i = 0; i < num_instdone_bits; i++) {
>> -            top_bits_sorted[i]->count = 0;
>> +            val[0] = pmu_calc(&engine->busy.val, 1e9, t, 100);
>> +            len += printf("%8s %6.2f%% ",
>> +                      engine->name, val[0]);
>> +            print_percentage_bar(val[0], max_w - len);
>> -            if (i < STATS_COUNT)
>> -                last_stats[i] = stats[i];
>> -        }
>> +            printf("%s\n", buf);
>> -        /* Check if child has gone */
>> -        if (child_pid > 0) {
>> -            int res;
>> -            if ((res = waitpid(child_pid, &child_stat, WNOHANG)) == 
>> -1) {
>> -                perror("waitpid");
>> -                exit(1);
>> -            }
>> -            if (res == 0)
>> -                continue;
>> -            if (WIFEXITED(child_stat))
>> -                break;
>> +            lines++;
>>           }
>> -    }
>> -    fclose(output);
>> +        printf("\n");
>> +
>> +        usleep(period_us);
>> +    }
>> -    intel_register_access_fini();
>>       return 0;
>>   }
>> diff --git a/tools/meson.build b/tools/meson.build
>> index bd2d313d5156..a918eeb0bef1 100644
>> --- a/tools/meson.build
>> +++ b/tools/meson.build
>> @@ -23,7 +23,6 @@ tools_progs = [
>>       'intel_gpu_frequency',
>>       'intel_firmware_decode',
>>       'intel_gpu_time',
>> -    'intel_gpu_top',
>>       'intel_gtt',
>>       'intel_guc_logger',
>>       'intel_infoframes',
>> @@ -117,6 +116,11 @@ shared_library('intel_aubdump', 'aubdump.c',
>>              name_prefix : '',
>>              install : true)
>> +executable('intel_gpu_top', 'intel_gpu_top.c',
>> +       install : true,
>> +       install_rpath : rpathdir,
>> +       dependencies : tool_deps + [ lib_igt_perf ])
>> +
>>   conf_data = configuration_data()
>>   conf_data.set('prefix', prefix)
>>   conf_data.set('exec_prefix', '${prefix}')
>>
> 
> _______________________________________________
> igt-dev mailing list
> igt-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/igt-dev
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Intel-gfx] [igt-dev] [PATCH i-g-t v2] intel-gpu-top: Rewrite the tool to be safe to use
@ 2018-04-03  9:36       ` Tvrtko Ursulin
  0 siblings, 0 replies; 57+ messages in thread
From: Tvrtko Ursulin @ 2018-04-03  9:36 UTC (permalink / raw)
  To: Eero Tamminen, Tvrtko Ursulin, igt-dev; +Cc: Intel-gfx


On 29/03/2018 15:30, Eero Tamminen wrote:
> Hi,
> 
> I tested this on HSW GT2, BYT, BDW GT3, SKL GT2 and KBL GT3e,
> with Ubuntu 16.04 and 17.10, using Ubuntu default kernels (4.4 to 4.13)
> and latest drm-tip build (4.16.0-rc7).
> 
> 
> General comments
> ----------------
> 
> This will be used by our customers and people who aren't necessarily
> familiar with i915 internal details.  Therefore it should use
> common terminology in the field and in similar tools, instead of
> I3As (Intel 3-letter Acronyms).
> 
> For example:
>   - rcs -> 3D render
>   - bcs -> blitter
>   - vecs -> video
>   - vcs -> video decode
> etc.

Done. And I am open to bike-shedding of the names and display format for 
instance reporting.

> 
> Old tool showed also GPU system memory interface (GAM) busyness.
> That was valuable info, and reasonably accurate for stable loads.
> 
> Could this tool show also either that information (preferred), or
> bandwidth utilized by GPU/CPU/display?
> 
> (Latest kernels offer GPU memory bandwidth usage through perf
> "uncore_imc" "data_reads" & "data_writes" counters.)

Excellent suggestion and I've added IMC data_reads and data_writes to 
the tool.

> 
> 
> Is "wait" value supposed to be IO-wait for given engine interface?
>
> I never saw that change from 0%, although IO-wait in top jumped
> from 0 to 20-30% with my test GPU load.

No, that is time spent in MI_WAIT_FOR_EVENT. I think it is not used much in 
the current codebase.

> 
> HW specific test results
> ------------------------
> 
> BYT:
> * Reports "Failed to initialize PMU!" although old intel_gpu_top
>    works fine.
> 
> HSW GT2,  BDW GT3, SKL GT2 and KBL GT3e seems to work fine except
> for the "wait" value.
> 
> I never saw blitter engine to do anything, but that's because
> modesetting uses just 3D pipeline, and because I couldn't get
> Intel DDX to work with rest of latest git version of X / 3D stack.

Thank you for testing this so thoroughly - this was really invaluable 
since I don't have access to such a number of platforms. I've tried to 
fix all this in the latest version.

> 
> 
> 
> Kernel version support
> ----------------------
> 
> My HW specific testing above was with drm-tip kernel, but I did one test
> also with Ubuntu 16.04 v4.4 kernel (which includes v4.6 or v4.8 i915 
> backport) on KBL.  For that, the tool reported:
> "Failed to detect engines!"
> 
> Although the previous intel_gpu_top works fine with that kernel version.
> 
> Same happens also with Ubuntu 17.04 v4.13 kernel.
> 
> 
> -> If new version needs a certain kernel version, it should tell
>     which version is required.

Yep, at least 4.16 is needed so I have added this info to the error message.

Thanks again for testing it, and when you find the time, if you could do 
it once more with the latest version (on the problematic platforms), that 
would be much appreciated.

Regards,

Tvrtko

> 
> 
>      - Eero
> 
> On 29.03.2018 13:33, Tvrtko Ursulin wrote:
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> intel-gpu-top is a dangerous tool which can hang machines due to unsafe mmio
>> register access. This patch rewrites it to use only PMU.
>>
>> Only overall command streamer busyness and GPU global data such as power
>> and frequencies are included in this new version.
>>
>> For access to more GPU functional unit level data, an OA metric based 
>> tool
>> like gpu-top should be used instead.
>>
>> v2:
>>   * Sort engines by class and instance.
>>   * Do not wait for one sampling period to display something on screen.
>>   * Move code out of the asserts. (Rinat Ibragimov)
>>   * Continuously adapt to terminal size. (Rinat Ibragimov)
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>> Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
>> Cc: Petri Latvala <petri.latvala@intel.com>
>> Cc: Eero Tamminen <eero.t.tamminen@intel.com>
>> Cc: Rinat Ibragimov <ibragimovrinat@mail.ru>
>> Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> # v1
>> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> # v0.5
>> ---
>>   tools/Makefile.am     |    2 +
>>   tools/intel_gpu_top.c | 1009 
>> +++++++++++++++++++++----------------------------
>>   tools/meson.build     |    6 +-
>>   3 files changed, 441 insertions(+), 576 deletions(-)
>>
>> diff --git a/tools/Makefile.am b/tools/Makefile.am
>> index 09b6dbcc3ece..a0b016ddd7ff 100644
>> --- a/tools/Makefile.am
>> +++ b/tools/Makefile.am
>> @@ -28,6 +28,8 @@ intel_aubdump_la_LDFLAGS = -module -avoid-version 
>> -no-undefined
>>   intel_aubdump_la_SOURCES = aubdump.c
>>   intel_aubdump_la_LIBADD = $(top_builddir)/lib/libintel_tools.la -ldl
>> +intel_gpu_top_LDADD = $(top_builddir)/lib/libigt_perf.la
>> +
>>   bin_SCRIPTS = intel_aubdump
>>   CLEANFILES = $(bin_SCRIPTS)
>> diff --git a/tools/intel_gpu_top.c b/tools/intel_gpu_top.c
>> index 098e6ce3ff86..94091d97c4a3 100644
>> --- a/tools/intel_gpu_top.c
>> +++ b/tools/intel_gpu_top.c
>> @@ -1,6 +1,5 @@
>>   /*
>> - * Copyright © 2007 Intel Corporation
>> - * Copyright © 2011 Intel Corporation
>> + * Copyright © 2018 Intel Corporation
>>    *
>>    * Permission is hereby granted, free of charge, to any person 
>> obtaining a
>>    * copy of this software and associated documentation files (the 
>> "Software"),
>> @@ -18,701 +17,561 @@
>>    * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO 
>> EVENT SHALL
>>    * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES 
>> OR OTHER
>>    * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 
>> ARISING
>> - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
>> - * DEALINGS IN THE SOFTWARE.
>> - *
>> - * Authors:
>> - *    Eric Anholt <eric@anholt.net>
>> - *    Eugeni Dodonov <eugeni.dodonov@intel.com>
>> - *
>> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
>> OTHER DEALINGS
>> + * IN THE SOFTWARE.
>>    */
>> -#include "config.h"
>> -
>> -#include <inttypes.h>
>> -#include <unistd.h>
>> -#include <stdlib.h>
>>   #include <stdio.h>
>> -#include <err.h>
>> -#include <sys/ioctl.h>
>> -#include <sys/time.h>
>> -#include <sys/wait.h>
>> +#include <sys/types.h>
>> +#include <dirent.h>
>> +#include <stdint.h>
>> +#include <assert.h>
>>   #include <string.h>
>> -#ifdef HAVE_TERMIOS_H
>> -#include <termios.h>
>> -#endif
>> -#include "intel_io.h"
>> -#include "instdone.h"
>> -#include "intel_reg.h"
>> -#include "intel_chipset.h"
>> -#include "drmtest.h"
>> -
>> -#define  FORCEWAKE        0xA18C
>> -#define  FORCEWAKE_ACK        0x130090
>> -
>> -#define SAMPLES_PER_SEC             10000
>> -#define SAMPLES_TO_PERCENT_RATIO    (SAMPLES_PER_SEC / 100)
>> -
>> -#define MAX_NUM_TOP_BITS            100
>> -
>> -#define HAS_STATS_REGS(devid)        IS_965(devid)
>> -
>> -struct top_bit {
>> -    struct instdone_bit *bit;
>> -    int count;
>> -} top_bits[MAX_NUM_TOP_BITS];
>> -struct top_bit *top_bits_sorted[MAX_NUM_TOP_BITS];
>> -
>> -static uint32_t instdone, instdone1;
>> -
>> -static const char *bars[] = {
>> -    " ",
>> -    "▏",
>> -    "▎",
>> -    "▍",
>> -    "▌",
>> -    "▋",
>> -    "▊",
>> -    "▉",
>> -    "█"
>> -};
>> +#include <ctype.h>
>> +#include <stdlib.h>
>> +#include <unistd.h>
>> +#include <sys/stat.h>
>> +#include <fcntl.h>
>> +#include <inttypes.h>
>> +#include <sys/ioctl.h>
>> +#include <errno.h>
>> +#include <math.h>
>> +#include <locale.h>
>> +
>> +#include "igt_perf.h"
>> -enum stats_counts {
>> -    IA_VERTICES,
>> -    IA_PRIMITIVES,
>> -    VS_INVOCATION,
>> -    GS_INVOCATION,
>> -    GS_PRIMITIVES,
>> -    CL_INVOCATION,
>> -    CL_PRIMITIVES,
>> -    PS_INVOCATION,
>> -    PS_DEPTH,
>> -    STATS_COUNT
>> +struct pmu_pair {
>> +    uint64_t cur;
>> +    uint64_t prev;
>>   };
>> -const uint32_t stats_regs[STATS_COUNT] = {
>> -    IA_VERTICES_COUNT_QW,
>> -    IA_PRIMITIVES_COUNT_QW,
>> -    VS_INVOCATION_COUNT_QW,
>> -    GS_INVOCATION_COUNT_QW,
>> -    GS_PRIMITIVES_COUNT_QW,
>> -    CL_INVOCATION_COUNT_QW,
>> -    CL_PRIMITIVES_COUNT_QW,
>> -    PS_INVOCATION_COUNT_QW,
>> -    PS_DEPTH_COUNT_QW,
>> +struct pmu_counter {
>> +    uint64_t config;
>> +    unsigned int idx;
>> +    struct pmu_pair val;
>>   };
>> -const char *stats_reg_names[STATS_COUNT] = {
>> -    "vert fetch",
>> -    "prim fetch",
>> -    "VS invocations",
>> -    "GS invocations",
>> -    "GS prims",
>> -    "CL invocations",
>> -    "CL prims",
>> -    "PS invocations",
>> -    "PS depth pass",
>> +struct engine {
>> +    const char *name;
>> +    struct pmu_counter busy;
>> +    struct pmu_counter wait;
>> +    struct pmu_counter sema;
>>   };
>> -uint64_t stats[STATS_COUNT];
>> -uint64_t last_stats[STATS_COUNT];
>> +struct engines {
>> +    unsigned int num_engines;
>> +    unsigned int num_counters;
>> +    DIR *root;
>> +    int fd;
>> +    struct pmu_pair ts;
>> -static unsigned long
>> -gettime(void)
>> -{
>> -    struct timeval t;
>> -    gettimeofday(&t, NULL);
>> -    return (t.tv_usec + (t.tv_sec * 1000000));
>> -}
>> +    int rapl_fd;
>> +    double rapl_scale;
>> -static int
>> -top_bits_sort(const void *a, const void *b)
>> +    struct pmu_counter freq_req;
>> +    struct pmu_counter freq_act;
>> +    struct pmu_counter irq;
>> +    struct pmu_counter rc6;
>> +    struct pmu_counter rapl;
>> +
>> +    struct engine engine;
>> +};
>> +
>> +static uint64_t
>> +get_pmu_config(int dirfd, const char *name, const char *counter)
>>   {
>> -    struct top_bit * const *bit_a = a;
>> -    struct top_bit * const *bit_b = b;
>> -    int a_count = (*bit_a)->count;
>> -    int b_count = (*bit_b)->count;
>> +    char buf[128], *p;
>> +    int fd, ret;
>> -    if (a_count < b_count)
>> -        return 1;
>> -    else if (a_count == b_count)
>> -        return 0;
>> -    else
>> +    ret = snprintf(buf, sizeof(buf), "%s-%s", name, counter);
>> +    if (ret < 0 || ret == sizeof(buf))
>>           return -1;
>> -}
>> -static void
>> -update_idle_bit(struct top_bit *top_bit)
>> -{
>> -    uint32_t reg_val;
>> +    fd = openat(dirfd, buf, O_RDONLY);
>> +    if (fd < 0)
>> +        return -1;
>> -    if (top_bit->bit->reg == INSTDONE_1)
>> -        reg_val = instdone1;
>> -    else
>> -        reg_val = instdone;
>> +    ret = read(fd, buf, sizeof(buf));
>> +    close(fd);
>> +    if (ret <= 0)
>> +        return -1;
>> +
>> +    p = index(buf, '0');
>> +    if (!p)
>> +        return -1;
>> -    if ((reg_val & top_bit->bit->bit) == 0)
>> -        top_bit->count++;
>> +    return strtoul(p, NULL, 0);
>>   }
>> -static void
>> -print_clock(const char *name, int clock) {
>> -    if (clock == -1)
>> -        printf("%s clock: unknown", name);
>> +#define engine_ptr(engines, n) \
>> +    ((struct engine *)((unsigned char *)(&engines->engine) + (n) * 
>> sizeof(struct engine)))
>> +
>> +static int engine_cmp(const void *__a, const void *__b)
>> +{
>> +    const struct engine *a = (struct engine *)__a;
>> +    const struct engine *b = (struct engine *)__b;
>> +    int class_a = (a->busy.config & (__I915_PMU_OTHER(0) - 1)) >>
>> +              I915_PMU_CLASS_SHIFT;
>> +    int class_b = (b->busy.config & (__I915_PMU_OTHER(0) - 1)) >>
>> +              I915_PMU_CLASS_SHIFT;
>> +    int instance_a = (a->busy.config >> I915_PMU_SAMPLE_BITS) &
>> +             ((1 << I915_PMU_SAMPLE_INSTANCE_BITS) - 1);
>> +    int instance_b = (b->busy.config >> I915_PMU_SAMPLE_BITS) &
>> +             ((1 << I915_PMU_SAMPLE_INSTANCE_BITS) - 1);
>> +
>> +    if (class_a != class_b)
>> +        return class_a - class_b;
>>       else
>> -        printf("%s clock: %d Mhz", name, clock);
>> +        return instance_a - instance_b;
>>   }
>> -static int
>> -print_clock_info(struct pci_device *pci_dev)
>> +static struct engines *discover_engines(void)
>>   {
>> -    uint32_t devid = pci_dev->device_id;
>> -    uint16_t gcfgc;
>> +    const char *sysfs_root = "/sys/devices/i915/events";
>> +    struct engines *engines;
>> +    struct dirent *dent;
>> +    int ret = 0;
>> +    DIR *d;
>> -    if (IS_GM45(devid)) {
>> -        int core_clock = -1;
>> +    engines = malloc(sizeof(struct engines));
>> +    if (!engines)
>> +        return NULL;
>> -        pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
>> +    memset(engines, 0, sizeof(*engines));
>> -        switch (gcfgc & 0xf) {
>> -        case 8:
>> -            core_clock = 266;
>> -            break;
>> -        case 9:
>> -            core_clock = 320;
>> -            break;
>> -        case 11:
>> -            core_clock = 400;
>> -            break;
>> -        case 13:
>> -            core_clock = 533;
>> -            break;
>> -        }
>> -        print_clock("core", core_clock);
>> -    } else if (IS_965(devid) && IS_MOBILE(devid)) {
>> -        int render_clock = -1, sampler_clock = -1;
>> +    engines->num_engines = 0;
>> -        pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
>> +    d = opendir(sysfs_root);
>> +    if (!d)
>> +        return NULL;
>> -        switch (gcfgc & 0xf) {
>> -        case 2:
>> -            render_clock = 250; sampler_clock = 267;
>> -            break;
>> -        case 3:
>> -            render_clock = 320; sampler_clock = 333;
>> -            break;
>> -        case 4:
>> -            render_clock = 400; sampler_clock = 444;
>> -            break;
>> -        case 5:
>> -            render_clock = 500; sampler_clock = 533;
>> -            break;
>> -        }
>> -
>> -        print_clock("render", render_clock);
>> -        printf("  ");
>> -        print_clock("sampler", sampler_clock);
>> -    } else if (IS_945(devid) && IS_MOBILE(devid)) {
>> -        int render_clock = -1, display_clock = -1;
>> +    while ((dent = readdir(d)) != NULL) {
>> +        const char *endswith = "-busy";
>> +        const unsigned int endlen = strlen(endswith);
>> +        struct engine *engine =
>> +                engine_ptr(engines, engines->num_engines);
>> +        char buf[256];
>> -        pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
>> +        if (dent->d_type != DT_REG)
>> +            continue;
>> -        switch (gcfgc & 0x7) {
>> -        case 0:
>> -            render_clock = 166;
>> -            break;
>> -        case 1:
>> -            render_clock = 200;
>> -            break;
>> -        case 3:
>> -            render_clock = 250;
>> -            break;
>> -        case 5:
>> -            render_clock = 400;
>> +        if (strlen(dent->d_name) >= sizeof(buf)) {
>> +            ret = -1;
>>               break;
>>           }
>> -        switch (gcfgc & 0x70) {
>> -        case 0:
>> -            display_clock = 200;
>> -            break;
>> -        case 4:
>> -            display_clock = 320;
>> -            break;
>> -        }
>> -        if (gcfgc & (1 << 7))
>> -            display_clock = 133;
>> +        strcpy(buf, dent->d_name);
>> -        print_clock("render", render_clock);
>> -        printf("  ");
>> -        print_clock("display", display_clock);
>> -    } else if (IS_915(devid) && IS_MOBILE(devid)) {
>> -        int render_clock = -1, display_clock = -1;
>> +        /* xxxN-busy */
>> +        if (strlen(buf) < (endlen + 4))
>> +            continue;
>> +        if (strcmp(&buf[strlen(buf) - endlen], endswith))
>> +            continue;
>> -        pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
>> +        memset(engine, 0, sizeof(*engine));
>> -        switch (gcfgc & 0x7) {
>> -        case 0:
>> -            render_clock = 160;
>> -            break;
>> -        case 1:
>> -            render_clock = 190;
>> -            break;
>> -        case 4:
>> -            render_clock = 333;
>> +        buf[strlen(buf) - endlen] = 0;
>> +        engine->name = strdup(buf);
>> +        if (!engine->name) {
>> +            ret = -1;
>>               break;
>>           }
>> -        if (gcfgc & (1 << 13))
>> -            render_clock = 133;
>> -        switch (gcfgc & 0x70) {
>> -        case 0:
>> -            display_clock = 190;
>> +        engine->busy.config = get_pmu_config(dirfd(d), engine->name,
>> +                             "busy");
>> +        if (engine->busy.config == -1) {
>> +            ret = -1;
>>               break;
>> -        case 4:
>> -            display_clock = 333;
>> +        }
>> +
>> +        engines->num_engines++;
>> +        engines = realloc(engines, sizeof(struct engines) +
>> +                  engines->num_engines * sizeof(struct engine));
>> +        if (!engines) {
>> +            ret = -ENOMEM;
>>               break;
>>           }
>> -        if (gcfgc & (1 << 7))
>> -            display_clock = 133;
>> +    }
>> +
>> +    if (ret)
>> +        free(engines);
>> +    else {
>> +        qsort(engine_ptr(engines, 0), engines->num_engines,
>> +              sizeof(struct engine), engine_cmp);
>> -        print_clock("render", render_clock);
>> -        printf("  ");
>> -        print_clock("display", display_clock);
>> +        engines->root = d;
>>       }
>> +    return ret == 0 ? engines : NULL;
>> +}
>> +
>> +static int
>> +filename_to_buf(const char *filename, char *buf, unsigned int bufsize)
>> +{
>> +    int fd;
>> +    ssize_t ret;
>> +
>> +    fd = open(filename, O_RDONLY);
>> +    if (fd < 0)
>> +        return -1;
>> +
>> +    ret = read(fd, buf, bufsize - 1);
>> +    close(fd);
>> +    if (ret < 1)
>> +        return -1;
>> +
>> +    buf[ret] = '\0';
>> -    printf("\n");
>> -    return -1;
>> +    return 0;
>>   }
>> -#define STATS_LEN (20)
>> -#define PERCENTAGE_BAR_END    (79 - STATS_LEN)
>> +static uint64_t filename_to_u64(const char *filename, int base)
>> +{
>> +    char buf[64], *b;
>> -static void
>> -print_percentage_bar(float percent, int cur_line_len)
>> +    if (filename_to_buf(filename, buf, sizeof(buf)))
>> +        return 0;
>> +
>> +    /*
>> +     * Handle both single integer and key=value formats by skipping
>> +     * leading non-digits.
>> +     */
>> +    b = buf;
>> +    while (*b && !isdigit(*b))
>> +        b++;
>> +
>> +    return strtoull(b, NULL, base);
>> +}
>> +
>> +static uint64_t rapl_type_id(void)
>>   {
>> -    int bar_avail_len = (PERCENTAGE_BAR_END - cur_line_len - 1) * 8;
>> -    int bar_len = bar_avail_len * (percent + .5) / 100.0;
>> -    int i;
>> +    return filename_to_u64("/sys/devices/power/type", 10);
>> +}
>> -    for (i = bar_len; i >= 8; i -= 8) {
>> -        printf("%s", bars[8]);
>> -        cur_line_len++;
>> -    }
>> -    if (i) {
>> -        printf("%s", bars[i]);
>> -        cur_line_len++;
>> -    }
>> +static uint64_t rapl_gpu_power(void)
>> +{
>> +    return filename_to_u64("/sys/devices/power/events/energy-gpu", 0);
>> +}
>> +
>> +static double filename_to_double(const char *filename)
>> +{
>> +    char *oldlocale;
>> +    char buf[80];
>> +    double v;
>> +
>> +    if (filename_to_buf(filename, buf, sizeof(buf)))
>> +        return 0;
>> -    /* NB: We can't use a field width with utf8 so we manually
>> -    * guarantee a field with of 45 chars for any bar. */
>> -    printf("%*s", PERCENTAGE_BAR_END - cur_line_len, "");
>> +    oldlocale = setlocale(LC_ALL, "C");
>> +    v = strtod(buf, NULL);
>> +    setlocale(LC_ALL, oldlocale);
>> +
>> +    return v;
>>   }
>> -struct ring {
>> -    const char *name;
>> -    uint32_t mmio;
>> -    int head, tail, size;
>> -    uint64_t full;
>> -    int idle;
>> -};
>> +static double rapl_gpu_power_scale(void)
>> +{
>> +    return 
>> filename_to_double("/sys/devices/power/events/energy-gpu.scale");
>> +}
>> -static uint32_t ring_read(struct ring *ring, uint32_t reg)
>> +#define __open_pmu(engines, pmu, idx) \
>> +({ \
>> +    int fd__; \
>> +\
>> +    fd__ = perf_i915_open_group((pmu)->config, (engines)->fd); \
>> +    if (fd__ >= 0) { \
>> +        if ((engines)->fd == -1) \
>> +            (engines)->fd = fd__; \
>> +        (pmu)->idx = (idx)++; \
>> +        (engines)->num_counters++; \
>> +    } \
>> +\
>> +    fd__; \
>> +})
>> +
>> +static int pmu_init(struct engines *engines)
>>   {
>> -    return INREG(ring->mmio + reg);
>> +    unsigned int idx = 0;
>> +    unsigned int i;
>> +    int fd;
>> +
>> +    engines->fd = -1;
>> +    engines->num_counters = 0;
>> +
>> +    engines->freq_req.config = I915_PMU_REQUESTED_FREQUENCY;
>> +    fd = __open_pmu(engines, &engines->freq_req, idx);
>> +    if (fd < 0)
>> +        return -1;
>> +
>> +    engines->freq_act.config = I915_PMU_ACTUAL_FREQUENCY;
>> +    fd = __open_pmu(engines, &engines->freq_act, idx);
>> +    if (fd < 0)
>> +        return -1;
>> +
>> +    engines->irq.config = I915_PMU_INTERRUPTS;
>> +    fd = __open_pmu(engines, &engines->irq, idx);
>> +    if (fd < 0)
>> +        return -1;
>> +
>> +    engines->rc6.config = I915_PMU_RC6_RESIDENCY;
>> +    fd = __open_pmu(engines, &engines->rc6, idx);
>> +    if (fd < 0)
>> +        return -1;
>> +
>> +    for (i = 0; i < engines->num_engines; i++) {
>> +        struct engine *engine = engine_ptr(engines, i);
>> +        struct {
>> +            struct pmu_counter *pmu;
>> +            const char *counter;
>> +        } *cnt, counters[] = {
>> +            { .pmu = &engine->busy, .counter = "busy" },
>> +            { .pmu = &engine->wait, .counter = "wait" },
>> +            { .pmu = &engine->sema, .counter = "sema" },
>> +            { .pmu = NULL, .counter = NULL },
>> +        };
>> +
>> +        for (cnt = counters; cnt->pmu; cnt++) {
>> +            if (!cnt->pmu->config)
>> +                cnt->pmu->config =
>> +                    get_pmu_config(dirfd(engines->root),
>> +                               engine->name,
>> +                               cnt->counter);
>> +            fd = __open_pmu(engines, cnt->pmu, idx);
>> +            if (fd < 0)
>> +                return -1;
>> +        }
>> +    }
>> +
>> +    engines->rapl_scale = rapl_gpu_power_scale();
>> +    if (engines->rapl_scale != NAN)
>> +        engines->rapl_scale *= 1e3; /* from nano to micro */
>> +    engines->rapl.config = rapl_gpu_power();
>> +    engines->rapl_fd = igt_perf_open(rapl_type_id(), 
>> engines->rapl.config);
>> +    if (engines->rapl_fd < 0)
>> +        return -1;
>> +
>> +    return 0;
>>   }
>> -static void ring_init(struct ring *ring)
>> +static uint64_t pmu_read_multi(int fd, unsigned int num, uint64_t *val)
>>   {
>> -    ring->size = (((ring_read(ring, RING_LEN) & RING_NR_PAGES) >> 12) 
>> + 1) * 4096;
>> +    uint64_t buf[2 + num];
>> +    unsigned int i;
>> +    ssize_t len;
>> +
>> +    memset(buf, 0, sizeof(buf));
>> +
>> +    len = read(fd, buf, sizeof(buf));
>> +    assert(len == sizeof(buf));
>> +
>> +    for (i = 0; i < num; i++)
>> +        val[i] = buf[2 + i];
>> +
>> +    return buf[1];
>>   }
>> -static void ring_reset(struct ring *ring)
>> +static double pmu_calc(struct pmu_pair *p, double d, double t, double s)
>>   {
>> -    ring->idle = ring->full = 0;
>> +    double pct;
>> +
>> +    pct = p->cur - p->prev;
>> +    pct /= d;
>> +    pct /= t;
>> +    pct *= s;
>> +
>> +    if (s == 100.0 && pct > 100.0)
>> +        pct = 100.0;
>> +
>> +    return pct;
>>   }
>> -static void ring_sample(struct ring *ring)
>> +static uint64_t __pmu_read_single(int fd, uint64_t *ts)
>>   {
>> -    int full;
>> +    uint64_t data[2] = { };
>> +    ssize_t len;
>> -    if (!ring->size)
>> -        return;
>> +    len = read(fd, data, sizeof(data));
>> +    assert(len == sizeof(data));
>> -    ring->head = ring_read(ring, RING_HEAD) & HEAD_ADDR;
>> -    ring->tail = ring_read(ring, RING_TAIL) & TAIL_ADDR;
>> +    if (ts)
>> +        *ts = data[1];
>> +
>> +    return data[0];
>> +}
>> -    if (ring->tail == ring->head)
>> -        ring->idle++;
>> +static uint64_t pmu_read_single(int fd)
>> +{
>> +    return __pmu_read_single(fd, NULL);
>> +}
>> -    full = ring->tail - ring->head;
>> -    if (full < 0)
>> -        full += ring->size;
>> -    ring->full += full;
>> +static void __update_sample(struct pmu_counter *counter, uint64_t val)
>> +{
>> +    counter->val.prev = counter->val.cur;
>> +    counter->val.cur = val;
>>   }
>> -static void ring_print_header(FILE *out, struct ring *ring)
>> +static void update_sample(struct pmu_counter *counter, uint64_t *val)
>>   {
>> -    fprintf(out, "%.6s%%\tops\t",
>> -            ring->name
>> -          );
>> +    __update_sample(counter, val[counter->idx]);
>>   }
>> -static void ring_print(struct ring *ring, unsigned long samples_per_sec)
>> +static void pmu_sample(struct engines *engines)
>>   {
>> -    int percent_busy, len;
>> +    const int num_val = engines->num_counters;
>> +    uint64_t val[num_val];
>> +    unsigned int i;
>> +
>> +    engines->ts.prev = engines->ts.cur;
>> +    engines->ts.cur = pmu_read_multi(engines->fd, num_val, val);
>> +
>> +    __update_sample(&engines->rapl, pmu_read_single(engines->rapl_fd));
>> -    if (!ring->size)
>> -        return;
>> +    update_sample(&engines->freq_req, val);
>> +    update_sample(&engines->freq_act, val);
>> +    update_sample(&engines->irq, val);
>> +    update_sample(&engines->rc6, val);
>> -    percent_busy = 100 - 100 * ring->idle / samples_per_sec;
>> +    for (i = 0; i < engines->num_engines; i++) {
>> +        struct engine *engine = engine_ptr(engines, i);
>> -    len = printf("%25s busy: %3d%%: ", ring->name, percent_busy);
>> -    print_percentage_bar (percent_busy, len);
>> -    printf("%24s space: %d/%d\n",
>> -           ring->name,
>> -           (int)(ring->full / samples_per_sec),
>> -           ring->size);
>> +        update_sample(&engine->busy, val);
>> +        update_sample(&engine->sema, val);
>> +        update_sample(&engine->wait, val);
>> +    }
>>   }
>> -static void ring_log(struct ring *ring, unsigned long samples_per_sec,
>> -        FILE *output)
>> +static const char *bars[] = { " ", "▏", "▎", "▍", "▌", "▋", "▊", "▉", 
>> "█" };
>> +
>> +static void
>> +print_percentage_bar(double percent, int max_len)
>>   {
>> -    if (ring->size)
>> -        fprintf(output, "%3d\t%d\t",
>> -            (int)(100 - 100 * ring->idle / samples_per_sec),
>> -            (int)(ring->full / samples_per_sec));
>> -    else
>> -        fprintf(output, "-1\t-1\t");
>> +    int bar_len = percent * (8 * (max_len - 2)) / 100.0;
>> +    int i;
>> +
>> +    putchar('|');
>> +
>> +    for (i = bar_len; i >= 8; i -= 8)
>> +        printf("%s", bars[8]);
>> +    if (i)
>> +        printf("%s", bars[i]);
>> +
>> +    for (i = 0; i < (max_len - 2 - (bar_len + 7) / 8); i++)
>> +        putchar(' ');
>> +
>> +    putchar('|');
>>   }
>> +#define DEFAULT_PERIOD_MS (1000)
>> +
>>   static void
>>   usage(const char *appname)
>>   {
>>       printf("intel_gpu_top - Display a top-like summary of Intel GPU 
>> usage\n"
>> -            "\n"
>> -            "usage: %s [parameters]\n"
>> -            "\n"
>> -            "The following parameters apply:\n"
>> -            "[-s <samples>]       samples per seconds (default %d)\n"
>> -            "[-e <command>]       command to profile\n"
>> -            "[-o <file>]          output statistics to file. If file 
>> is '-',"
>> -            "                     run in batch mode and output 
>> statistics to stdio only \n"
>> -            "[-h]                 show this help screen\n"
>> -            "\n",
>> -            appname,
>> -            SAMPLES_PER_SEC
>> -          );
>> -    return;
>> +        "\n"
>> +        "Usage: %s [parameters]\n"
>> +        "\n"
>> +        "\tThe following parameters are optional:\n"
>> +        "\t[-s <samples>]       refresh period in ms (default %ums)\n"
>> +        "\t[-h]                 show this help text\n"
>> +        "\n",
>> +        appname, DEFAULT_PERIOD_MS);
>>   }
>>   int main(int argc, char **argv)
>>   {
>> -    uint32_t devid;
>> -    struct pci_device *pci_dev;
>> -    struct ring render_ring = {
>> -        .name = "render",
>> -        .mmio = 0x2030,
>> -    }, bsd_ring = {
>> -        .name = "bitstream",
>> -        .mmio = 0x4030,
>> -    }, bsd6_ring = {
>> -        .name = "bitstream",
>> -        .mmio = 0x12030,
>> -    }, blt_ring = {
>> -        .name = "blitter",
>> -        .mmio = 0x22030,
>> -    };
>> -    int i, ch;
>> -    int samples_per_sec = SAMPLES_PER_SEC;
>> -    FILE *output = NULL;
>> -    double elapsed_time=0;
>> -    int print_headers=1;
>> -    pid_t child_pid=-1;
>> -    int child_stat;
>> -    char *cmd=NULL;
>> -    int interactive=1;
>> -
>> -    /* Parse options? */
>> -    while ((ch = getopt(argc, argv, "s:o:e:h")) != -1) {
>> +    unsigned int period_us = DEFAULT_PERIOD_MS * 1000;
>> +    int con_w = -1, con_h = -1;
>> +    struct engines *engines;
>> +    unsigned int i;
>> +    int ret, ch;
>> +
>> +    /* Parse options */
>> +    while ((ch = getopt(argc, argv, "s:h")) != -1) {
>>           switch (ch) {
>> -        case 'e': cmd = strdup(optarg);
>> -            break;
>> -        case 's': samples_per_sec = atoi(optarg);
>> -            if (samples_per_sec < 100) {
>> -                fprintf(stderr, "Error: samples per second must be >= 
>> 100\n");
>> -                exit(1);
>> -            }
>> -            break;
>> -        case 'o':
>> -            if (!strcmp(optarg, "-")) {
>> -                /* Running in non-interactive mode */
>> -                interactive = 0;
>> -                output = stdout;
>> -            }
>> -            else
>> -                output = fopen(optarg, "w");
>> -            if (!output)
>> -            {
>> -                perror("fopen");
>> -                exit(1);
>> -            }
>> +        case 's':
>> +            period_us = atoi(optarg) * 1000;
>>               break;
>>           case 'h':
>>               usage(argv[0]);
>>               exit(0);
>> -            break;
>>           default:
>> -            fprintf(stderr, "Invalid flag %c!\n", (char)optopt);
>> +            fprintf(stderr, "Invalid option %c!\n", (char)optopt);
>>               usage(argv[0]);
>>               exit(1);
>> -            break;
>>           }
>>       }
>> -    pci_dev = intel_get_pci_device();
>> -    devid = pci_dev->device_id;
>> -    intel_mmio_use_pci_bar(pci_dev);
>> -    init_instdone_definitions(devid);
>> -
>> -    /* Do we have a command to run? */
>> -    if (cmd != NULL) {
>> -        if (output) {
>> -            fprintf(output, "# Profiling: %s\n", cmd);
>> -            fflush(output);
>> -        }
>> -        child_pid = fork();
>> -        if (child_pid < 0) {
>> -            perror("fork");
>> -            exit(1);
>> -        }
>> -        else if (child_pid == 0) {
>> -            int res;
>> -            res = system(cmd);
>> -            if (res < 0)
>> -                perror("running command");
>> -            if (output) {
>> -                fflush(output);
>> -                fprintf(output, "# %s exited with status %d\n", cmd, 
>> res);
>> -                fflush(output);
>> -            }
>> -            free(cmd);
>> -            exit(0);
>> -        } else {
>> -            free(cmd);
>> -        }
>> +    engines = discover_engines();
>> +    if (!engines) {
>> +        fprintf(stderr, "Failed to detect engines!\n");
>> +        return 1;
>>       }
>> -    for (i = 0; i < num_instdone_bits; i++) {
>> -        top_bits[i].bit = &instdone_bits[i];
>> -        top_bits[i].count = 0;
>> -        top_bits_sorted[i] = &top_bits[i];
>> +    ret = pmu_init(engines);
>> +    if (ret) {
>> +        fprintf(stderr, "Failed to initialize PMU!\n");
>> +        return 1;
>>       }
>> -    /* Grab access to the registers */
>> -    intel_register_access_init(pci_dev, 0, -1);
>> +    pmu_sample(engines);
>> -    ring_init(&render_ring);
>> -    if (IS_GEN4(devid) || IS_GEN5(devid))
>> -        ring_init(&bsd_ring);
>> -    if (IS_GEN6(devid) || IS_GEN7(devid)) {
>> -        ring_init(&bsd6_ring);
>> -        ring_init(&blt_ring);
>> -    }
>> +    for (;;) {
>> +        double t, freq[2], irq, rc6, power;
>> +        struct winsize ws;
>> +        int lines = 0;
>> -    /* Initialize GPU stats */
>> -    if (HAS_STATS_REGS(devid)) {
>> -        for (i = 0; i < STATS_COUNT; i++) {
>> -            uint32_t stats_high, stats_low, stats_high_2;
>> +        /* Update terminal size. */
>> +        if (ioctl(0, TIOCGWINSZ, &ws) != -1) {
>> +            con_w = ws.ws_col;
>> +            con_h = ws.ws_row;
>> +        }
>> -            do {
>> -                stats_high = INREG(stats_regs[i] + 4);
>> -                stats_low = INREG(stats_regs[i]);
>> -                stats_high_2 = INREG(stats_regs[i] + 4);
>> -            } while (stats_high != stats_high_2);
>> +        pmu_sample(engines);
>> +        t = (double)(engines->ts.cur - engines->ts.prev) / 1e9;
>> -            last_stats[i] = (uint64_t)stats_high << 32 |
>> -                stats_low;
>> -        }
>> -    }
>> +        printf("\033[H\033[J");
>> -    for (;;) {
>> -        int j;
>> -        unsigned long long t1, ti, tf, t2;
>> -        unsigned long long def_sleep = 1000000 / samples_per_sec;
>> -        unsigned long long last_samples_per_sec = samples_per_sec;
>> -        unsigned short int max_lines;
>> -        struct winsize ws;
>> -        char clear_screen[] = {0x1b, '[', 'H',
>> -                       0x1b, '[', 'J',
>> -                       0x0};
>> -        int percent;
>> -        int len;
>> -
>> -        t1 = gettime();
>> -
>> -        ring_reset(&render_ring);
>> -        ring_reset(&bsd_ring);
>> -        ring_reset(&bsd6_ring);
>> -        ring_reset(&blt_ring);
>> -
>> -        for (i = 0; i < samples_per_sec; i++) {
>> -            long long interval;
>> -            ti = gettime();
>> -            if (IS_965(devid)) {
>> -                instdone = INREG(INSTDONE_I965);
>> -                instdone1 = INREG(INSTDONE_1);
>> -            } else
>> -                instdone = INREG(INSTDONE);
>> -
>> -            for (j = 0; j < num_instdone_bits; j++)
>> -                update_idle_bit(&top_bits[j]);
>> -
>> -            ring_sample(&render_ring);
>> -            ring_sample(&bsd_ring);
>> -            ring_sample(&bsd6_ring);
>> -            ring_sample(&blt_ring);
>> -
>> -            tf = gettime();
>> -            if (tf - t1 >= 1000000) {
>> -                /* We are out of sync, bail out */
>> -                last_samples_per_sec = i+1;
>> -                break;
>> -            }
>> -            interval = def_sleep - (tf - ti);
>> -            if (interval > 0)
>> -                usleep(interval);
>> -        }
>> +        freq[0] = pmu_calc(&engines->freq_req.val, 1.0, t, 1);
>> +        freq[1] = pmu_calc(&engines->freq_act.val, 1.0, t, 1);
>> +        irq = pmu_calc(&engines->irq.val, 1.0, t, 1);
>> +        rc6 = pmu_calc(&engines->rc6.val, 1e9, t, 100);
>> +        power = pmu_calc(&engines->rapl.val, 1.0, t,
>> +                 engines->rapl_scale);
>> -        if (HAS_STATS_REGS(devid)) {
>> -            for (i = 0; i < STATS_COUNT; i++) {
>> -                uint32_t stats_high, stats_low, stats_high_2;
>> +        printf("intel-gpu-top - %4.0f/%4.0f MHz;  %3.0f%% RC6; 
>> %6.0fmW; %8.0f irqs/s\n",
>> +               freq[0], freq[1], rc6, power, irq);
>> +        lines++;
>> -                do {
>> -                    stats_high = INREG(stats_regs[i] + 4);
>> -                    stats_low = INREG(stats_regs[i]);
>> -                    stats_high_2 = INREG(stats_regs[i] + 4);
>> -                } while (stats_high != stats_high_2);
>> +        printf("\n");
>> +        lines++;
>> -                stats[i] = (uint64_t)stats_high << 32 |
>> -                    stats_low;
>> -            }
>> -        }
>> +        for (i = 0; i < engines->num_engines && lines < con_h; i++) {
>> +            struct engine *engine = engine_ptr(engines, i);
>> +            unsigned int max_w = con_w - 1;
>> +            unsigned int len;
>> +            double val[2];
>> +            char buf[128];
>> -        qsort(top_bits_sorted, num_instdone_bits,
>> -              sizeof(struct top_bit *), top_bits_sort);
>> -
>> -        /* Limit the number of lines printed to the terminal height 
>> so the
>> -         * most important info (at the top) will stay on screen. */
>> -        max_lines = -1;
>> -        if (ioctl(0, TIOCGWINSZ, &ws) != -1)
>> -            max_lines = ws.ws_row - 6; /* exclude header lines */
>> -        if (max_lines >= num_instdone_bits)
>> -            max_lines = num_instdone_bits;
>> -
>> -        t2 = gettime();
>> -        elapsed_time += (t2 - t1) / 1000000.0;
>> -
>> -        if (interactive) {
>> -            printf("%s", clear_screen);
>> -            print_clock_info(pci_dev);
>> -
>> -            ring_print(&render_ring, last_samples_per_sec);
>> -            ring_print(&bsd_ring, last_samples_per_sec);
>> -            ring_print(&bsd6_ring, last_samples_per_sec);
>> -            ring_print(&blt_ring, last_samples_per_sec);
>> -
>> -            printf("\n%30s  %s\n", "task", "percent busy");
>> -            for (i = 0; i < max_lines; i++) {
>> -                if (top_bits_sorted[i]->count > 0) {
>> -                    percent = (top_bits_sorted[i]->count * 100) /
>> -                        last_samples_per_sec;
>> -                    len = printf("%30s: %3d%%: ",
>> -                             top_bits_sorted[i]->bit->name,
>> -                             percent);
>> -                    print_percentage_bar (percent, len);
>> -                } else {
>> -                    printf("%*s", PERCENTAGE_BAR_END, "");
>> -                }
>> -
>> -                if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
>> -                    printf("%13s: %llu (%lld/sec)",
>> -                           stats_reg_names[i],
>> -                           (long long)stats[i],
>> -                           (long long)(stats[i] - last_stats[i]));
>> -                    last_stats[i] = stats[i];
>> -                } else {
>> -                    if (!top_bits_sorted[i]->count)
>> -                        break;
>> -                }
>> -                printf("\n");
>> -            }
>> -        }
>> -        if (output) {
>> -            /* Print headers for columns at first run */
>> -            if (print_headers) {
>> -                fprintf(output, "# time\t");
>> -                ring_print_header(output, &render_ring);
>> -                ring_print_header(output, &bsd_ring);
>> -                ring_print_header(output, &bsd6_ring);
>> -                ring_print_header(output, &blt_ring);
>> -                for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
>> -                    if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
>> -                        fprintf(output, "%.6s\t",
>> -                               stats_reg_names[i]
>> -                               );
>> -                    }
>> -                    if (!top_bits[i].count)
>> -                        continue;
>> -                }
>> -                fprintf(output, "\n");
>> -                print_headers = 0;
>> -            }
>> -
>> -            /* Print statistics */
>> -            fprintf(output, "%.2f\t", elapsed_time);
>> -            ring_log(&render_ring, last_samples_per_sec, output);
>> -            ring_log(&bsd_ring, last_samples_per_sec, output);
>> -            ring_log(&bsd6_ring, last_samples_per_sec, output);
>> -            ring_log(&blt_ring, last_samples_per_sec, output);
>> -
>> -            for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
>> -                if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
>> -                    fprintf(output, "%"PRIu64"\t",
>> -                           stats[i] - last_stats[i]);
>> -                    last_stats[i] = stats[i];
>> -                }
>> -                    if (!top_bits[i].count)
>> -                        continue;
>> -            }
>> -            fprintf(output, "\n");
>> -            fflush(output);
>> -        }
>> +            val[0] = pmu_calc(&engine->wait.val, 1e9, t, 100);
>> +            val[1] = pmu_calc(&engine->sema.val, 1e9, t, 100);
>> +            len = snprintf(buf, sizeof(buf),
>> +                       "%6.2f%% wait, %6.2f%% sema",
>> +                       val[0], val[1]);
>> -        for (i = 0; i < num_instdone_bits; i++) {
>> -            top_bits_sorted[i]->count = 0;
>> +            val[0] = pmu_calc(&engine->busy.val, 1e9, t, 100);
>> +            len += printf("%8s %6.2f%% ",
>> +                      engine->name, val[0]);
>> +            print_percentage_bar(val[0], max_w - len);
>> -            if (i < STATS_COUNT)
>> -                last_stats[i] = stats[i];
>> -        }
>> +            printf("%s\n", buf);
>> -        /* Check if child has gone */
>> -        if (child_pid > 0) {
>> -            int res;
>> -            if ((res = waitpid(child_pid, &child_stat, WNOHANG)) == 
>> -1) {
>> -                perror("waitpid");
>> -                exit(1);
>> -            }
>> -            if (res == 0)
>> -                continue;
>> -            if (WIFEXITED(child_stat))
>> -                break;
>> +            lines++;
>>           }
>> -    }
>> -    fclose(output);
>> +        printf("\n");
>> +
>> +        usleep(period_us);
>> +    }
>> -    intel_register_access_fini();
>>       return 0;
>>   }
>> diff --git a/tools/meson.build b/tools/meson.build
>> index bd2d313d5156..a918eeb0bef1 100644
>> --- a/tools/meson.build
>> +++ b/tools/meson.build
>> @@ -23,7 +23,6 @@ tools_progs = [
>>       'intel_gpu_frequency',
>>       'intel_firmware_decode',
>>       'intel_gpu_time',
>> -    'intel_gpu_top',
>>       'intel_gtt',
>>       'intel_guc_logger',
>>       'intel_infoframes',
>> @@ -117,6 +116,11 @@ shared_library('intel_aubdump', 'aubdump.c',
>>              name_prefix : '',
>>              install : true)
>> +executable('intel_gpu_top', 'intel_gpu_top.c',
>> +       install : true,
>> +       install_rpath : rpathdir,
>> +       dependencies : tool_deps + [ lib_igt_perf ])
>> +
>>   conf_data = configuration_data()
>>   conf_data.set('prefix', prefix)
>>   conf_data.set('exec_prefix', '${prefix}')
>>
> 
> _______________________________________________
> igt-dev mailing list
> igt-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/igt-dev
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [igt-dev] [PATCH i-g-t v3] intel-gpu-top: Rewrite the tool to be safe to use
  2018-03-30 19:15       ` Rinat Ibragimov
@ 2018-04-03  9:38           ` Tvrtko Ursulin
  2018-04-03  9:38           ` Tvrtko Ursulin
  1 sibling, 0 replies; 57+ messages in thread
From: Tvrtko Ursulin @ 2018-04-03  9:38 UTC (permalink / raw)
  To: Rinat Ibragimov, Tvrtko Ursulin; +Cc: igt-dev, Eero Tamminen, Intel-gfx


On 30/03/2018 20:15, Rinat Ibragimov wrote:
> 
> 
>> Четверг, 29 марта 2018, 21:46 +03:00 от Tvrtko Ursulin <tursulin@ursulin.net>:
>>
> 
>> +#define engine_ptr(engines, n) \
>> +((struct engine *)((unsigned char *)(&engines->engine) + (n) * sizeof(struct engine)))
> 
> I think (&engines->engine + (n)) is easier to read.

Absolutely agreed.

>> +if (fd < 0 && !cnt->optional)
>> +return -1;
> 
> I've tried to run it on Skylake on Linux 4.16, and intel_gpu_top is working, as long as
> I remove these lines. Otherwise it fails while trying "vcs1". Error message says about
> Linux 4.16, which is a bit confusing.
> 
> There are code that sets and tests "present" field of struct pmu_counter. So,
> I guess, it's fine to remove the code, and thus make all counters optional?
> 
>> +
>> +if (!cnt->present) {
>> +strncpy(buf, "---", bufsz);
>>   return;
>> +}
> 
> If you decide to make all counters optional, this will be used for "busy" numbers
> too. But "busy" is 6 characters wide, unlike "sema" and "wait", which are 3 each.

Yep I failed to implement this correctly. Fixed in v4 hopefully.

Definitely thanks a lot for this and previous feedback!

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [igt-dev] [PATCH i-g-t v3] intel-gpu-top: Rewrite the tool to be safe to use
@ 2018-04-03  9:38           ` Tvrtko Ursulin
  0 siblings, 0 replies; 57+ messages in thread
From: Tvrtko Ursulin @ 2018-04-03  9:38 UTC (permalink / raw)
  To: Rinat Ibragimov, Tvrtko Ursulin
  Cc: igt-dev, Eero Tamminen, Intel-gfx, Tvrtko Ursulin


On 30/03/2018 20:15, Rinat Ibragimov wrote:
> 
> 
>> Четверг, 29 марта 2018, 21:46 +03:00 от Tvrtko Ursulin <tursulin@ursulin.net>:
>>
> 
>> +#define engine_ptr(engines, n) \
>> +((struct engine *)((unsigned char *)(&engines->engine) + (n) * sizeof(struct engine)))
> 
> I think (&engines->engine + (n)) is easier to read.

Absolutely agreed.

>> +if (fd < 0 && !cnt->optional)
>> +return -1;
> 
> I've tried to run it on Skylake on Linux 4.16, and intel_gpu_top is working, as long as
> I remove these lines. Otherwise it fails while trying "vcs1". Error message says about
> Linux 4.16, which is a bit confusing.
> 
> There are code that sets and tests "present" field of struct pmu_counter. So,
> I guess, it's fine to remove the code, and thus make all counters optional?
> 
>> +
>> +if (!cnt->present) {
>> +strncpy(buf, "---", bufsz);
>>   return;
>> +}
> 
> If you decide to make all counters optional, this will be used for "busy" numbers
> too. But "busy" is 6 characters wide, unlike "sema" and "wait", which are 3 each.

Yep I failed to implement this correctly. Fixed in v4 hopefully.

Definitely thanks a lot for this and previous feedback!

Regards,

Tvrtko
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [igt-dev] [PATCH i-g-t v2] intel-gpu-top: Rewrite the tool to be safe to use
  2018-04-03  9:36       ` [Intel-gfx] " Tvrtko Ursulin
@ 2018-04-03 14:06         ` Eero Tamminen
  -1 siblings, 0 replies; 57+ messages in thread
From: Eero Tamminen @ 2018-04-03 14:06 UTC (permalink / raw)
  To: Tvrtko Ursulin, Tvrtko Ursulin, igt-dev; +Cc: Intel-gfx

Hi,

On 03.04.2018 12:36, Tvrtko Ursulin wrote:
> On 29/03/2018 15:30, Eero Tamminen wrote:
>> I tested this on HSW GT2, BYT, BDW GT3, SKL GT2 and KBL GT3e,
>> with Ubuntu 16.04 and 17.10, using Ubuntu default kernels (4.4 to 4.13)
>> and latest drm-tip build (4.16.0-rc7).
>>
>>
>> General comments
>> ----------------
>>
>> This will be used by our customers and people who aren't necessarily
>> familiar with i915 internal details.  Therefore it should use
>> common terminology in the field and in similar tools, instead of
>> I3As (Intel 3-letter Acronyms).
>>
>> For example:
>>   - rcs -> 3D render
>>   - bcs -> blitter
>>   - vecs -> video
>>   - vcs -> video decode
>> etc.
> 
> Done. And I am open to bike-shedding of the names and display format for 
> instance reporting.

New names look fine to me!


>> Old tool showed also GPU system memory interface (GAM) busyness.
>> That was valuable info, and reasonably accurate for stable loads.
>>
>> Could this tool show also either that information (preferred), or
>> bandwidth utilized by GPU/CPU/display?
>>
>> (Latest kernels offer GPU memory bandwidth usage through perf
>> "uncore_imc" "data_reads" & "data_writes" counters.)
> 
> Excellent suggestion and I've added IMC data_reads and data_writes to 
> the tool.

Thanks, it looks fine too.  I'm just wondering about the numbers
it's reporting on SKL GT2...

AFAIK IMC counters are for uncore, so I thought that they should
correspond to GTI (memory interface to outside of GPU) read and
write HW counter values.  While it seemed in some cases quite close,
in some cases it showed a much smaller (2/3) value than expected.

I can understand why reads are sometimes larger, because I think
uncore will include also display engine display content reads.

However, I don't see how uncore writes could be considerably smaller
than the GTI interface write amount.

(GTI interface reports the expected value which corresponds directly
to what my test application is doing (64x blended FullHD layer writes).)

Idle machine read amounts are also much smaller (60-65MB/s) than what
I think display update read should be (1920*1080*4*60Hz = 475MiB/s).

Any ideas for these two discrepancies?


>> Is "wait" value supposed to be IO-wait for given engine interface?
>>
>> I never saw that change from 0%, although IO-wait in top jumped
>> from 0 to 20-30% with my test GPU load.
> 
> No, that is time spent in MI_WAIT_FOR_EVENT.

Could you add that info to the UI?

E.g. just have "MI" on top of the "wait" column.


 > I think it is not used much in the current codebase.

What are you using to validate that it reports the correct value?


>> HW specific test results
>> ------------------------
>>
>> BYT:
>> * Reports "Failed to initialize PMU!" although old intel_gpu_top
>>    works fine.
>>
>> HSW GT2,  BDW GT3, SKL GT2 and KBL GT3e seems to work fine except
>> for the "wait" value.
>>
>> I never saw the blitter engine do anything, but that's because
>> modesetting uses just 3D pipeline, and because I couldn't get
>> Intel DDX to work with rest of latest git version of X / 3D stack.
> 
> Thank you for testing this so thoroughly - this was really invaluable 
> since I don't have access to such a number of platforms. I've tried to 
> fix all this in the latest version.

Machines are currently running tests, I'll check these tomorrow.


>> Kernel version support
>> ----------------------
>>
>> My HW specific testing above was with drm-tip kernel, but I did one test
>> also with Ubuntu 16.04 v4.4 kernel (which includes v4.6 or v4.8 i915 
>> backport) on KBL.  For that, the tool reported:
>> "Failed to detect engines!"
>>
>> Although the previous intel_gpu_top works fine with that kernel version.
>>
>> Same happens also with Ubuntu 17.04 v4.13 kernel.
>>
>>
>> -> If new version needs a certain kernel version, it should tell
>>     which version is required.
> 
> Yep, at least 4.16 is needed so I have added this info to the error 
> message.

IMHO the message is a bit ambiguous:
	Failed to detect engines! Kernel 4.16 or newer?

I would suggest checking whether kernel is new enough, and if not:
	Kernel X.YY detected, 4.16 or newer required.


	- Eero

> Thanks again for testing it and when you find the time if you could do 
> it once more with the latest version (on the problematic platforms) that 
> would be much appreciated.
> 
> Regards,
> 
> Tvrtko
> 
>>
>>
>>      - Eero
>>
>> On 29.03.2018 13:33, Tvrtko Ursulin wrote:
>>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>
>>> intel-gpu-top is a dangerous tool which can hang machines due to unsafe 
>>> mmio
>>> register access. This patch rewrites it to use only PMU.
>>>
>>> Only overall command streamer busyness and GPU global data such as power
>>> and frequencies are included in this new version.
>>>
>>> For access to more GPU functional unit level data, an OA metric based 
>>> tool
>>> like gpu-top should be used instead.
>>>
>>> v2:
>>>   * Sort engines by class and instance.
>>>   * Do not wait for one sampling period to display something on screen.
>>>   * Move code out of the asserts. (Rinat Ibragimov)
>>>   * Continuously adapt to terminal size. (Rinat Ibragimov)
>>>
>>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>>> Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
>>> Cc: Petri Latvala <petri.latvala@intel.com>
>>> Cc: Eero Tamminen <eero.t.tamminen@intel.com>
>>> Cc: Rinat Ibragimov <ibragimovrinat@mail.ru>
>>> Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> # v1
>>> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> # v0.5
>>> ---
>>>   tools/Makefile.am     |    2 +
>>>   tools/intel_gpu_top.c | 1009 
>>> +++++++++++++++++++++----------------------------
>>>   tools/meson.build     |    6 +-
>>>   3 files changed, 441 insertions(+), 576 deletions(-)
>>>
>>> diff --git a/tools/Makefile.am b/tools/Makefile.am
>>> index 09b6dbcc3ece..a0b016ddd7ff 100644
>>> --- a/tools/Makefile.am
>>> +++ b/tools/Makefile.am
>>> @@ -28,6 +28,8 @@ intel_aubdump_la_LDFLAGS = -module -avoid-version 
>>> -no-undefined
>>>   intel_aubdump_la_SOURCES = aubdump.c
>>>   intel_aubdump_la_LIBADD = $(top_builddir)/lib/libintel_tools.la -ldl
>>> +intel_gpu_top_LDADD = $(top_builddir)/lib/libigt_perf.la
>>> +
>>>   bin_SCRIPTS = intel_aubdump
>>>   CLEANFILES = $(bin_SCRIPTS)
>>> diff --git a/tools/intel_gpu_top.c b/tools/intel_gpu_top.c
>>> index 098e6ce3ff86..94091d97c4a3 100644
>>> --- a/tools/intel_gpu_top.c
>>> +++ b/tools/intel_gpu_top.c
>>> @@ -1,6 +1,5 @@
>>>   /*
>>> - * Copyright © 2007 Intel Corporation
>>> - * Copyright © 2011 Intel Corporation
>>> + * Copyright © 2018 Intel Corporation
>>>    *
>>>    * Permission is hereby granted, free of charge, to any person 
>>> obtaining a
>>>    * copy of this software and associated documentation files (the 
>>> "Software"),
>>> @@ -18,701 +17,561 @@
>>>    * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO 
>>> EVENT SHALL
>>>    * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 
>>> DAMAGES OR OTHER
>>>    * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 
>>> ARISING
>>> - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
>>> - * DEALINGS IN THE SOFTWARE.
>>> - *
>>> - * Authors:
>>> - *    Eric Anholt <eric@anholt.net>
>>> - *    Eugeni Dodonov <eugeni.dodonov@intel.com>
>>> - *
>>> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
>>> OTHER DEALINGS
>>> + * IN THE SOFTWARE.
>>>    */
>>> -#include "config.h"
>>> -
>>> -#include <inttypes.h>
>>> -#include <unistd.h>
>>> -#include <stdlib.h>
>>>   #include <stdio.h>
>>> -#include <err.h>
>>> -#include <sys/ioctl.h>
>>> -#include <sys/time.h>
>>> -#include <sys/wait.h>
>>> +#include <sys/types.h>
>>> +#include <dirent.h>
>>> +#include <stdint.h>
>>> +#include <assert.h>
>>>   #include <string.h>
>>> -#ifdef HAVE_TERMIOS_H
>>> -#include <termios.h>
>>> -#endif
>>> -#include "intel_io.h"
>>> -#include "instdone.h"
>>> -#include "intel_reg.h"
>>> -#include "intel_chipset.h"
>>> -#include "drmtest.h"
>>> -
>>> -#define  FORCEWAKE        0xA18C
>>> -#define  FORCEWAKE_ACK        0x130090
>>> -
>>> -#define SAMPLES_PER_SEC             10000
>>> -#define SAMPLES_TO_PERCENT_RATIO    (SAMPLES_PER_SEC / 100)
>>> -
>>> -#define MAX_NUM_TOP_BITS            100
>>> -
>>> -#define HAS_STATS_REGS(devid)        IS_965(devid)
>>> -
>>> -struct top_bit {
>>> -    struct instdone_bit *bit;
>>> -    int count;
>>> -} top_bits[MAX_NUM_TOP_BITS];
>>> -struct top_bit *top_bits_sorted[MAX_NUM_TOP_BITS];
>>> -
>>> -static uint32_t instdone, instdone1;
>>> -
>>> -static const char *bars[] = {
>>> -    " ",
>>> -    "▏",
>>> -    "▎",
>>> -    "▍",
>>> -    "▌",
>>> -    "▋",
>>> -    "▊",
>>> -    "▉",
>>> -    "█"
>>> -};
>>> +#include <ctype.h>
>>> +#include <stdlib.h>
>>> +#include <unistd.h>
>>> +#include <sys/stat.h>
>>> +#include <fcntl.h>
>>> +#include <inttypes.h>
>>> +#include <sys/ioctl.h>
>>> +#include <errno.h>
>>> +#include <math.h>
>>> +#include <locale.h>
>>> +
>>> +#include "igt_perf.h"
>>> -enum stats_counts {
>>> -    IA_VERTICES,
>>> -    IA_PRIMITIVES,
>>> -    VS_INVOCATION,
>>> -    GS_INVOCATION,
>>> -    GS_PRIMITIVES,
>>> -    CL_INVOCATION,
>>> -    CL_PRIMITIVES,
>>> -    PS_INVOCATION,
>>> -    PS_DEPTH,
>>> -    STATS_COUNT
>>> +struct pmu_pair {
>>> +    uint64_t cur;
>>> +    uint64_t prev;
>>>   };
>>> -const uint32_t stats_regs[STATS_COUNT] = {
>>> -    IA_VERTICES_COUNT_QW,
>>> -    IA_PRIMITIVES_COUNT_QW,
>>> -    VS_INVOCATION_COUNT_QW,
>>> -    GS_INVOCATION_COUNT_QW,
>>> -    GS_PRIMITIVES_COUNT_QW,
>>> -    CL_INVOCATION_COUNT_QW,
>>> -    CL_PRIMITIVES_COUNT_QW,
>>> -    PS_INVOCATION_COUNT_QW,
>>> -    PS_DEPTH_COUNT_QW,
>>> +struct pmu_counter {
>>> +    uint64_t config;
>>> +    unsigned int idx;
>>> +    struct pmu_pair val;
>>>   };
>>> -const char *stats_reg_names[STATS_COUNT] = {
>>> -    "vert fetch",
>>> -    "prim fetch",
>>> -    "VS invocations",
>>> -    "GS invocations",
>>> -    "GS prims",
>>> -    "CL invocations",
>>> -    "CL prims",
>>> -    "PS invocations",
>>> -    "PS depth pass",
>>> +struct engine {
>>> +    const char *name;
>>> +    struct pmu_counter busy;
>>> +    struct pmu_counter wait;
>>> +    struct pmu_counter sema;
>>>   };
>>> -uint64_t stats[STATS_COUNT];
>>> -uint64_t last_stats[STATS_COUNT];
>>> +struct engines {
>>> +    unsigned int num_engines;
>>> +    unsigned int num_counters;
>>> +    DIR *root;
>>> +    int fd;
>>> +    struct pmu_pair ts;
>>> -static unsigned long
>>> -gettime(void)
>>> -{
>>> -    struct timeval t;
>>> -    gettimeofday(&t, NULL);
>>> -    return (t.tv_usec + (t.tv_sec * 1000000));
>>> -}
>>> +    int rapl_fd;
>>> +    double rapl_scale;
>>> -static int
>>> -top_bits_sort(const void *a, const void *b)
>>> +    struct pmu_counter freq_req;
>>> +    struct pmu_counter freq_act;
>>> +    struct pmu_counter irq;
>>> +    struct pmu_counter rc6;
>>> +    struct pmu_counter rapl;
>>> +
>>> +    struct engine engine;
>>> +};
>>> +
>>> +static uint64_t
>>> +get_pmu_config(int dirfd, const char *name, const char *counter)
>>>   {
>>> -    struct top_bit * const *bit_a = a;
>>> -    struct top_bit * const *bit_b = b;
>>> -    int a_count = (*bit_a)->count;
>>> -    int b_count = (*bit_b)->count;
>>> +    char buf[128], *p;
>>> +    int fd, ret;
>>> -    if (a_count < b_count)
>>> -        return 1;
>>> -    else if (a_count == b_count)
>>> -        return 0;
>>> -    else
>>> +    ret = snprintf(buf, sizeof(buf), "%s-%s", name, counter);
>>> +    if (ret < 0 || ret == sizeof(buf))
>>>           return -1;
>>> -}
>>> -static void
>>> -update_idle_bit(struct top_bit *top_bit)
>>> -{
>>> -    uint32_t reg_val;
>>> +    fd = openat(dirfd, buf, O_RDONLY);
>>> +    if (fd < 0)
>>> +        return -1;
>>> -    if (top_bit->bit->reg == INSTDONE_1)
>>> -        reg_val = instdone1;
>>> -    else
>>> -        reg_val = instdone;
>>> +    ret = read(fd, buf, sizeof(buf));
>>> +    close(fd);
>>> +    if (ret <= 0)
>>> +        return -1;
>>> +
>>> +    p = index(buf, '0');
>>> +    if (!p)
>>> +        return -1;
>>> -    if ((reg_val & top_bit->bit->bit) == 0)
>>> -        top_bit->count++;
>>> +    return strtoul(p, NULL, 0);
>>>   }
>>> -static void
>>> -print_clock(const char *name, int clock) {
>>> -    if (clock == -1)
>>> -        printf("%s clock: unknown", name);
>>> +#define engine_ptr(engines, n) \
>>> +    ((struct engine *)((unsigned char *)(&engines->engine) + (n) * 
>>> sizeof(struct engine)))
>>> +
>>> +static int engine_cmp(const void *__a, const void *__b)
>>> +{
>>> +    const struct engine *a = (struct engine *)__a;
>>> +    const struct engine *b = (struct engine *)__b;
>>> +    int class_a = (a->busy.config & (__I915_PMU_OTHER(0) - 1)) >>
>>> +              I915_PMU_CLASS_SHIFT;
>>> +    int class_b = (b->busy.config & (__I915_PMU_OTHER(0) - 1)) >>
>>> +              I915_PMU_CLASS_SHIFT;
>>> +    int instance_a = (a->busy.config >> I915_PMU_SAMPLE_BITS) &
>>> +             ((1 << I915_PMU_SAMPLE_INSTANCE_BITS) - 1);
>>> +    int instance_b = (b->busy.config >> I915_PMU_SAMPLE_BITS) &
>>> +             ((1 << I915_PMU_SAMPLE_INSTANCE_BITS) - 1);
>>> +
>>> +    if (class_a != class_b)
>>> +        return class_a - class_b;
>>>       else
>>> -        printf("%s clock: %d Mhz", name, clock);
>>> +        return instance_a - instance_b;
>>>   }
>>> -static int
>>> -print_clock_info(struct pci_device *pci_dev)
>>> +static struct engines *discover_engines(void)
>>>   {
>>> -    uint32_t devid = pci_dev->device_id;
>>> -    uint16_t gcfgc;
>>> +    const char *sysfs_root = "/sys/devices/i915/events";
>>> +    struct engines *engines;
>>> +    struct dirent *dent;
>>> +    int ret = 0;
>>> +    DIR *d;
>>> -    if (IS_GM45(devid)) {
>>> -        int core_clock = -1;
>>> +    engines = malloc(sizeof(struct engines));
>>> +    if (!engines)
>>> +        return NULL;
>>> -        pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
>>> +    memset(engines, 0, sizeof(*engines));
>>> -        switch (gcfgc & 0xf) {
>>> -        case 8:
>>> -            core_clock = 266;
>>> -            break;
>>> -        case 9:
>>> -            core_clock = 320;
>>> -            break;
>>> -        case 11:
>>> -            core_clock = 400;
>>> -            break;
>>> -        case 13:
>>> -            core_clock = 533;
>>> -            break;
>>> -        }
>>> -        print_clock("core", core_clock);
>>> -    } else if (IS_965(devid) && IS_MOBILE(devid)) {
>>> -        int render_clock = -1, sampler_clock = -1;
>>> +    engines->num_engines = 0;
>>> -        pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
>>> +    d = opendir(sysfs_root);
>>> +    if (!d)
>>> +        return NULL;
>>> -        switch (gcfgc & 0xf) {
>>> -        case 2:
>>> -            render_clock = 250; sampler_clock = 267;
>>> -            break;
>>> -        case 3:
>>> -            render_clock = 320; sampler_clock = 333;
>>> -            break;
>>> -        case 4:
>>> -            render_clock = 400; sampler_clock = 444;
>>> -            break;
>>> -        case 5:
>>> -            render_clock = 500; sampler_clock = 533;
>>> -            break;
>>> -        }
>>> -
>>> -        print_clock("render", render_clock);
>>> -        printf("  ");
>>> -        print_clock("sampler", sampler_clock);
>>> -    } else if (IS_945(devid) && IS_MOBILE(devid)) {
>>> -        int render_clock = -1, display_clock = -1;
>>> +    while ((dent = readdir(d)) != NULL) {
>>> +        const char *endswith = "-busy";
>>> +        const unsigned int endlen = strlen(endswith);
>>> +        struct engine *engine =
>>> +                engine_ptr(engines, engines->num_engines);
>>> +        char buf[256];
>>> -        pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
>>> +        if (dent->d_type != DT_REG)
>>> +            continue;
>>> -        switch (gcfgc & 0x7) {
>>> -        case 0:
>>> -            render_clock = 166;
>>> -            break;
>>> -        case 1:
>>> -            render_clock = 200;
>>> -            break;
>>> -        case 3:
>>> -            render_clock = 250;
>>> -            break;
>>> -        case 5:
>>> -            render_clock = 400;
>>> +        if (strlen(dent->d_name) >= sizeof(buf)) {
>>> +            ret = -1;
>>>               break;
>>>           }
>>> -        switch (gcfgc & 0x70) {
>>> -        case 0:
>>> -            display_clock = 200;
>>> -            break;
>>> -        case 4:
>>> -            display_clock = 320;
>>> -            break;
>>> -        }
>>> -        if (gcfgc & (1 << 7))
>>> -            display_clock = 133;
>>> +        strcpy(buf, dent->d_name);
>>> -        print_clock("render", render_clock);
>>> -        printf("  ");
>>> -        print_clock("display", display_clock);
>>> -    } else if (IS_915(devid) && IS_MOBILE(devid)) {
>>> -        int render_clock = -1, display_clock = -1;
>>> +        /* xxxN-busy */
>>> +        if (strlen(buf) < (endlen + 4))
>>> +            continue;
>>> +        if (strcmp(&buf[strlen(buf) - endlen], endswith))
>>> +            continue;
>>> -        pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
>>> +        memset(engine, 0, sizeof(*engine));
>>> -        switch (gcfgc & 0x7) {
>>> -        case 0:
>>> -            render_clock = 160;
>>> -            break;
>>> -        case 1:
>>> -            render_clock = 190;
>>> -            break;
>>> -        case 4:
>>> -            render_clock = 333;
>>> +        buf[strlen(buf) - endlen] = 0;
>>> +        engine->name = strdup(buf);
>>> +        if (!engine->name) {
>>> +            ret = -1;
>>>               break;
>>>           }
>>> -        if (gcfgc & (1 << 13))
>>> -            render_clock = 133;
>>> -        switch (gcfgc & 0x70) {
>>> -        case 0:
>>> -            display_clock = 190;
>>> +        engine->busy.config = get_pmu_config(dirfd(d), engine->name,
>>> +                             "busy");
>>> +        if (engine->busy.config == -1) {
>>> +            ret = -1;
>>>               break;
>>> -        case 4:
>>> -            display_clock = 333;
>>> +        }
>>> +
>>> +        engines->num_engines++;
>>> +        engines = realloc(engines, sizeof(struct engines) +
>>> +                  engines->num_engines * sizeof(struct engine));
>>> +        if (!engines) {
>>> +            ret = -ENOMEM;
>>>               break;
>>>           }
>>> -        if (gcfgc & (1 << 7))
>>> -            display_clock = 133;
>>> +    }
>>> +
>>> +    if (ret)
>>> +        free(engines);
>>> +    else {
>>> +        qsort(engine_ptr(engines, 0), engines->num_engines,
>>> +              sizeof(struct engine), engine_cmp);
>>> -        print_clock("render", render_clock);
>>> -        printf("  ");
>>> -        print_clock("display", display_clock);
>>> +        engines->root = d;
>>>       }
>>> +    return ret == 0 ? engines : NULL;
>>> +}
>>> +
>>> +static int
>>> +filename_to_buf(const char *filename, char *buf, unsigned int bufsize)
>>> +{
>>> +    int fd;
>>> +    ssize_t ret;
>>> +
>>> +    fd = open(filename, O_RDONLY);
>>> +    if (fd < 0)
>>> +        return -1;
>>> +
>>> +    ret = read(fd, buf, bufsize - 1);
>>> +    close(fd);
>>> +    if (ret < 1)
>>> +        return -1;
>>> +
>>> +    buf[ret] = '\0';
>>> -    printf("\n");
>>> -    return -1;
>>> +    return 0;
>>>   }
>>> -#define STATS_LEN (20)
>>> -#define PERCENTAGE_BAR_END    (79 - STATS_LEN)
>>> +static uint64_t filename_to_u64(const char *filename, int base)
>>> +{
>>> +    char buf[64], *b;
>>> -static void
>>> -print_percentage_bar(float percent, int cur_line_len)
>>> +    if (filename_to_buf(filename, buf, sizeof(buf)))
>>> +        return 0;
>>> +
>>> +    /*
>>> +     * Handle both single integer and key=value formats by skipping
>>> +     * leading non-digits.
>>> +     */
>>> +    b = buf;
>>> +    while (*b && !isdigit(*b))
>>> +        b++;
>>> +
>>> +    return strtoull(b, NULL, base);
>>> +}
>>> +
>>> +static uint64_t rapl_type_id(void)
>>>   {
>>> -    int bar_avail_len = (PERCENTAGE_BAR_END - cur_line_len - 1) * 8;
>>> -    int bar_len = bar_avail_len * (percent + .5) / 100.0;
>>> -    int i;
>>> +    return filename_to_u64("/sys/devices/power/type", 10);
>>> +}
>>> -    for (i = bar_len; i >= 8; i -= 8) {
>>> -        printf("%s", bars[8]);
>>> -        cur_line_len++;
>>> -    }
>>> -    if (i) {
>>> -        printf("%s", bars[i]);
>>> -        cur_line_len++;
>>> -    }
>>> +static uint64_t rapl_gpu_power(void)
>>> +{
>>> +    return filename_to_u64("/sys/devices/power/events/energy-gpu", 0);
>>> +}
>>> +
>>> +static double filename_to_double(const char *filename)
>>> +{
>>> +    char *oldlocale;
>>> +    char buf[80];
>>> +    double v;
>>> +
>>> +    if (filename_to_buf(filename, buf, sizeof(buf)))
>>> +        return 0;
>>> -    /* NB: We can't use a field width with utf8 so we manually
>>> -    * guarantee a field with of 45 chars for any bar. */
>>> -    printf("%*s", PERCENTAGE_BAR_END - cur_line_len, "");
>>> +    oldlocale = setlocale(LC_ALL, "C");
>>> +    v = strtod(buf, NULL);
>>> +    setlocale(LC_ALL, oldlocale);
>>> +
>>> +    return v;
>>>   }
>>> -struct ring {
>>> -    const char *name;
>>> -    uint32_t mmio;
>>> -    int head, tail, size;
>>> -    uint64_t full;
>>> -    int idle;
>>> -};
>>> +static double rapl_gpu_power_scale(void)
>>> +{
>>> +    return 
>>> filename_to_double("/sys/devices/power/events/energy-gpu.scale");
>>> +}
>>> -static uint32_t ring_read(struct ring *ring, uint32_t reg)
>>> +#define __open_pmu(engines, pmu, idx) \
>>> +({ \
>>> +    int fd__; \
>>> +\
>>> +    fd__ = perf_i915_open_group((pmu)->config, (engines)->fd); \
>>> +    if (fd__ >= 0) { \
>>> +        if ((engines)->fd == -1) \
>>> +            (engines)->fd = fd__; \
>>> +        (pmu)->idx = (idx)++; \
>>> +        (engines)->num_counters++; \
>>> +    } \
>>> +\
>>> +    fd__; \
>>> +})
>>> +
>>> +static int pmu_init(struct engines *engines)
>>>   {
>>> -    return INREG(ring->mmio + reg);
>>> +    unsigned int idx = 0;
>>> +    unsigned int i;
>>> +    int fd;
>>> +
>>> +    engines->fd = -1;
>>> +    engines->num_counters = 0;
>>> +
>>> +    engines->freq_req.config = I915_PMU_REQUESTED_FREQUENCY;
>>> +    fd = __open_pmu(engines, &engines->freq_req, idx);
>>> +    if (fd < 0)
>>> +        return -1;
>>> +
>>> +    engines->freq_act.config = I915_PMU_ACTUAL_FREQUENCY;
>>> +    fd = __open_pmu(engines, &engines->freq_act, idx);
>>> +    if (fd < 0)
>>> +        return -1;
>>> +
>>> +    engines->irq.config = I915_PMU_INTERRUPTS;
>>> +    fd = __open_pmu(engines, &engines->irq, idx);
>>> +    if (fd < 0)
>>> +        return -1;
>>> +
>>> +    engines->rc6.config = I915_PMU_RC6_RESIDENCY;
>>> +    fd = __open_pmu(engines, &engines->rc6, idx);
>>> +    if (fd < 0)
>>> +        return -1;
>>> +
>>> +    for (i = 0; i < engines->num_engines; i++) {
>>> +        struct engine *engine = engine_ptr(engines, i);
>>> +        struct {
>>> +            struct pmu_counter *pmu;
>>> +            const char *counter;
>>> +        } *cnt, counters[] = {
>>> +            { .pmu = &engine->busy, .counter = "busy" },
>>> +            { .pmu = &engine->wait, .counter = "wait" },
>>> +            { .pmu = &engine->sema, .counter = "sema" },
>>> +            { .pmu = NULL, .counter = NULL },
>>> +        };
>>> +
>>> +        for (cnt = counters; cnt->pmu; cnt++) {
>>> +            if (!cnt->pmu->config)
>>> +                cnt->pmu->config =
>>> +                    get_pmu_config(dirfd(engines->root),
>>> +                               engine->name,
>>> +                               cnt->counter);
>>> +            fd = __open_pmu(engines, cnt->pmu, idx);
>>> +            if (fd < 0)
>>> +                return -1;
>>> +        }
>>> +    }
>>> +
>>> +    engines->rapl_scale = rapl_gpu_power_scale();
>>> +    if (engines->rapl_scale != NAN)
>>> +        engines->rapl_scale *= 1e3; /* from nano to micro */
>>> +    engines->rapl.config = rapl_gpu_power();
>>> +    engines->rapl_fd = igt_perf_open(rapl_type_id(), 
>>> engines->rapl.config);
>>> +    if (engines->rapl_fd < 0)
>>> +        return -1;
>>> +
>>> +    return 0;
>>>   }
>>> -static void ring_init(struct ring *ring)
>>> +static uint64_t pmu_read_multi(int fd, unsigned int num, uint64_t *val)
>>>   {
>>> -    ring->size = (((ring_read(ring, RING_LEN) & RING_NR_PAGES) >> 
>>> 12) + 1) * 4096;
>>> +    uint64_t buf[2 + num];
>>> +    unsigned int i;
>>> +    ssize_t len;
>>> +
>>> +    memset(buf, 0, sizeof(buf));
>>> +
>>> +    len = read(fd, buf, sizeof(buf));
>>> +    assert(len == sizeof(buf));
>>> +
>>> +    for (i = 0; i < num; i++)
>>> +        val[i] = buf[2 + i];
>>> +
>>> +    return buf[1];
>>>   }
>>> -static void ring_reset(struct ring *ring)
>>> +static double pmu_calc(struct pmu_pair *p, double d, double t, 
>>> double s)
>>>   {
>>> -    ring->idle = ring->full = 0;
>>> +    double pct;
>>> +
>>> +    pct = p->cur - p->prev;
>>> +    pct /= d;
>>> +    pct /= t;
>>> +    pct *= s;
>>> +
>>> +    if (s == 100.0 && pct > 100.0)
>>> +        pct = 100.0;
>>> +
>>> +    return pct;
>>>   }
>>> -static void ring_sample(struct ring *ring)
>>> +static uint64_t __pmu_read_single(int fd, uint64_t *ts)
>>>   {
>>> -    int full;
>>> +    uint64_t data[2] = { };
>>> +    ssize_t len;
>>> -    if (!ring->size)
>>> -        return;
>>> +    len = read(fd, data, sizeof(data));
>>> +    assert(len == sizeof(data));
>>> -    ring->head = ring_read(ring, RING_HEAD) & HEAD_ADDR;
>>> -    ring->tail = ring_read(ring, RING_TAIL) & TAIL_ADDR;
>>> +    if (ts)
>>> +        *ts = data[1];
>>> +
>>> +    return data[0];
>>> +}
>>> -    if (ring->tail == ring->head)
>>> -        ring->idle++;
>>> +static uint64_t pmu_read_single(int fd)
>>> +{
>>> +    return __pmu_read_single(fd, NULL);
>>> +}
>>> -    full = ring->tail - ring->head;
>>> -    if (full < 0)
>>> -        full += ring->size;
>>> -    ring->full += full;
>>> +static void __update_sample(struct pmu_counter *counter, uint64_t val)
>>> +{
>>> +    counter->val.prev = counter->val.cur;
>>> +    counter->val.cur = val;
>>>   }
>>> -static void ring_print_header(FILE *out, struct ring *ring)
>>> +static void update_sample(struct pmu_counter *counter, uint64_t *val)
>>>   {
>>> -    fprintf(out, "%.6s%%\tops\t",
>>> -            ring->name
>>> -          );
>>> +    __update_sample(counter, val[counter->idx]);
>>>   }
>>> -static void ring_print(struct ring *ring, unsigned long 
>>> samples_per_sec)
>>> +static void pmu_sample(struct engines *engines)
>>>   {
>>> -    int percent_busy, len;
>>> +    const int num_val = engines->num_counters;
>>> +    uint64_t val[num_val];
>>> +    unsigned int i;
>>> +
>>> +    engines->ts.prev = engines->ts.cur;
>>> +    engines->ts.cur = pmu_read_multi(engines->fd, num_val, val);
>>> +
>>> +    __update_sample(&engines->rapl, pmu_read_single(engines->rapl_fd));
>>> -    if (!ring->size)
>>> -        return;
>>> +    update_sample(&engines->freq_req, val);
>>> +    update_sample(&engines->freq_act, val);
>>> +    update_sample(&engines->irq, val);
>>> +    update_sample(&engines->rc6, val);
>>> -    percent_busy = 100 - 100 * ring->idle / samples_per_sec;
>>> +    for (i = 0; i < engines->num_engines; i++) {
>>> +        struct engine *engine = engine_ptr(engines, i);
>>> -    len = printf("%25s busy: %3d%%: ", ring->name, percent_busy);
>>> -    print_percentage_bar (percent_busy, len);
>>> -    printf("%24s space: %d/%d\n",
>>> -           ring->name,
>>> -           (int)(ring->full / samples_per_sec),
>>> -           ring->size);
>>> +        update_sample(&engine->busy, val);
>>> +        update_sample(&engine->sema, val);
>>> +        update_sample(&engine->wait, val);
>>> +    }
>>>   }
>>> -static void ring_log(struct ring *ring, unsigned long samples_per_sec,
>>> -        FILE *output)
>>> +static const char *bars[] = { " ", "▏", "▎", "▍", "▌", "▋", "▊", 
>>> "▉", "█" };
>>> +
>>> +static void
>>> +print_percentage_bar(double percent, int max_len)
>>>   {
>>> -    if (ring->size)
>>> -        fprintf(output, "%3d\t%d\t",
>>> -            (int)(100 - 100 * ring->idle / samples_per_sec),
>>> -            (int)(ring->full / samples_per_sec));
>>> -    else
>>> -        fprintf(output, "-1\t-1\t");
>>> +    int bar_len = percent * (8 * (max_len - 2)) / 100.0;
>>> +    int i;
>>> +
>>> +    putchar('|');
>>> +
>>> +    for (i = bar_len; i >= 8; i -= 8)
>>> +        printf("%s", bars[8]);
>>> +    if (i)
>>> +        printf("%s", bars[i]);
>>> +
>>> +    for (i = 0; i < (max_len - 2 - (bar_len + 7) / 8); i++)
>>> +        putchar(' ');
>>> +
>>> +    putchar('|');
>>>   }
>>> +#define DEFAULT_PERIOD_MS (1000)
>>> +
>>>   static void
>>>   usage(const char *appname)
>>>   {
>>>       printf("intel_gpu_top - Display a top-like summary of Intel GPU 
>>> usage\n"
>>> -            "\n"
>>> -            "usage: %s [parameters]\n"
>>> -            "\n"
>>> -            "The following parameters apply:\n"
>>> -            "[-s <samples>]       samples per seconds (default %d)\n"
>>> -            "[-e <command>]       command to profile\n"
>>> -            "[-o <file>]          output statistics to file. If file 
>>> is '-',"
>>> -            "                     run in batch mode and output 
>>> statistics to stdio only \n"
>>> -            "[-h]                 show this help screen\n"
>>> -            "\n",
>>> -            appname,
>>> -            SAMPLES_PER_SEC
>>> -          );
>>> -    return;
>>> +        "\n"
>>> +        "Usage: %s [parameters]\n"
>>> +        "\n"
>>> +        "\tThe following parameters are optional:\n"
>>> +        "\t[-s <samples>]       refresh period in ms (default %ums)\n"
>>> +        "\t[-h]                 show this help text\n"
>>> +        "\n",
>>> +        appname, DEFAULT_PERIOD_MS);
>>>   }
>>>   int main(int argc, char **argv)
>>>   {
>>> -    uint32_t devid;
>>> -    struct pci_device *pci_dev;
>>> -    struct ring render_ring = {
>>> -        .name = "render",
>>> -        .mmio = 0x2030,
>>> -    }, bsd_ring = {
>>> -        .name = "bitstream",
>>> -        .mmio = 0x4030,
>>> -    }, bsd6_ring = {
>>> -        .name = "bitstream",
>>> -        .mmio = 0x12030,
>>> -    }, blt_ring = {
>>> -        .name = "blitter",
>>> -        .mmio = 0x22030,
>>> -    };
>>> -    int i, ch;
>>> -    int samples_per_sec = SAMPLES_PER_SEC;
>>> -    FILE *output = NULL;
>>> -    double elapsed_time=0;
>>> -    int print_headers=1;
>>> -    pid_t child_pid=-1;
>>> -    int child_stat;
>>> -    char *cmd=NULL;
>>> -    int interactive=1;
>>> -
>>> -    /* Parse options? */
>>> -    while ((ch = getopt(argc, argv, "s:o:e:h")) != -1) {
>>> +    unsigned int period_us = DEFAULT_PERIOD_MS * 1000;
>>> +    int con_w = -1, con_h = -1;
>>> +    struct engines *engines;
>>> +    unsigned int i;
>>> +    int ret, ch;
>>> +
>>> +    /* Parse options */
>>> +    while ((ch = getopt(argc, argv, "s:h")) != -1) {
>>>           switch (ch) {
>>> -        case 'e': cmd = strdup(optarg);
>>> -            break;
>>> -        case 's': samples_per_sec = atoi(optarg);
>>> -            if (samples_per_sec < 100) {
>>> -                fprintf(stderr, "Error: samples per second must be 
>>> >= 100\n");
>>> -                exit(1);
>>> -            }
>>> -            break;
>>> -        case 'o':
>>> -            if (!strcmp(optarg, "-")) {
>>> -                /* Running in non-interactive mode */
>>> -                interactive = 0;
>>> -                output = stdout;
>>> -            }
>>> -            else
>>> -                output = fopen(optarg, "w");
>>> -            if (!output)
>>> -            {
>>> -                perror("fopen");
>>> -                exit(1);
>>> -            }
>>> +        case 's':
>>> +            period_us = atoi(optarg) * 1000;
>>>               break;
>>>           case 'h':
>>>               usage(argv[0]);
>>>               exit(0);
>>> -            break;
>>>           default:
>>> -            fprintf(stderr, "Invalid flag %c!\n", (char)optopt);
>>> +            fprintf(stderr, "Invalid option %c!\n", (char)optopt);
>>>               usage(argv[0]);
>>>               exit(1);
>>> -            break;
>>>           }
>>>       }
>>> -    pci_dev = intel_get_pci_device();
>>> -    devid = pci_dev->device_id;
>>> -    intel_mmio_use_pci_bar(pci_dev);
>>> -    init_instdone_definitions(devid);
>>> -
>>> -    /* Do we have a command to run? */
>>> -    if (cmd != NULL) {
>>> -        if (output) {
>>> -            fprintf(output, "# Profiling: %s\n", cmd);
>>> -            fflush(output);
>>> -        }
>>> -        child_pid = fork();
>>> -        if (child_pid < 0) {
>>> -            perror("fork");
>>> -            exit(1);
>>> -        }
>>> -        else if (child_pid == 0) {
>>> -            int res;
>>> -            res = system(cmd);
>>> -            if (res < 0)
>>> -                perror("running command");
>>> -            if (output) {
>>> -                fflush(output);
>>> -                fprintf(output, "# %s exited with status %d\n", cmd, 
>>> res);
>>> -                fflush(output);
>>> -            }
>>> -            free(cmd);
>>> -            exit(0);
>>> -        } else {
>>> -            free(cmd);
>>> -        }
>>> +    engines = discover_engines();
>>> +    if (!engines) {
>>> +        fprintf(stderr, "Failed to detect engines!\n");
>>> +        return 1;
>>>       }
>>> -    for (i = 0; i < num_instdone_bits; i++) {
>>> -        top_bits[i].bit = &instdone_bits[i];
>>> -        top_bits[i].count = 0;
>>> -        top_bits_sorted[i] = &top_bits[i];
>>> +    ret = pmu_init(engines);
>>> +    if (ret) {
>>> +        fprintf(stderr, "Failed to initialize PMU!\n");
>>> +        return 1;
>>>       }
>>> -    /* Grab access to the registers */
>>> -    intel_register_access_init(pci_dev, 0, -1);
>>> +    pmu_sample(engines);
>>> -    ring_init(&render_ring);
>>> -    if (IS_GEN4(devid) || IS_GEN5(devid))
>>> -        ring_init(&bsd_ring);
>>> -    if (IS_GEN6(devid) || IS_GEN7(devid)) {
>>> -        ring_init(&bsd6_ring);
>>> -        ring_init(&blt_ring);
>>> -    }
>>> +    for (;;) {
>>> +        double t, freq[2], irq, rc6, power;
>>> +        struct winsize ws;
>>> +        int lines = 0;
>>> -    /* Initialize GPU stats */
>>> -    if (HAS_STATS_REGS(devid)) {
>>> -        for (i = 0; i < STATS_COUNT; i++) {
>>> -            uint32_t stats_high, stats_low, stats_high_2;
>>> +        /* Update terminal size. */
>>> +        if (ioctl(0, TIOCGWINSZ, &ws) != -1) {
>>> +            con_w = ws.ws_col;
>>> +            con_h = ws.ws_row;
>>> +        }
>>> -            do {
>>> -                stats_high = INREG(stats_regs[i] + 4);
>>> -                stats_low = INREG(stats_regs[i]);
>>> -                stats_high_2 = INREG(stats_regs[i] + 4);
>>> -            } while (stats_high != stats_high_2);
>>> +        pmu_sample(engines);
>>> +        t = (double)(engines->ts.cur - engines->ts.prev) / 1e9;
>>> -            last_stats[i] = (uint64_t)stats_high << 32 |
>>> -                stats_low;
>>> -        }
>>> -    }
>>> +        printf("\033[H\033[J");
>>> -    for (;;) {
>>> -        int j;
>>> -        unsigned long long t1, ti, tf, t2;
>>> -        unsigned long long def_sleep = 1000000 / samples_per_sec;
>>> -        unsigned long long last_samples_per_sec = samples_per_sec;
>>> -        unsigned short int max_lines;
>>> -        struct winsize ws;
>>> -        char clear_screen[] = {0x1b, '[', 'H',
>>> -                       0x1b, '[', 'J',
>>> -                       0x0};
>>> -        int percent;
>>> -        int len;
>>> -
>>> -        t1 = gettime();
>>> -
>>> -        ring_reset(&render_ring);
>>> -        ring_reset(&bsd_ring);
>>> -        ring_reset(&bsd6_ring);
>>> -        ring_reset(&blt_ring);
>>> -
>>> -        for (i = 0; i < samples_per_sec; i++) {
>>> -            long long interval;
>>> -            ti = gettime();
>>> -            if (IS_965(devid)) {
>>> -                instdone = INREG(INSTDONE_I965);
>>> -                instdone1 = INREG(INSTDONE_1);
>>> -            } else
>>> -                instdone = INREG(INSTDONE);
>>> -
>>> -            for (j = 0; j < num_instdone_bits; j++)
>>> -                update_idle_bit(&top_bits[j]);
>>> -
>>> -            ring_sample(&render_ring);
>>> -            ring_sample(&bsd_ring);
>>> -            ring_sample(&bsd6_ring);
>>> -            ring_sample(&blt_ring);
>>> -
>>> -            tf = gettime();
>>> -            if (tf - t1 >= 1000000) {
>>> -                /* We are out of sync, bail out */
>>> -                last_samples_per_sec = i+1;
>>> -                break;
>>> -            }
>>> -            interval = def_sleep - (tf - ti);
>>> -            if (interval > 0)
>>> -                usleep(interval);
>>> -        }
>>> +        freq[0] = pmu_calc(&engines->freq_req.val, 1.0, t, 1);
>>> +        freq[1] = pmu_calc(&engines->freq_act.val, 1.0, t, 1);
>>> +        irq = pmu_calc(&engines->irq.val, 1.0, t, 1);
>>> +        rc6 = pmu_calc(&engines->rc6.val, 1e9, t, 100);
>>> +        power = pmu_calc(&engines->rapl.val, 1.0, t,
>>> +                 engines->rapl_scale);
>>> -        if (HAS_STATS_REGS(devid)) {
>>> -            for (i = 0; i < STATS_COUNT; i++) {
>>> -                uint32_t stats_high, stats_low, stats_high_2;
>>> +        printf("intel-gpu-top - %4.0f/%4.0f MHz;  %3.0f%% RC6; 
>>> %6.0fmW; %8.0f irqs/s\n",
>>> +               freq[0], freq[1], rc6, power, irq);
>>> +        lines++;
>>> -                do {
>>> -                    stats_high = INREG(stats_regs[i] + 4);
>>> -                    stats_low = INREG(stats_regs[i]);
>>> -                    stats_high_2 = INREG(stats_regs[i] + 4);
>>> -                } while (stats_high != stats_high_2);
>>> +        printf("\n");
>>> +        lines++;
>>> -                stats[i] = (uint64_t)stats_high << 32 |
>>> -                    stats_low;
>>> -            }
>>> -        }
>>> +        for (i = 0; i < engines->num_engines && lines < con_h; i++) {
>>> +            struct engine *engine = engine_ptr(engines, i);
>>> +            unsigned int max_w = con_w - 1;
>>> +            unsigned int len;
>>> +            double val[2];
>>> +            char buf[128];
>>> -        qsort(top_bits_sorted, num_instdone_bits,
>>> -              sizeof(struct top_bit *), top_bits_sort);
>>> -
>>> -        /* Limit the number of lines printed to the terminal height 
>>> so the
>>> -         * most important info (at the top) will stay on screen. */
>>> -        max_lines = -1;
>>> -        if (ioctl(0, TIOCGWINSZ, &ws) != -1)
>>> -            max_lines = ws.ws_row - 6; /* exclude header lines */
>>> -        if (max_lines >= num_instdone_bits)
>>> -            max_lines = num_instdone_bits;
>>> -
>>> -        t2 = gettime();
>>> -        elapsed_time += (t2 - t1) / 1000000.0;
>>> -
>>> -        if (interactive) {
>>> -            printf("%s", clear_screen);
>>> -            print_clock_info(pci_dev);
>>> -
>>> -            ring_print(&render_ring, last_samples_per_sec);
>>> -            ring_print(&bsd_ring, last_samples_per_sec);
>>> -            ring_print(&bsd6_ring, last_samples_per_sec);
>>> -            ring_print(&blt_ring, last_samples_per_sec);
>>> -
>>> -            printf("\n%30s  %s\n", "task", "percent busy");
>>> -            for (i = 0; i < max_lines; i++) {
>>> -                if (top_bits_sorted[i]->count > 0) {
>>> -                    percent = (top_bits_sorted[i]->count * 100) /
>>> -                        last_samples_per_sec;
>>> -                    len = printf("%30s: %3d%%: ",
>>> -                             top_bits_sorted[i]->bit->name,
>>> -                             percent);
>>> -                    print_percentage_bar (percent, len);
>>> -                } else {
>>> -                    printf("%*s", PERCENTAGE_BAR_END, "");
>>> -                }
>>> -
>>> -                if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
>>> -                    printf("%13s: %llu (%lld/sec)",
>>> -                           stats_reg_names[i],
>>> -                           (long long)stats[i],
>>> -                           (long long)(stats[i] - last_stats[i]));
>>> -                    last_stats[i] = stats[i];
>>> -                } else {
>>> -                    if (!top_bits_sorted[i]->count)
>>> -                        break;
>>> -                }
>>> -                printf("\n");
>>> -            }
>>> -        }
>>> -        if (output) {
>>> -            /* Print headers for columns at first run */
>>> -            if (print_headers) {
>>> -                fprintf(output, "# time\t");
>>> -                ring_print_header(output, &render_ring);
>>> -                ring_print_header(output, &bsd_ring);
>>> -                ring_print_header(output, &bsd6_ring);
>>> -                ring_print_header(output, &blt_ring);
>>> -                for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
>>> -                    if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
>>> -                        fprintf(output, "%.6s\t",
>>> -                               stats_reg_names[i]
>>> -                               );
>>> -                    }
>>> -                    if (!top_bits[i].count)
>>> -                        continue;
>>> -                }
>>> -                fprintf(output, "\n");
>>> -                print_headers = 0;
>>> -            }
>>> -
>>> -            /* Print statistics */
>>> -            fprintf(output, "%.2f\t", elapsed_time);
>>> -            ring_log(&render_ring, last_samples_per_sec, output);
>>> -            ring_log(&bsd_ring, last_samples_per_sec, output);
>>> -            ring_log(&bsd6_ring, last_samples_per_sec, output);
>>> -            ring_log(&blt_ring, last_samples_per_sec, output);
>>> -
>>> -            for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
>>> -                if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
>>> -                    fprintf(output, "%"PRIu64"\t",
>>> -                           stats[i] - last_stats[i]);
>>> -                    last_stats[i] = stats[i];
>>> -                }
>>> -                    if (!top_bits[i].count)
>>> -                        continue;
>>> -            }
>>> -            fprintf(output, "\n");
>>> -            fflush(output);
>>> -        }
>>> +            val[0] = pmu_calc(&engine->wait.val, 1e9, t, 100);
>>> +            val[1] = pmu_calc(&engine->sema.val, 1e9, t, 100);
>>> +            len = snprintf(buf, sizeof(buf),
>>> +                       "%6.2f%% wait, %6.2f%% sema",
>>> +                       val[0], val[1]);
>>> -        for (i = 0; i < num_instdone_bits; i++) {
>>> -            top_bits_sorted[i]->count = 0;
>>> +            val[0] = pmu_calc(&engine->busy.val, 1e9, t, 100);
>>> +            len += printf("%8s %6.2f%% ",
>>> +                      engine->name, val[0]);
>>> +            print_percentage_bar(val[0], max_w - len);
>>> -            if (i < STATS_COUNT)
>>> -                last_stats[i] = stats[i];
>>> -        }
>>> +            printf("%s\n", buf);
>>> -        /* Check if child has gone */
>>> -        if (child_pid > 0) {
>>> -            int res;
>>> -            if ((res = waitpid(child_pid, &child_stat, WNOHANG)) == 
>>> -1) {
>>> -                perror("waitpid");
>>> -                exit(1);
>>> -            }
>>> -            if (res == 0)
>>> -                continue;
>>> -            if (WIFEXITED(child_stat))
>>> -                break;
>>> +            lines++;
>>>           }
>>> -    }
>>> -    fclose(output);
>>> +        printf("\n");
>>> +
>>> +        usleep(period_us);
>>> +    }
>>> -    intel_register_access_fini();
>>>       return 0;
>>>   }
>>> diff --git a/tools/meson.build b/tools/meson.build
>>> index bd2d313d5156..a918eeb0bef1 100644
>>> --- a/tools/meson.build
>>> +++ b/tools/meson.build
>>> @@ -23,7 +23,6 @@ tools_progs = [
>>>       'intel_gpu_frequency',
>>>       'intel_firmware_decode',
>>>       'intel_gpu_time',
>>> -    'intel_gpu_top',
>>>       'intel_gtt',
>>>       'intel_guc_logger',
>>>       'intel_infoframes',
>>> @@ -117,6 +116,11 @@ shared_library('intel_aubdump', 'aubdump.c',
>>>              name_prefix : '',
>>>              install : true)
>>> +executable('intel_gpu_top', 'intel_gpu_top.c',
>>> +       install : true,
>>> +       install_rpath : rpathdir,
>>> +       dependencies : tool_deps + [ lib_igt_perf ])
>>> +
>>>   conf_data = configuration_data()
>>>   conf_data.set('prefix', prefix)
>>>   conf_data.set('exec_prefix', '${prefix}')
>>>
>>
>> _______________________________________________
>> igt-dev mailing list
>> igt-dev@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/igt-dev

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Intel-gfx] [igt-dev] [PATCH i-g-t v2] intel-gpu-top: Rewrite the tool to be safe to use
@ 2018-04-03 14:06         ` Eero Tamminen
  0 siblings, 0 replies; 57+ messages in thread
From: Eero Tamminen @ 2018-04-03 14:06 UTC (permalink / raw)
  To: Tvrtko Ursulin, Tvrtko Ursulin, igt-dev; +Cc: Intel-gfx

Hi,

On 03.04.2018 12:36, Tvrtko Ursulin wrote:
> On 29/03/2018 15:30, Eero Tamminen wrote:
>> I tested this on HSW GT2, BYT, BDW GT3, SKL GT2 and KBL GT3e,
>> with Ubuntu 16.04 and 17.10, using Ubuntu default kernels (4.4 to 4.13)
>> and latest drm-tip build (4.16.0-rc7).
>>
>>
>> General comments
>> ----------------
>>
>> This will be used by our customers and people who aren't necessarily
>> familiar with i915 internal details.  Therefore it should use
>> common terminology in the field and in similar tools, instead of
>> I3As (Intel 3-letter Acronyms).
>>
>> For example:
>>   - rcs -> 3D render
>>   - bcs -> blitter
>>   - vecs -> video
>>   - vcs -> video decode
>> etc.
> 
> Done. And I am open to bike-shedding of the names and display format for 
> instance reporting.

New names look fine to me!


>> Old tool showed also GPU system memory interface (GAM) busyness.
>> That was valuable info, and reasonably accurate for stable loads.
>>
>> Could this tool show also either that information (preferred), or
>> bandwidth utilized by GPU/CPU/display?
>>
>> (Latest kernels offer GPU memory bandwidth usage through perf
>> "uncore_imc" "data_reads" & "date_writes" counters.)
> 
> Excellent suggestion and I've added IMC data_reads and data_writes to 
> the tool.

Thanks, it looks fine too.  I'm just wondering about the numbers
it's reporting on SKL GT2...

AFAIK IMC counters are for uncore, so I thought that they should
correspond to GTI (memory interface to outside of GPU) read and
write HW counter values.  While it seemed in some cases quite close,
in some cases it showed a lot smaller (2/3) value than expected.

I can understand why reads are sometimes larger, because I think
uncore will also include display engine display content reads.

However, I don't see how uncore writes could be considerably smaller
than the GTI interface write amount.

(GTI interface reports the expected value which corresponds directly
to what my test application is doing (64x blended FullHD layer writes).)

Idle machine read amounts are also much smaller (60-65MB/s) than what
I think display update read should be (1920*1080*4*60Hz = 475MiB/s).

Any ideas for these two discrepancies?


>> Is "wait" value supposed to be IO-wait for given engine interface?
>>
>> I never saw that change from 0%, although IO-wait in top jumped
>> from 0 to 20-30% with my test GPU load.
> 
> No, that is time spent in MI_WAIT_FOR_EVENT.

Could you add that info to the UI?

E.g. just have "MI" on top of the "wait" column.


 > I think not very used in current codebase.

What are you using to validate that it reports the correct value?


>> HW specific test results
>> ------------------------
>>
>> BYT:
>> * Reports "Failed to initialize PMU!" although old intel_gpu_top
>>    works fine.
>>
>> HSW GT2,  BDW GT3, SKL GT2 and KBL GT3e seems to work fine except
>> for the "wait" value.
>>
>> I never saw blitter engine to do anything, but that's because
>> modesetting uses just 3D pipeline, and because I couldn't get
>> Intel DDX to work with rest of latest git version of X / 3D stack.
> 
> Thank you for testing this so thoroughly - this was really invaluable 
> since I don't have access too such number of platforms. I've tried to 
> fix all this in the latest version.

Machines are currently running tests, I'll check these tomorrow.


>> Kernel version support
>> ----------------------
>>
>> My HW specific testing above was with drm-tip kernel, but I did one test
>> also with Ubuntu 16.04 v4.4 kernel (which includes v4.6 or v4.8 i915 
>> backport) on KBL.  For that, the tool reported:
>> "Failed to detect engines!"
>>
>> Although the previous intel_gpu_top works fine with that kernel version.
>>
>> Same happens also with Ubuntu 17.04 v4.13 kernel.
>>
>>
>> -> If new version needs a certain kernel version, it should tell
>>     which version is required.
> 
> Yep, at least 4.16 is needed so I have added this info to the error 
> message.

IMHO the message is a bit ambiguous:
	Failed to detect engines! Kernel 4.16 or newer?

I would suggest checking whether kernel is new enough, and if not:
	Kernel X.YY detected, 4.16 or newer required.


	- Eero

> Thanks again for testing it and when you find the time if you could do 
> it once more with the latest version (on the problematic platforms) that 
> would be much appreciated.
> 
> Regards,
> 
> Tvrtko
> 
>>
>>
>>      - Eero
>>
>> On 29.03.2018 13:33, Tvrtko Ursulin wrote:
>>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>
>>> intel-gpu-top is a dangerous tool which can hang machines due unsafe 
>>> mmio
>>> register access. This patch rewrites it to use only PMU.
>>>
>>> Only overall command streamer busyness and GPU global data such as power
>>> and frequencies are included in this new version.
>>>
>>> For access to more GPU functional unit level data, an OA metric based 
>>> tool
>>> like gpu-top should be used instead.
>>>
>>> v2:
>>>   * Sort engines by class and instance.
>>>   * Do not wait for one sampling period to display something on screen.
>>>   * Move code out of the asserts. (Rinat Ibragimov)
>>>   * Continuously adapt to terminal size. (Rinat Ibgragimov)
>>>
>>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>>> Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
>>> Cc: Petri Latvala <petri.latvala@intel.com>
>>> Cc: Eero Tamminen <eero.t.tamminen@intel.com>
>>> Cc: Rinat Ibragimov <ibragimovrinat@mail.ru>
>>> Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> # v1
>>> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> # v0.5
>>> ---
>>>   tools/Makefile.am     |    2 +
>>>   tools/intel_gpu_top.c | 1009 
>>> +++++++++++++++++++++----------------------------
>>>   tools/meson.build     |    6 +-
>>>   3 files changed, 441 insertions(+), 576 deletions(-)
>>>
>>> diff --git a/tools/Makefile.am b/tools/Makefile.am
>>> index 09b6dbcc3ece..a0b016ddd7ff 100644
>>> --- a/tools/Makefile.am
>>> +++ b/tools/Makefile.am
>>> @@ -28,6 +28,8 @@ intel_aubdump_la_LDFLAGS = -module -avoid-version 
>>> -no-undefined
>>>   intel_aubdump_la_SOURCES = aubdump.c
>>>   intel_aubdump_la_LIBADD = $(top_builddir)/lib/libintel_tools.la -ldl
>>> +intel_gpu_top_LDADD = $(top_builddir)/lib/libigt_perf.la
>>> +
>>>   bin_SCRIPTS = intel_aubdump
>>>   CLEANFILES = $(bin_SCRIPTS)
>>> diff --git a/tools/intel_gpu_top.c b/tools/intel_gpu_top.c
>>> index 098e6ce3ff86..94091d97c4a3 100644
>>> --- a/tools/intel_gpu_top.c
>>> +++ b/tools/intel_gpu_top.c
>>> @@ -1,6 +1,5 @@
>>>   /*
>>> - * Copyright © 2007 Intel Corporation
>>> - * Copyright © 2011 Intel Corporation
>>> + * Copyright © 2018 Intel Corporation
>>>    *
>>>    * Permission is hereby granted, free of charge, to any person 
>>> obtaining a
>>>    * copy of this software and associated documentation files (the 
>>> "Software"),
>>> @@ -18,701 +17,561 @@
>>>    * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO 
>>> EVENT SHALL
>>>    * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 
>>> DAMAGES OR OTHER
>>>    * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 
>>> ARISING
>>> - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
>>> - * DEALINGS IN THE SOFTWARE.
>>> - *
>>> - * Authors:
>>> - *    Eric Anholt <eric@anholt.net>
>>> - *    Eugeni Dodonov <eugeni.dodonov@intel.com>
>>> - *
>>> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
>>> OTHER DEALINGS
>>> + * IN THE SOFTWARE.
>>>    */
>>> -#include "config.h"
>>> -
>>> -#include <inttypes.h>
>>> -#include <unistd.h>
>>> -#include <stdlib.h>
>>>   #include <stdio.h>
>>> -#include <err.h>
>>> -#include <sys/ioctl.h>
>>> -#include <sys/time.h>
>>> -#include <sys/wait.h>
>>> +#include <sys/types.h>
>>> +#include <dirent.h>
>>> +#include <stdint.h>
>>> +#include <assert.h>
>>>   #include <string.h>
>>> -#ifdef HAVE_TERMIOS_H
>>> -#include <termios.h>
>>> -#endif
>>> -#include "intel_io.h"
>>> -#include "instdone.h"
>>> -#include "intel_reg.h"
>>> -#include "intel_chipset.h"
>>> -#include "drmtest.h"
>>> -
>>> -#define  FORCEWAKE        0xA18C
>>> -#define  FORCEWAKE_ACK        0x130090
>>> -
>>> -#define SAMPLES_PER_SEC             10000
>>> -#define SAMPLES_TO_PERCENT_RATIO    (SAMPLES_PER_SEC / 100)
>>> -
>>> -#define MAX_NUM_TOP_BITS            100
>>> -
>>> -#define HAS_STATS_REGS(devid)        IS_965(devid)
>>> -
>>> -struct top_bit {
>>> -    struct instdone_bit *bit;
>>> -    int count;
>>> -} top_bits[MAX_NUM_TOP_BITS];
>>> -struct top_bit *top_bits_sorted[MAX_NUM_TOP_BITS];
>>> -
>>> -static uint32_t instdone, instdone1;
>>> -
>>> -static const char *bars[] = {
>>> -    " ",
>>> -    "▏",
>>> -    "▎",
>>> -    "▍",
>>> -    "▌",
>>> -    "▋",
>>> -    "▊",
>>> -    "▉",
>>> -    "█"
>>> -};
>>> +#include <ctype.h>
>>> +#include <stdlib.h>
>>> +#include <unistd.h>
>>> +#include <sys/stat.h>
>>> +#include <fcntl.h>
>>> +#include <inttypes.h>
>>> +#include <sys/ioctl.h>
>>> +#include <errno.h>
>>> +#include <math.h>
>>> +#include <locale.h>
>>> +
>>> +#include "igt_perf.h"
>>> -enum stats_counts {
>>> -    IA_VERTICES,
>>> -    IA_PRIMITIVES,
>>> -    VS_INVOCATION,
>>> -    GS_INVOCATION,
>>> -    GS_PRIMITIVES,
>>> -    CL_INVOCATION,
>>> -    CL_PRIMITIVES,
>>> -    PS_INVOCATION,
>>> -    PS_DEPTH,
>>> -    STATS_COUNT
>>> +struct pmu_pair {
>>> +    uint64_t cur;
>>> +    uint64_t prev;
>>>   };
>>> -const uint32_t stats_regs[STATS_COUNT] = {
>>> -    IA_VERTICES_COUNT_QW,
>>> -    IA_PRIMITIVES_COUNT_QW,
>>> -    VS_INVOCATION_COUNT_QW,
>>> -    GS_INVOCATION_COUNT_QW,
>>> -    GS_PRIMITIVES_COUNT_QW,
>>> -    CL_INVOCATION_COUNT_QW,
>>> -    CL_PRIMITIVES_COUNT_QW,
>>> -    PS_INVOCATION_COUNT_QW,
>>> -    PS_DEPTH_COUNT_QW,
>>> +struct pmu_counter {
>>> +    uint64_t config;
>>> +    unsigned int idx;
>>> +    struct pmu_pair val;
>>>   };
>>> -const char *stats_reg_names[STATS_COUNT] = {
>>> -    "vert fetch",
>>> -    "prim fetch",
>>> -    "VS invocations",
>>> -    "GS invocations",
>>> -    "GS prims",
>>> -    "CL invocations",
>>> -    "CL prims",
>>> -    "PS invocations",
>>> -    "PS depth pass",
>>> +struct engine {
>>> +    const char *name;
>>> +    struct pmu_counter busy;
>>> +    struct pmu_counter wait;
>>> +    struct pmu_counter sema;
>>>   };
>>> -uint64_t stats[STATS_COUNT];
>>> -uint64_t last_stats[STATS_COUNT];
>>> +struct engines {
>>> +    unsigned int num_engines;
>>> +    unsigned int num_counters;
>>> +    DIR *root;
>>> +    int fd;
>>> +    struct pmu_pair ts;
>>> -static unsigned long
>>> -gettime(void)
>>> -{
>>> -    struct timeval t;
>>> -    gettimeofday(&t, NULL);
>>> -    return (t.tv_usec + (t.tv_sec * 1000000));
>>> -}
>>> +    int rapl_fd;
>>> +    double rapl_scale;
>>> -static int
>>> -top_bits_sort(const void *a, const void *b)
>>> +    struct pmu_counter freq_req;
>>> +    struct pmu_counter freq_act;
>>> +    struct pmu_counter irq;
>>> +    struct pmu_counter rc6;
>>> +    struct pmu_counter rapl;
>>> +
>>> +    struct engine engine;
>>> +};
>>> +
>>> +static uint64_t
>>> +get_pmu_config(int dirfd, const char *name, const char *counter)
>>>   {
>>> -    struct top_bit * const *bit_a = a;
>>> -    struct top_bit * const *bit_b = b;
>>> -    int a_count = (*bit_a)->count;
>>> -    int b_count = (*bit_b)->count;
>>> +    char buf[128], *p;
>>> +    int fd, ret;
>>> -    if (a_count < b_count)
>>> -        return 1;
>>> -    else if (a_count == b_count)
>>> -        return 0;
>>> -    else
>>> +    ret = snprintf(buf, sizeof(buf), "%s-%s", name, counter);
>>> +    if (ret < 0 || ret == sizeof(buf))
>>>           return -1;
>>> -}
>>> -static void
>>> -update_idle_bit(struct top_bit *top_bit)
>>> -{
>>> -    uint32_t reg_val;
>>> +    fd = openat(dirfd, buf, O_RDONLY);
>>> +    if (fd < 0)
>>> +        return -1;
>>> -    if (top_bit->bit->reg == INSTDONE_1)
>>> -        reg_val = instdone1;
>>> -    else
>>> -        reg_val = instdone;
>>> +    ret = read(fd, buf, sizeof(buf));
>>> +    close(fd);
>>> +    if (ret <= 0)
>>> +        return -1;
>>> +
>>> +    p = index(buf, '0');
>>> +    if (!p)
>>> +        return -1;
>>> -    if ((reg_val & top_bit->bit->bit) == 0)
>>> -        top_bit->count++;
>>> +    return strtoul(p, NULL, 0);
>>>   }
>>> -static void
>>> -print_clock(const char *name, int clock) {
>>> -    if (clock == -1)
>>> -        printf("%s clock: unknown", name);
>>> +#define engine_ptr(engines, n) \
>>> +    ((struct engine *)((unsigned char *)(&engines->engine) + (n) * 
>>> sizeof(struct engine)))
>>> +
>>> +static int engine_cmp(const void *__a, const void *__b)
>>> +{
>>> +    const struct engine *a = (struct engine *)__a;
>>> +    const struct engine *b = (struct engine *)__b;
>>> +    int class_a = (a->busy.config & (__I915_PMU_OTHER(0) - 1)) >>
>>> +              I915_PMU_CLASS_SHIFT;
>>> +    int class_b = (b->busy.config & (__I915_PMU_OTHER(0) - 1)) >>
>>> +              I915_PMU_CLASS_SHIFT;
>>> +    int instance_a = (a->busy.config >> I915_PMU_SAMPLE_BITS) &
>>> +             ((1 << I915_PMU_SAMPLE_INSTANCE_BITS) - 1);
>>> +    int instance_b = (b->busy.config >> I915_PMU_SAMPLE_BITS) &
>>> +             ((1 << I915_PMU_SAMPLE_INSTANCE_BITS) - 1);
>>> +
>>> +    if (class_a != class_b)
>>> +        return class_a - class_b;
>>>       else
>>> -        printf("%s clock: %d Mhz", name, clock);
>>> +        return instance_a - instance_b;
>>>   }
>>> -static int
>>> -print_clock_info(struct pci_device *pci_dev)
>>> +static struct engines *discover_engines(void)
>>>   {
>>> -    uint32_t devid = pci_dev->device_id;
>>> -    uint16_t gcfgc;
>>> +    const char *sysfs_root = "/sys/devices/i915/events";
>>> +    struct engines *engines;
>>> +    struct dirent *dent;
>>> +    int ret = 0;
>>> +    DIR *d;
>>> -    if (IS_GM45(devid)) {
>>> -        int core_clock = -1;
>>> +    engines = malloc(sizeof(struct engines));
>>> +    if (!engines)
>>> +        return NULL;
>>> -        pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
>>> +    memset(engines, 0, sizeof(*engines));
>>> -        switch (gcfgc & 0xf) {
>>> -        case 8:
>>> -            core_clock = 266;
>>> -            break;
>>> -        case 9:
>>> -            core_clock = 320;
>>> -            break;
>>> -        case 11:
>>> -            core_clock = 400;
>>> -            break;
>>> -        case 13:
>>> -            core_clock = 533;
>>> -            break;
>>> -        }
>>> -        print_clock("core", core_clock);
>>> -    } else if (IS_965(devid) && IS_MOBILE(devid)) {
>>> -        int render_clock = -1, sampler_clock = -1;
>>> +    engines->num_engines = 0;
>>> -        pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
>>> +    d = opendir(sysfs_root);
>>> +    if (!d)
>>> +        return NULL;
>>> -        switch (gcfgc & 0xf) {
>>> -        case 2:
>>> -            render_clock = 250; sampler_clock = 267;
>>> -            break;
>>> -        case 3:
>>> -            render_clock = 320; sampler_clock = 333;
>>> -            break;
>>> -        case 4:
>>> -            render_clock = 400; sampler_clock = 444;
>>> -            break;
>>> -        case 5:
>>> -            render_clock = 500; sampler_clock = 533;
>>> -            break;
>>> -        }
>>> -
>>> -        print_clock("render", render_clock);
>>> -        printf("  ");
>>> -        print_clock("sampler", sampler_clock);
>>> -    } else if (IS_945(devid) && IS_MOBILE(devid)) {
>>> -        int render_clock = -1, display_clock = -1;
>>> +    while ((dent = readdir(d)) != NULL) {
>>> +        const char *endswith = "-busy";
>>> +        const unsigned int endlen = strlen(endswith);
>>> +        struct engine *engine =
>>> +                engine_ptr(engines, engines->num_engines);
>>> +        char buf[256];
>>> -        pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
>>> +        if (dent->d_type != DT_REG)
>>> +            continue;
>>> -        switch (gcfgc & 0x7) {
>>> -        case 0:
>>> -            render_clock = 166;
>>> -            break;
>>> -        case 1:
>>> -            render_clock = 200;
>>> -            break;
>>> -        case 3:
>>> -            render_clock = 250;
>>> -            break;
>>> -        case 5:
>>> -            render_clock = 400;
>>> +        if (strlen(dent->d_name) >= sizeof(buf)) {
>>> +            ret = -1;
>>>               break;
>>>           }
>>> -        switch (gcfgc & 0x70) {
>>> -        case 0:
>>> -            display_clock = 200;
>>> -            break;
>>> -        case 4:
>>> -            display_clock = 320;
>>> -            break;
>>> -        }
>>> -        if (gcfgc & (1 << 7))
>>> -            display_clock = 133;
>>> +        strcpy(buf, dent->d_name);
>>> -        print_clock("render", render_clock);
>>> -        printf("  ");
>>> -        print_clock("display", display_clock);
>>> -    } else if (IS_915(devid) && IS_MOBILE(devid)) {
>>> -        int render_clock = -1, display_clock = -1;
>>> +        /* xxxN-busy */
>>> +        if (strlen(buf) < (endlen + 4))
>>> +            continue;
>>> +        if (strcmp(&buf[strlen(buf) - endlen], endswith))
>>> +            continue;
>>> -        pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
>>> +        memset(engine, 0, sizeof(*engine));
>>> -        switch (gcfgc & 0x7) {
>>> -        case 0:
>>> -            render_clock = 160;
>>> -            break;
>>> -        case 1:
>>> -            render_clock = 190;
>>> -            break;
>>> -        case 4:
>>> -            render_clock = 333;
>>> +        buf[strlen(buf) - endlen] = 0;
>>> +        engine->name = strdup(buf);
>>> +        if (!engine->name) {
>>> +            ret = -1;
>>>               break;
>>>           }
>>> -        if (gcfgc & (1 << 13))
>>> -            render_clock = 133;
>>> -        switch (gcfgc & 0x70) {
>>> -        case 0:
>>> -            display_clock = 190;
>>> +        engine->busy.config = get_pmu_config(dirfd(d), engine->name,
>>> +                             "busy");
>>> +        if (engine->busy.config == -1) {
>>> +            ret = -1;
>>>               break;
>>> -        case 4:
>>> -            display_clock = 333;
>>> +        }
>>> +
>>> +        engines->num_engines++;
>>> +        engines = realloc(engines, sizeof(struct engines) +
>>> +                  engines->num_engines * sizeof(struct engine));
>>> +        if (!engines) {
>>> +            ret = -ENOMEM;
>>>               break;
>>>           }
>>> -        if (gcfgc & (1 << 7))
>>> -            display_clock = 133;
>>> +    }
>>> +
>>> +    if (ret)
>>> +        free(engines);
>>> +    else {
>>> +        qsort(engine_ptr(engines, 0), engines->num_engines,
>>> +              sizeof(struct engine), engine_cmp);
>>> -        print_clock("render", render_clock);
>>> -        printf("  ");
>>> -        print_clock("display", display_clock);
>>> +        engines->root = d;
>>>       }
>>> +    return ret == 0 ? engines : NULL;
>>> +}
>>> +
>>> +static int
>>> +filename_to_buf(const char *filename, char *buf, unsigned int bufsize)
>>> +{
>>> +    int fd;
>>> +    ssize_t ret;
>>> +
>>> +    fd = open(filename, O_RDONLY);
>>> +    if (fd < 0)
>>> +        return -1;
>>> +
>>> +    ret = read(fd, buf, bufsize - 1);
>>> +    close(fd);
>>> +    if (ret < 1)
>>> +        return -1;
>>> +
>>> +    buf[ret] = '\0';
>>> -    printf("\n");
>>> -    return -1;
>>> +    return 0;
>>>   }
>>> -#define STATS_LEN (20)
>>> -#define PERCENTAGE_BAR_END    (79 - STATS_LEN)
>>> +static uint64_t filename_to_u64(const char *filename, int base)
>>> +{
>>> +    char buf[64], *b;
>>> -static void
>>> -print_percentage_bar(float percent, int cur_line_len)
>>> +    if (filename_to_buf(filename, buf, sizeof(buf)))
>>> +        return 0;
>>> +
>>> +    /*
>>> +     * Handle both single integer and key=value formats by skipping
>>> +     * leading non-digits.
>>> +     */
>>> +    b = buf;
>>> +    while (*b && !isdigit(*b))
>>> +        b++;
>>> +
>>> +    return strtoull(b, NULL, base);
>>> +}
>>> +
>>> +static uint64_t rapl_type_id(void)
>>>   {
>>> -    int bar_avail_len = (PERCENTAGE_BAR_END - cur_line_len - 1) * 8;
>>> -    int bar_len = bar_avail_len * (percent + .5) / 100.0;
>>> -    int i;
>>> +    return filename_to_u64("/sys/devices/power/type", 10);
>>> +}
>>> -    for (i = bar_len; i >= 8; i -= 8) {
>>> -        printf("%s", bars[8]);
>>> -        cur_line_len++;
>>> -    }
>>> -    if (i) {
>>> -        printf("%s", bars[i]);
>>> -        cur_line_len++;
>>> -    }
>>> +static uint64_t rapl_gpu_power(void)
>>> +{
>>> +    return filename_to_u64("/sys/devices/power/events/energy-gpu", 0);
>>> +}
>>> +
>>> +static double filename_to_double(const char *filename)
>>> +{
>>> +    char *oldlocale;
>>> +    char buf[80];
>>> +    double v;
>>> +
>>> +    if (filename_to_buf(filename, buf, sizeof(buf)))
>>> +        return 0;
>>> -    /* NB: We can't use a field width with utf8 so we manually
>>> -    * guarantee a field with of 45 chars for any bar. */
>>> -    printf("%*s", PERCENTAGE_BAR_END - cur_line_len, "");
>>> +    oldlocale = setlocale(LC_ALL, "C");
>>> +    v = strtod(buf, NULL);
>>> +    setlocale(LC_ALL, oldlocale);
>>> +
>>> +    return v;
>>>   }
>>> -struct ring {
>>> -    const char *name;
>>> -    uint32_t mmio;
>>> -    int head, tail, size;
>>> -    uint64_t full;
>>> -    int idle;
>>> -};
>>> +static double rapl_gpu_power_scale(void)
>>> +{
>>> +    return 
>>> filename_to_double("/sys/devices/power/events/energy-gpu.scale");
>>> +}
>>> -static uint32_t ring_read(struct ring *ring, uint32_t reg)
>>> +#define __open_pmu(engines, pmu, idx) \
>>> +({ \
>>> +    int fd__; \
>>> +\
>>> +    fd__ = perf_i915_open_group((pmu)->config, (engines)->fd); \
>>> +    if (fd__ >= 0) { \
>>> +        if ((engines)->fd == -1) \
>>> +            (engines)->fd = fd__; \
>>> +        (pmu)->idx = (idx)++; \
>>> +        (engines)->num_counters++; \
>>> +    } \
>>> +\
>>> +    fd__; \
>>> +})
>>> +
>>> +static int pmu_init(struct engines *engines)
>>>   {
>>> -    return INREG(ring->mmio + reg);
>>> +    unsigned int idx = 0;
>>> +    unsigned int i;
>>> +    int fd;
>>> +
>>> +    engines->fd = -1;
>>> +    engines->num_counters = 0;
>>> +
>>> +    engines->freq_req.config = I915_PMU_REQUESTED_FREQUENCY;
>>> +    fd = __open_pmu(engines, &engines->freq_req, idx);
>>> +    if (fd < 0)
>>> +        return -1;
>>> +
>>> +    engines->freq_act.config = I915_PMU_ACTUAL_FREQUENCY;
>>> +    fd = __open_pmu(engines, &engines->freq_act, idx);
>>> +    if (fd < 0)
>>> +        return -1;
>>> +
>>> +    engines->irq.config = I915_PMU_INTERRUPTS;
>>> +    fd = __open_pmu(engines, &engines->irq, idx);
>>> +    if (fd < 0)
>>> +        return -1;
>>> +
>>> +    engines->rc6.config = I915_PMU_RC6_RESIDENCY;
>>> +    fd = __open_pmu(engines, &engines->rc6, idx);
>>> +    if (fd < 0)
>>> +        return -1;
>>> +
>>> +    for (i = 0; i < engines->num_engines; i++) {
>>> +        struct engine *engine = engine_ptr(engines, i);
>>> +        struct {
>>> +            struct pmu_counter *pmu;
>>> +            const char *counter;
>>> +        } *cnt, counters[] = {
>>> +            { .pmu = &engine->busy, .counter = "busy" },
>>> +            { .pmu = &engine->wait, .counter = "wait" },
>>> +            { .pmu = &engine->sema, .counter = "sema" },
>>> +            { .pmu = NULL, .counter = NULL },
>>> +        };
>>> +
>>> +        for (cnt = counters; cnt->pmu; cnt++) {
>>> +            if (!cnt->pmu->config)
>>> +                cnt->pmu->config =
>>> +                    get_pmu_config(dirfd(engines->root),
>>> +                               engine->name,
>>> +                               cnt->counter);
>>> +            fd = __open_pmu(engines, cnt->pmu, idx);
>>> +            if (fd < 0)
>>> +                return -1;
>>> +        }
>>> +    }
>>> +
>>> +    engines->rapl_scale = rapl_gpu_power_scale();
>>> +    if (engines->rapl_scale != NAN)
>>> +        engines->rapl_scale *= 1e3; /* from nano to micro */
>>> +    engines->rapl.config = rapl_gpu_power();
>>> +    engines->rapl_fd = igt_perf_open(rapl_type_id(), 
>>> engines->rapl.config);
>>> +    if (engines->rapl_fd < 0)
>>> +        return -1;
>>> +
>>> +    return 0;
>>>   }
>>> -static void ring_init(struct ring *ring)
>>> +static uint64_t pmu_read_multi(int fd, unsigned int num, uint64_t *val)
>>>   {
>>> -    ring->size = (((ring_read(ring, RING_LEN) & RING_NR_PAGES) >> 
>>> 12) + 1) * 4096;
>>> +    uint64_t buf[2 + num];
>>> +    unsigned int i;
>>> +    ssize_t len;
>>> +
>>> +    memset(buf, 0, sizeof(buf));
>>> +
>>> +    len = read(fd, buf, sizeof(buf));
>>> +    assert(len == sizeof(buf));
>>> +
>>> +    for (i = 0; i < num; i++)
>>> +        val[i] = buf[2 + i];
>>> +
>>> +    return buf[1];
>>>   }
>>> -static void ring_reset(struct ring *ring)
>>> +static double pmu_calc(struct pmu_pair *p, double d, double t, 
>>> double s)
>>>   {
>>> -    ring->idle = ring->full = 0;
>>> +    double pct;
>>> +
>>> +    pct = p->cur - p->prev;
>>> +    pct /= d;
>>> +    pct /= t;
>>> +    pct *= s;
>>> +
>>> +    if (s == 100.0 && pct > 100.0)
>>> +        pct = 100.0;
>>> +
>>> +    return pct;
>>>   }
>>> -static void ring_sample(struct ring *ring)
>>> +static uint64_t __pmu_read_single(int fd, uint64_t *ts)
>>>   {
>>> -    int full;
>>> +    uint64_t data[2] = { };
>>> +    ssize_t len;
>>> -    if (!ring->size)
>>> -        return;
>>> +    len = read(fd, data, sizeof(data));
>>> +    assert(len == sizeof(data));
>>> -    ring->head = ring_read(ring, RING_HEAD) & HEAD_ADDR;
>>> -    ring->tail = ring_read(ring, RING_TAIL) & TAIL_ADDR;
>>> +    if (ts)
>>> +        *ts = data[1];
>>> +
>>> +    return data[0];
>>> +}
>>> -    if (ring->tail == ring->head)
>>> -        ring->idle++;
>>> +static uint64_t pmu_read_single(int fd)
>>> +{
>>> +    return __pmu_read_single(fd, NULL);
>>> +}
>>> -    full = ring->tail - ring->head;
>>> -    if (full < 0)
>>> -        full += ring->size;
>>> -    ring->full += full;
>>> +static void __update_sample(struct pmu_counter *counter, uint64_t val)
>>> +{
>>> +    counter->val.prev = counter->val.cur;
>>> +    counter->val.cur = val;
>>>   }
>>> -static void ring_print_header(FILE *out, struct ring *ring)
>>> +static void update_sample(struct pmu_counter *counter, uint64_t *val)
>>>   {
>>> -    fprintf(out, "%.6s%%\tops\t",
>>> -            ring->name
>>> -          );
>>> +    __update_sample(counter, val[counter->idx]);
>>>   }
>>> -static void ring_print(struct ring *ring, unsigned long 
>>> samples_per_sec)
>>> +static void pmu_sample(struct engines *engines)
>>>   {
>>> -    int percent_busy, len;
>>> +    const int num_val = engines->num_counters;
>>> +    uint64_t val[num_val];
>>> +    unsigned int i;
>>> +
>>> +    engines->ts.prev = engines->ts.cur;
>>> +    engines->ts.cur = pmu_read_multi(engines->fd, num_val, val);
>>> +
>>> +    __update_sample(&engines->rapl, pmu_read_single(engines->rapl_fd));
>>> -    if (!ring->size)
>>> -        return;
>>> +    update_sample(&engines->freq_req, val);
>>> +    update_sample(&engines->freq_act, val);
>>> +    update_sample(&engines->irq, val);
>>> +    update_sample(&engines->rc6, val);
>>> -    percent_busy = 100 - 100 * ring->idle / samples_per_sec;
>>> +    for (i = 0; i < engines->num_engines; i++) {
>>> +        struct engine *engine = engine_ptr(engines, i);
>>> -    len = printf("%25s busy: %3d%%: ", ring->name, percent_busy);
>>> -    print_percentage_bar (percent_busy, len);
>>> -    printf("%24s space: %d/%d\n",
>>> -           ring->name,
>>> -           (int)(ring->full / samples_per_sec),
>>> -           ring->size);
>>> +        update_sample(&engine->busy, val);
>>> +        update_sample(&engine->sema, val);
>>> +        update_sample(&engine->wait, val);
>>> +    }
>>>   }
>>> -static void ring_log(struct ring *ring, unsigned long samples_per_sec,
>>> -        FILE *output)
>>> +static const char *bars[] = { " ", "▏", "▎", "▍", "▌", "▋", "▊", 
>>> "▉", "█" };
>>> +
>>> +static void
>>> +print_percentage_bar(double percent, int max_len)
>>>   {
>>> -    if (ring->size)
>>> -        fprintf(output, "%3d\t%d\t",
>>> -            (int)(100 - 100 * ring->idle / samples_per_sec),
>>> -            (int)(ring->full / samples_per_sec));
>>> -    else
>>> -        fprintf(output, "-1\t-1\t");
>>> +    int bar_len = percent * (8 * (max_len - 2)) / 100.0;
>>> +    int i;
>>> +
>>> +    putchar('|');
>>> +
>>> +    for (i = bar_len; i >= 8; i -= 8)
>>> +        printf("%s", bars[8]);
>>> +    if (i)
>>> +        printf("%s", bars[i]);
>>> +
>>> +    for (i = 0; i < (max_len - 2 - (bar_len + 7) / 8); i++)
>>> +        putchar(' ');
>>> +
>>> +    putchar('|');
>>>   }
>>> +#define DEFAULT_PERIOD_MS (1000)
>>> +
>>>   static void
>>>   usage(const char *appname)
>>>   {
>>>       printf("intel_gpu_top - Display a top-like summary of Intel GPU 
>>> usage\n"
>>> -            "\n"
>>> -            "usage: %s [parameters]\n"
>>> -            "\n"
>>> -            "The following parameters apply:\n"
>>> -            "[-s <samples>]       samples per seconds (default %d)\n"
>>> -            "[-e <command>]       command to profile\n"
>>> -            "[-o <file>]          output statistics to file. If file 
>>> is '-',"
>>> -            "                     run in batch mode and output 
>>> statistics to stdio only \n"
>>> -            "[-h]                 show this help screen\n"
>>> -            "\n",
>>> -            appname,
>>> -            SAMPLES_PER_SEC
>>> -          );
>>> -    return;
>>> +        "\n"
>>> +        "Usage: %s [parameters]\n"
>>> +        "\n"
>>> +        "\tThe following parameters are optional:\n"
>>> +        "\t[-s <samples>]       refresh period in ms (default %ums)\n"
>>> +        "\t[-h]                 show this help text\n"
>>> +        "\n",
>>> +        appname, DEFAULT_PERIOD_MS);
>>>   }
>>>   int main(int argc, char **argv)
>>>   {
>>> -    uint32_t devid;
>>> -    struct pci_device *pci_dev;
>>> -    struct ring render_ring = {
>>> -        .name = "render",
>>> -        .mmio = 0x2030,
>>> -    }, bsd_ring = {
>>> -        .name = "bitstream",
>>> -        .mmio = 0x4030,
>>> -    }, bsd6_ring = {
>>> -        .name = "bitstream",
>>> -        .mmio = 0x12030,
>>> -    }, blt_ring = {
>>> -        .name = "blitter",
>>> -        .mmio = 0x22030,
>>> -    };
>>> -    int i, ch;
>>> -    int samples_per_sec = SAMPLES_PER_SEC;
>>> -    FILE *output = NULL;
>>> -    double elapsed_time=0;
>>> -    int print_headers=1;
>>> -    pid_t child_pid=-1;
>>> -    int child_stat;
>>> -    char *cmd=NULL;
>>> -    int interactive=1;
>>> -
>>> -    /* Parse options? */
>>> -    while ((ch = getopt(argc, argv, "s:o:e:h")) != -1) {
>>> +    unsigned int period_us = DEFAULT_PERIOD_MS * 1000;
>>> +    int con_w = -1, con_h = -1;
>>> +    struct engines *engines;
>>> +    unsigned int i;
>>> +    int ret, ch;
>>> +
>>> +    /* Parse options */
>>> +    while ((ch = getopt(argc, argv, "s:h")) != -1) {
>>>           switch (ch) {
>>> -        case 'e': cmd = strdup(optarg);
>>> -            break;
>>> -        case 's': samples_per_sec = atoi(optarg);
>>> -            if (samples_per_sec < 100) {
>>> -                fprintf(stderr, "Error: samples per second must be 
>>> >= 100\n");
>>> -                exit(1);
>>> -            }
>>> -            break;
>>> -        case 'o':
>>> -            if (!strcmp(optarg, "-")) {
>>> -                /* Running in non-interactive mode */
>>> -                interactive = 0;
>>> -                output = stdout;
>>> -            }
>>> -            else
>>> -                output = fopen(optarg, "w");
>>> -            if (!output)
>>> -            {
>>> -                perror("fopen");
>>> -                exit(1);
>>> -            }
>>> +        case 's':
>>> +            period_us = atoi(optarg) * 1000;
>>>               break;
>>>           case 'h':
>>>               usage(argv[0]);
>>>               exit(0);
>>> -            break;
>>>           default:
>>> -            fprintf(stderr, "Invalid flag %c!\n", (char)optopt);
>>> +            fprintf(stderr, "Invalid option %c!\n", (char)optopt);
>>>               usage(argv[0]);
>>>               exit(1);
>>> -            break;
>>>           }
>>>       }
>>> -    pci_dev = intel_get_pci_device();
>>> -    devid = pci_dev->device_id;
>>> -    intel_mmio_use_pci_bar(pci_dev);
>>> -    init_instdone_definitions(devid);
>>> -
>>> -    /* Do we have a command to run? */
>>> -    if (cmd != NULL) {
>>> -        if (output) {
>>> -            fprintf(output, "# Profiling: %s\n", cmd);
>>> -            fflush(output);
>>> -        }
>>> -        child_pid = fork();
>>> -        if (child_pid < 0) {
>>> -            perror("fork");
>>> -            exit(1);
>>> -        }
>>> -        else if (child_pid == 0) {
>>> -            int res;
>>> -            res = system(cmd);
>>> -            if (res < 0)
>>> -                perror("running command");
>>> -            if (output) {
>>> -                fflush(output);
>>> -                fprintf(output, "# %s exited with status %d\n", cmd, 
>>> res);
>>> -                fflush(output);
>>> -            }
>>> -            free(cmd);
>>> -            exit(0);
>>> -        } else {
>>> -            free(cmd);
>>> -        }
>>> +    engines = discover_engines();
>>> +    if (!engines) {
>>> +        fprintf(stderr, "Failed to detect engines!\n");
>>> +        return 1;
>>>       }
>>> -    for (i = 0; i < num_instdone_bits; i++) {
>>> -        top_bits[i].bit = &instdone_bits[i];
>>> -        top_bits[i].count = 0;
>>> -        top_bits_sorted[i] = &top_bits[i];
>>> +    ret = pmu_init(engines);
>>> +    if (ret) {
>>> +        fprintf(stderr, "Failed to initialize PMU!\n");
>>> +        return 1;
>>>       }
>>> -    /* Grab access to the registers */
>>> -    intel_register_access_init(pci_dev, 0, -1);
>>> +    pmu_sample(engines);
>>> -    ring_init(&render_ring);
>>> -    if (IS_GEN4(devid) || IS_GEN5(devid))
>>> -        ring_init(&bsd_ring);
>>> -    if (IS_GEN6(devid) || IS_GEN7(devid)) {
>>> -        ring_init(&bsd6_ring);
>>> -        ring_init(&blt_ring);
>>> -    }
>>> +    for (;;) {
>>> +        double t, freq[2], irq, rc6, power;
>>> +        struct winsize ws;
>>> +        int lines = 0;
>>> -    /* Initialize GPU stats */
>>> -    if (HAS_STATS_REGS(devid)) {
>>> -        for (i = 0; i < STATS_COUNT; i++) {
>>> -            uint32_t stats_high, stats_low, stats_high_2;
>>> +        /* Update terminal size. */
>>> +        if (ioctl(0, TIOCGWINSZ, &ws) != -1) {
>>> +            con_w = ws.ws_col;
>>> +            con_h = ws.ws_row;
>>> +        }
>>> -            do {
>>> -                stats_high = INREG(stats_regs[i] + 4);
>>> -                stats_low = INREG(stats_regs[i]);
>>> -                stats_high_2 = INREG(stats_regs[i] + 4);
>>> -            } while (stats_high != stats_high_2);
>>> +        pmu_sample(engines);
>>> +        t = (double)(engines->ts.cur - engines->ts.prev) / 1e9;
>>> -            last_stats[i] = (uint64_t)stats_high << 32 |
>>> -                stats_low;
>>> -        }
>>> -    }
>>> +        printf("\033[H\033[J");
>>> -    for (;;) {
>>> -        int j;
>>> -        unsigned long long t1, ti, tf, t2;
>>> -        unsigned long long def_sleep = 1000000 / samples_per_sec;
>>> -        unsigned long long last_samples_per_sec = samples_per_sec;
>>> -        unsigned short int max_lines;
>>> -        struct winsize ws;
>>> -        char clear_screen[] = {0x1b, '[', 'H',
>>> -                       0x1b, '[', 'J',
>>> -                       0x0};
>>> -        int percent;
>>> -        int len;
>>> -
>>> -        t1 = gettime();
>>> -
>>> -        ring_reset(&render_ring);
>>> -        ring_reset(&bsd_ring);
>>> -        ring_reset(&bsd6_ring);
>>> -        ring_reset(&blt_ring);
>>> -
>>> -        for (i = 0; i < samples_per_sec; i++) {
>>> -            long long interval;
>>> -            ti = gettime();
>>> -            if (IS_965(devid)) {
>>> -                instdone = INREG(INSTDONE_I965);
>>> -                instdone1 = INREG(INSTDONE_1);
>>> -            } else
>>> -                instdone = INREG(INSTDONE);
>>> -
>>> -            for (j = 0; j < num_instdone_bits; j++)
>>> -                update_idle_bit(&top_bits[j]);
>>> -
>>> -            ring_sample(&render_ring);
>>> -            ring_sample(&bsd_ring);
>>> -            ring_sample(&bsd6_ring);
>>> -            ring_sample(&blt_ring);
>>> -
>>> -            tf = gettime();
>>> -            if (tf - t1 >= 1000000) {
>>> -                /* We are out of sync, bail out */
>>> -                last_samples_per_sec = i+1;
>>> -                break;
>>> -            }
>>> -            interval = def_sleep - (tf - ti);
>>> -            if (interval > 0)
>>> -                usleep(interval);
>>> -        }
>>> +        freq[0] = pmu_calc(&engines->freq_req.val, 1.0, t, 1);
>>> +        freq[1] = pmu_calc(&engines->freq_act.val, 1.0, t, 1);
>>> +        irq = pmu_calc(&engines->irq.val, 1.0, t, 1);
>>> +        rc6 = pmu_calc(&engines->rc6.val, 1e9, t, 100);
>>> +        power = pmu_calc(&engines->rapl.val, 1.0, t,
>>> +                 engines->rapl_scale);
>>> -        if (HAS_STATS_REGS(devid)) {
>>> -            for (i = 0; i < STATS_COUNT; i++) {
>>> -                uint32_t stats_high, stats_low, stats_high_2;
>>> +        printf("intel-gpu-top - %4.0f/%4.0f MHz;  %3.0f%% RC6; 
>>> %6.0fmW; %8.0f irqs/s\n",
>>> +               freq[0], freq[1], rc6, power, irq);
>>> +        lines++;
>>> -                do {
>>> -                    stats_high = INREG(stats_regs[i] + 4);
>>> -                    stats_low = INREG(stats_regs[i]);
>>> -                    stats_high_2 = INREG(stats_regs[i] + 4);
>>> -                } while (stats_high != stats_high_2);
>>> +        printf("\n");
>>> +        lines++;
>>> -                stats[i] = (uint64_t)stats_high << 32 |
>>> -                    stats_low;
>>> -            }
>>> -        }
>>> +        for (i = 0; i < engines->num_engines && lines < con_h; i++) {
>>> +            struct engine *engine = engine_ptr(engines, i);
>>> +            unsigned int max_w = con_w - 1;
>>> +            unsigned int len;
>>> +            double val[2];
>>> +            char buf[128];
>>> -        qsort(top_bits_sorted, num_instdone_bits,
>>> -              sizeof(struct top_bit *), top_bits_sort);
>>> -
>>> -        /* Limit the number of lines printed to the terminal height 
>>> so the
>>> -         * most important info (at the top) will stay on screen. */
>>> -        max_lines = -1;
>>> -        if (ioctl(0, TIOCGWINSZ, &ws) != -1)
>>> -            max_lines = ws.ws_row - 6; /* exclude header lines */
>>> -        if (max_lines >= num_instdone_bits)
>>> -            max_lines = num_instdone_bits;
>>> -
>>> -        t2 = gettime();
>>> -        elapsed_time += (t2 - t1) / 1000000.0;
>>> -
>>> -        if (interactive) {
>>> -            printf("%s", clear_screen);
>>> -            print_clock_info(pci_dev);
>>> -
>>> -            ring_print(&render_ring, last_samples_per_sec);
>>> -            ring_print(&bsd_ring, last_samples_per_sec);
>>> -            ring_print(&bsd6_ring, last_samples_per_sec);
>>> -            ring_print(&blt_ring, last_samples_per_sec);
>>> -
>>> -            printf("\n%30s  %s\n", "task", "percent busy");
>>> -            for (i = 0; i < max_lines; i++) {
>>> -                if (top_bits_sorted[i]->count > 0) {
>>> -                    percent = (top_bits_sorted[i]->count * 100) /
>>> -                        last_samples_per_sec;
>>> -                    len = printf("%30s: %3d%%: ",
>>> -                             top_bits_sorted[i]->bit->name,
>>> -                             percent);
>>> -                    print_percentage_bar (percent, len);
>>> -                } else {
>>> -                    printf("%*s", PERCENTAGE_BAR_END, "");
>>> -                }
>>> -
>>> -                if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
>>> -                    printf("%13s: %llu (%lld/sec)",
>>> -                           stats_reg_names[i],
>>> -                           (long long)stats[i],
>>> -                           (long long)(stats[i] - last_stats[i]));
>>> -                    last_stats[i] = stats[i];
>>> -                } else {
>>> -                    if (!top_bits_sorted[i]->count)
>>> -                        break;
>>> -                }
>>> -                printf("\n");
>>> -            }
>>> -        }
>>> -        if (output) {
>>> -            /* Print headers for columns at first run */
>>> -            if (print_headers) {
>>> -                fprintf(output, "# time\t");
>>> -                ring_print_header(output, &render_ring);
>>> -                ring_print_header(output, &bsd_ring);
>>> -                ring_print_header(output, &bsd6_ring);
>>> -                ring_print_header(output, &blt_ring);
>>> -                for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
>>> -                    if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
>>> -                        fprintf(output, "%.6s\t",
>>> -                               stats_reg_names[i]
>>> -                               );
>>> -                    }
>>> -                    if (!top_bits[i].count)
>>> -                        continue;
>>> -                }
>>> -                fprintf(output, "\n");
>>> -                print_headers = 0;
>>> -            }
>>> -
>>> -            /* Print statistics */
>>> -            fprintf(output, "%.2f\t", elapsed_time);
>>> -            ring_log(&render_ring, last_samples_per_sec, output);
>>> -            ring_log(&bsd_ring, last_samples_per_sec, output);
>>> -            ring_log(&bsd6_ring, last_samples_per_sec, output);
>>> -            ring_log(&blt_ring, last_samples_per_sec, output);
>>> -
>>> -            for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
>>> -                if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
>>> -                    fprintf(output, "%"PRIu64"\t",
>>> -                           stats[i] - last_stats[i]);
>>> -                    last_stats[i] = stats[i];
>>> -                }
>>> -                    if (!top_bits[i].count)
>>> -                        continue;
>>> -            }
>>> -            fprintf(output, "\n");
>>> -            fflush(output);
>>> -        }
>>> +            val[0] = pmu_calc(&engine->wait.val, 1e9, t, 100);
>>> +            val[1] = pmu_calc(&engine->sema.val, 1e9, t, 100);
>>> +            len = snprintf(buf, sizeof(buf),
>>> +                       "%6.2f%% wait, %6.2f%% sema",
>>> +                       val[0], val[1]);
>>> -        for (i = 0; i < num_instdone_bits; i++) {
>>> -            top_bits_sorted[i]->count = 0;
>>> +            val[0] = pmu_calc(&engine->busy.val, 1e9, t, 100);
>>> +            len += printf("%8s %6.2f%% ",
>>> +                      engine->name, val[0]);
>>> +            print_percentage_bar(val[0], max_w - len);
>>> -            if (i < STATS_COUNT)
>>> -                last_stats[i] = stats[i];
>>> -        }
>>> +            printf("%s\n", buf);
>>> -        /* Check if child has gone */
>>> -        if (child_pid > 0) {
>>> -            int res;
>>> -            if ((res = waitpid(child_pid, &child_stat, WNOHANG)) == 
>>> -1) {
>>> -                perror("waitpid");
>>> -                exit(1);
>>> -            }
>>> -            if (res == 0)
>>> -                continue;
>>> -            if (WIFEXITED(child_stat))
>>> -                break;
>>> +            lines++;
>>>           }
>>> -    }
>>> -    fclose(output);
>>> +        printf("\n");
>>> +
>>> +        usleep(period_us);
>>> +    }
>>> -    intel_register_access_fini();
>>>       return 0;
>>>   }
>>> diff --git a/tools/meson.build b/tools/meson.build
>>> index bd2d313d5156..a918eeb0bef1 100644
>>> --- a/tools/meson.build
>>> +++ b/tools/meson.build
>>> @@ -23,7 +23,6 @@ tools_progs = [
>>>       'intel_gpu_frequency',
>>>       'intel_firmware_decode',
>>>       'intel_gpu_time',
>>> -    'intel_gpu_top',
>>>       'intel_gtt',
>>>       'intel_guc_logger',
>>>       'intel_infoframes',
>>> @@ -117,6 +116,11 @@ shared_library('intel_aubdump', 'aubdump.c',
>>>              name_prefix : '',
>>>              install : true)
>>> +executable('intel_gpu_top', 'intel_gpu_top.c',
>>> +       install : true,
>>> +       install_rpath : rpathdir,
>>> +       dependencies : tool_deps + [ lib_igt_perf ])
>>> +
>>>   conf_data = configuration_data()
>>>   conf_data.set('prefix', prefix)
>>>   conf_data.set('exec_prefix', '${prefix}')
>>>
>>
>> _______________________________________________
>> igt-dev mailing list
>> igt-dev@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/igt-dev

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [igt-dev] [PATCH i-g-t v2] intel-gpu-top: Rewrite the tool to be safe to use
  2018-04-03 14:06         ` [Intel-gfx] " Eero Tamminen
@ 2018-04-03 17:18           ` Tvrtko Ursulin
  -1 siblings, 0 replies; 57+ messages in thread
From: Tvrtko Ursulin @ 2018-04-03 17:18 UTC (permalink / raw)
  To: Eero Tamminen, Tvrtko Ursulin, igt-dev; +Cc: Intel-gfx


On 03/04/2018 15:06, Eero Tamminen wrote:
> Hi,
> 
> On 03.04.2018 12:36, Tvrtko Ursulin wrote:
>> On 29/03/2018 15:30, Eero Tamminen wrote:
>>> I tested this on HSW GT2, BYT, BDW GT3, SKL GT2 and KBL GT3e,
>>> with Ubuntu 16.04 and 17.10, using Ubuntu default kernels (4.4 to 4.13)
>>> and latest drm-tip build (4.16.0-rc7).
>>>
>>>
>>> General comments
>>> ----------------
>>>
>>> This will be used by our customers and people who aren't necessarily
>>> familiar with i915 internal details.  Therefore it should use
>>> common terminology in the field and in similar tools, instead of
>>> I3As (Intel 3-letter Acronyms).
>>>
>>> For example:
>>>   - rcs -> 3D render
>>>   - bcs -> blitter
>>>   - vecs -> video
>>>   - vcs -> video decode
>>> etc.
>>
>> Done. And I am open to bike-shedding of the names and display format 
>> for instance reporting.
> 
> New names look fine to me!
> 
> 
>>> Old tool showed also GPU system memory interface (GAM) busyness.
>>> That was valuable info, and reasonably accurate for stable loads.
>>>
>>> Could this tool show also either that information (preferred), or
>>> bandwidth utilized by GPU/CPU/display?
>>>
>>> (Latest kernels offer GPU memory bandwidth usage through perf
>>> "uncore_imc" "data_reads" & "data_writes" counters.)
>>
>> Excellent suggestion and I've added IMC data_reads and data_writes to 
>> the tool.
> 
> Thanks, it looks fine too.  I'm just wondering about the numbers
> it's reporting on SKL GT2...
> 
> AFAIK IMC counters are for uncore, so I thought that they should
> correspond to GTI (memory interface to outside of GPU) read and
> write HW counter values.  While it seemed in some cases quite close,
> in some cases it showed a much smaller (2/3) value than expected.
> 
> I can understand why reads are sometimes larger, because I think
> uncore will include also display engine display content reads.
> 
> However, I don't see how uncore writes could be considerably smaller
> than the GTI interface write amount.
> 
> (GTI interface reports the expected value which corresponds directly
> to what my test application is doing (64x blended FullHD layer writes).)
> 
> Idle machine read amounts are also much smaller (60-65MB/s) than what
> I think display update read should be (1920*1080*4*60Hz = 475MiB/s).
> 
> Any ideas for these two discrepancies?

I'm afraid I am not familiar with the uncore IMC, but we could always 
approach its authors?

>>> Is "wait" value supposed to be IO-wait for given engine interface?
>>>
>>> I never saw that change from 0%, although IO-wait in top jumped
>>> from 0 to 20-30% with my test GPU load.
>>
>> No, that is time spent in MI_WAIT_FOR_EVENT.
> 
> Could you add that info to the UI?
> 
> E.g. just have "MI" on top of the "wait" column.

Like a full header strip? Yeah makes sense, I'll add it.

>  > I think not very used in current codebase.
> 
> What are you using to validate that it reports the correct value?

That would be igt/tests/perf_pmu/event-wait-rcs0.

> 
>>> HW specific test results
>>> ------------------------
>>>
>>> BYT:
>>> * Reports "Failed to initialize PMU!" although old intel_gpu_top
>>>    works fine.
>>>
>>> HSW GT2,  BDW GT3, SKL GT2 and KBL GT3e seems to work fine except
>>> for the "wait" value.
>>>
>>> I never saw blitter engine to do anything, but that's because
>>> modesetting uses just 3D pipeline, and because I couldn't get
>>> Intel DDX to work with rest of latest git version of X / 3D stack.
>>
>> Thank you for testing this so thoroughly - this was really invaluable 
>> since I don't have access to such a number of platforms. I've tried to 
>> fix all this in the latest version.
> 
> Machines are currently running tests, I'll check these tomorrow.

Thanks!

> 
>>> Kernel version support
>>> ----------------------
>>>
>>> My HW specific testing above was with drm-tip kernel, but I did one test
>>> also with Ubuntu 16.04 v4.4 kernel (which includes v4.6 or v4.8 i915 
>>> backport) on KBL.  For that, the tool reported:
>>> "Failed to detect engines!"
>>>
>>> Although the previous intel_gpu_top works fine with that kernel version.
>>>
>>> Same happens also with Ubuntu 17.04 v4.13 kernel.
>>>
>>>
>>> -> If new version needs a certain kernel version, it should tell
>>>     which version is required.
>>
>> Yep, at least 4.16 is needed so I have added this info to the error 
>> message.
> 
> IMHO the message is a bit ambiguous:
>      Failed to detect engines! Kernel 4.16 or newer?
> 
> I would suggest checking whether kernel is new enough, and if not:
>      Kernel X.YY detected, 4.16 or newer required.

Maybe yeah. I was planning to improve error messages altogether but 
forgot. Will see what improvements make sense.

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [igt-dev] [PATCH i-g-t v2] intel-gpu-top: Rewrite the tool to be safe to use
@ 2018-04-03 17:18           ` Tvrtko Ursulin
  0 siblings, 0 replies; 57+ messages in thread
From: Tvrtko Ursulin @ 2018-04-03 17:18 UTC (permalink / raw)
  To: Eero Tamminen, Tvrtko Ursulin, igt-dev; +Cc: Intel-gfx, Tvrtko Ursulin


On 03/04/2018 15:06, Eero Tamminen wrote:
> Hi,
> 
> On 03.04.2018 12:36, Tvrtko Ursulin wrote:
>> On 29/03/2018 15:30, Eero Tamminen wrote:
>>> I tested this on HSW GT2, BYT, BDW GT3, SKL GT2 and KBL GT3e,
>>> with Ubuntu 16.04 and 17.10, using Ubuntu default kernels (4.4 to 4.13)
>>> and latest drm-tip build (4.16.0-rc7).
>>>
>>>
>>> General comments
>>> ----------------
>>>
>>> This will be used by our customers and people who aren't necessarily
>>> familiar with i915 internal details.  Therefore it should use
>>> common terminology in the field and in similar tools, instead of
>>> I3As (Intel 3-letter Acronyms).
>>>
>>> For example:
>>>   - rcs -> 3D render
>>>   - bcs -> blitter
>>>   - vecs -> video
>>>   - vcs -> video decode
>>> etc.
>>
>> Done. And I am open to bike-shedding of the names and display format 
>> for instance reporting.
> 
> New names look fine to me!
> 
> 
>>> Old tool showed also GPU system memory interface (GAM) busyness.
>>> That was valuable info, and reasonably accurate for stable loads.
>>>
>>> Could this tool show also either that information (preferred), or
>>> bandwidth utilized by GPU/CPU/display?
>>>
>>> (Latest kernels offer GPU memory bandwidth usage through perf
>>> "uncore_imc" "data_reads" & "data_writes" counters.)
>>
>> Excellent suggestion and I've added IMC data_reads and data_writes to 
>> the tool.
> 
> Thanks, it looks fine too.  I'm just wondering about the numbers
> it's reporting on SKL GT2...
> 
> AFAIK IMC counters are for uncore, so I thought that they should
> correspond to GTI (memory interface to outside of GPU) read and
> write HW counter values.  While it seemed in some cases quite close,
> in some cases it showed a much smaller (2/3) value than expected.
> 
> I can understand why reads are sometimes larger, because I think
> uncore will include also display engine display content reads.
> 
> However, I don't see how uncore writes could be considerably smaller
> than the GTI interface write amount.
> 
> (GTI interface reports the expected value which corresponds directly
> to what my test application is doing (64x blended FullHD layer writes).)
> 
> Idle machine read amounts are also much smaller (60-65MB/s) than what
> I think display update read should be (1920*1080*4*60Hz = 475MiB/s).
> 
> Any ideas for these two discrepancies?

I'm afraid I am not familiar with the uncore IMC, but we could always 
approach its authors?

>>> Is "wait" value supposed to be IO-wait for given engine interface?
>>>
>>> I never saw that change from 0%, although IO-wait in top jumped
>>> from 0 to 20-30% with my test GPU load.
>>
>> No, that is time spent in MI_WAIT_FOR_EVENT.
> 
> Could you add that info to the UI?
> 
> E.g. just have "MI" on top of the "wait" column.

Like a full header strip? Yeah makes sense, I'll add it.

>  > I think not very used in current codebase.
> 
> What you're using to validate that it reports correct value?

That would be igt/tests/perf_pmu/event-wait-rcs0.

> 
>>> HW specific test results
>>> ------------------------
>>>
>>> BYT:
>>> * Reports "Failed to initialize PMU!" although old intel_gpu_top
>>>    works fine.
>>>
>>> HSW GT2,  BDW GT3, SKL GT2 and KBL GT3e seems to work fine except
>>> for the "wait" value.
>>>
>>> I never saw blitter engine to do anything, but that's because
>>> modesetting uses just 3D pipeline, and because I couldn't get
>>> Intel DDX to work with rest of latest git version of X / 3D stack.
>>
>> Thank you for testing this so thoroughly - this was really invaluable 
>> since I don't have access to such a number of platforms. I've tried to 
>> fix all this in the latest version.
> 
> Machines are currently running tests, I'll check these tomorrow.

Thanks!

> 
>>> Kernel version support
>>> ----------------------
>>>
>>> My HW specific testing above was with drm-tip kernel, but I did one test
>>> also with Ubuntu 16.04 v4.4 kernel (which includes v4.6 or v4.8 i915 
>>> backport) on KBL.  For that, the tool reported:
>>> "Failed to detect engines!"
>>>
>>> Although the previous intel_gpu_top works fine with that kernel version.
>>>
>>> Same happens also with Ubuntu 17.04 v4.13 kernel.
>>>
>>>
>>> -> If new version needs a certain kernel version, it should tell
>>>     which version is required.
>>
>> Yep, at least 4.16 is needed so I have added this info to the error 
>> message.
> 
> IMHO the message is a bit ambiguous:
>      Failed to detect engines! Kernel 4.16 or newer?
> 
> I would suggest checking whether kernel is new enough, and if not:
>      Kernel X.YY detected, 4.16 or newer required.

Maybe yeah. I was planning to improve error messages altogether but 
forgot. Will see what improvements make sense.

Regards,

Tvrtko
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [PATCH i-g-t v5] intel-gpu-top: Rewrite the tool to be safe to use
  2018-03-28 18:29 ` [Intel-gfx] " Tvrtko Ursulin
@ 2018-04-04  9:48   ` Tvrtko Ursulin
  -1 siblings, 0 replies; 57+ messages in thread
From: Tvrtko Ursulin @ 2018-04-04  9:48 UTC (permalink / raw)
  To: igt-dev; +Cc: Rinat Ibragimov, Eero Tamminen, Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

intel-gpu-top is a dangerous tool which can hang machines due to unsafe mmio
register access. This patch rewrites it to use only PMU.

Only overall command streamer busyness and GPU global data such as power
and frequencies are included in this new version.

For access to more GPU functional unit level data, an OA metric based tool
like gpu-top should be used instead.

v2:
 * Sort engines by class and instance.
 * Do not wait for one sampling period to display something on screen.
 * Move code out of the asserts. (Rinat Ibragimov)
 * Continuously adapt to terminal size. (Rinat Ibragimov)

v3:
 * Change layout and precision of some fields. (Chris Wilson)
 Eero Tamminen:
 * Use more user friendly engine names.
 * Don't error out if a counter is missing.
 * Add IMC read/write bandwidth.
 * Report minimum required kernel version.

v4:
 * Really support 4.16 by skipping missing engines.
 * Simpler and less hacky float printing.
 * Preserve copyright header. (Antonio Argenziano)
 * Simplify engines_ptr macro. (Rinat Ibragimov)

v5:
 * Get RAPL unit from sysfs.
 * Consolidate sysfs paths with a macro.
 * Tidy error handling by carrying over and reporting errno.
 * Check against console height on all prints.
 * More readable minimum kernel version message. (Eero Tamminen)
 * Column banner for per engine stats. (Eero Tamminen)

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: Petri Latvala <petri.latvala@intel.com>
Cc: Eero Tamminen <eero.t.tamminen@intel.com>
Cc: Rinat Ibragimov <ibragimovrinat@mail.ru>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> # v1
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> # v0.5
---
 lib/igt_perf.c        |    6 +
 lib/igt_perf.h        |    1 +
 tools/Makefile.am     |    2 +
 tools/intel_gpu_top.c | 1250 +++++++++++++++++++++++++++----------------------
 tools/meson.build     |    6 +-
 5 files changed, 707 insertions(+), 558 deletions(-)

diff --git a/lib/igt_perf.c b/lib/igt_perf.c
index 99d82ea51c9b..e3dec2cc29c7 100644
--- a/lib/igt_perf.c
+++ b/lib/igt_perf.c
@@ -69,3 +69,9 @@ int igt_perf_open(uint64_t type, uint64_t config)
 	return _perf_open(type, config, -1,
 			  PERF_FORMAT_TOTAL_TIME_ENABLED);
 }
+
+int igt_perf_open_group(uint64_t type, uint64_t config, int group)
+{
+	return _perf_open(type, config, group,
+			  PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_GROUP);
+}
diff --git a/lib/igt_perf.h b/lib/igt_perf.h
index 614ea5d23fa6..e00718f4769a 100644
--- a/lib/igt_perf.h
+++ b/lib/igt_perf.h
@@ -55,5 +55,6 @@ uint64_t i915_type_id(void);
 int perf_i915_open(uint64_t config);
 int perf_i915_open_group(uint64_t config, int group);
 int igt_perf_open(uint64_t type, uint64_t config);
+int igt_perf_open_group(uint64_t type, uint64_t config, int group);
 
 #endif /* I915_PERF_H */
diff --git a/tools/Makefile.am b/tools/Makefile.am
index 09b6dbcc3ece..a0b016ddd7ff 100644
--- a/tools/Makefile.am
+++ b/tools/Makefile.am
@@ -28,6 +28,8 @@ intel_aubdump_la_LDFLAGS = -module -avoid-version -no-undefined
 intel_aubdump_la_SOURCES = aubdump.c
 intel_aubdump_la_LIBADD = $(top_builddir)/lib/libintel_tools.la -ldl
 
+intel_gpu_top_LDADD = $(top_builddir)/lib/libigt_perf.la
+
 bin_SCRIPTS = intel_aubdump
 CLEANFILES = $(bin_SCRIPTS)
 
diff --git a/tools/intel_gpu_top.c b/tools/intel_gpu_top.c
index 098e6ce3ff86..577ae1269b13 100644
--- a/tools/intel_gpu_top.c
+++ b/tools/intel_gpu_top.c
@@ -1,6 +1,5 @@
 /*
- * Copyright © 2007 Intel Corporation
- * Copyright © 2011 Intel Corporation
+ * Copyright © 2007-2018 Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -24,695 +23,832 @@
  * Authors:
  *    Eric Anholt <eric@anholt.net>
  *    Eugeni Dodonov <eugeni.dodonov@intel.com>
- *
  */
 
-#include "config.h"
-
-#include <inttypes.h>
-#include <unistd.h>
-#include <stdlib.h>
 #include <stdio.h>
-#include <err.h>
-#include <sys/ioctl.h>
-#include <sys/time.h>
-#include <sys/wait.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <stdint.h>
+#include <assert.h>
 #include <string.h>
-#ifdef HAVE_TERMIOS_H
-#include <termios.h>
-#endif
-#include "intel_io.h"
-#include "instdone.h"
-#include "intel_reg.h"
-#include "intel_chipset.h"
-#include "drmtest.h"
-
-#define  FORCEWAKE	    0xA18C
-#define  FORCEWAKE_ACK	    0x130090
-
-#define SAMPLES_PER_SEC             10000
-#define SAMPLES_TO_PERCENT_RATIO    (SAMPLES_PER_SEC / 100)
-
-#define MAX_NUM_TOP_BITS            100
-
-#define HAS_STATS_REGS(devid)		IS_965(devid)
-
-struct top_bit {
-	struct instdone_bit *bit;
-	int count;
-} top_bits[MAX_NUM_TOP_BITS];
-struct top_bit *top_bits_sorted[MAX_NUM_TOP_BITS];
-
-static uint32_t instdone, instdone1;
-
-static const char *bars[] = {
-	" ",
-	"▏",
-	"▎",
-	"▍",
-	"▌",
-	"▋",
-	"▊",
-	"▉",
-	"█"
-};
+#include <ctype.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <sys/ioctl.h>
+#include <errno.h>
+#include <math.h>
+#include <locale.h>
+
+#include "igt_perf.h"
 
-enum stats_counts {
-	IA_VERTICES,
-	IA_PRIMITIVES,
-	VS_INVOCATION,
-	GS_INVOCATION,
-	GS_PRIMITIVES,
-	CL_INVOCATION,
-	CL_PRIMITIVES,
-	PS_INVOCATION,
-	PS_DEPTH,
-	STATS_COUNT
+struct pmu_pair {
+	uint64_t cur;
+	uint64_t prev;
 };
 
-const uint32_t stats_regs[STATS_COUNT] = {
-	IA_VERTICES_COUNT_QW,
-	IA_PRIMITIVES_COUNT_QW,
-	VS_INVOCATION_COUNT_QW,
-	GS_INVOCATION_COUNT_QW,
-	GS_PRIMITIVES_COUNT_QW,
-	CL_INVOCATION_COUNT_QW,
-	CL_PRIMITIVES_COUNT_QW,
-	PS_INVOCATION_COUNT_QW,
-	PS_DEPTH_COUNT_QW,
+struct pmu_counter {
+	bool present;
+	uint64_t config;
+	unsigned int idx;
+	struct pmu_pair val;
 };
 
-const char *stats_reg_names[STATS_COUNT] = {
-	"vert fetch",
-	"prim fetch",
-	"VS invocations",
-	"GS invocations",
-	"GS prims",
-	"CL invocations",
-	"CL prims",
-	"PS invocations",
-	"PS depth pass",
+struct engine {
+	const char *name;
+	const char *display_name;
+
+	unsigned int class;
+	unsigned int instance;
+
+	unsigned int num_counters;
+
+	struct pmu_counter busy;
+	struct pmu_counter wait;
+	struct pmu_counter sema;
 };
 
-uint64_t stats[STATS_COUNT];
-uint64_t last_stats[STATS_COUNT];
+struct engines {
+	unsigned int num_engines;
+	unsigned int num_counters;
+	DIR *root;
+	int fd;
+	struct pmu_pair ts;
+
+	int rapl_fd;
+	double rapl_scale;
+	const char *rapl_unit;
+
+	int imc_fd;
+	double imc_reads_scale;
+	const char *imc_reads_unit;
+	double imc_writes_scale;
+	const char *imc_writes_unit;
+
+	struct pmu_counter freq_req;
+	struct pmu_counter freq_act;
+	struct pmu_counter irq;
+	struct pmu_counter rc6;
+	struct pmu_counter rapl;
+	struct pmu_counter imc_reads;
+	struct pmu_counter imc_writes;
+
+	struct engine engine;
+};
 
-static unsigned long
-gettime(void)
+static uint64_t
+get_pmu_config(int dirfd, const char *name, const char *counter)
 {
-    struct timeval t;
-    gettimeofday(&t, NULL);
-    return (t.tv_usec + (t.tv_sec * 1000000));
-}
+	char buf[128], *p;
+	int fd, ret;
 
-static int
-top_bits_sort(const void *a, const void *b)
-{
-	struct top_bit * const *bit_a = a;
-	struct top_bit * const *bit_b = b;
-	int a_count = (*bit_a)->count;
-	int b_count = (*bit_b)->count;
+	ret = snprintf(buf, sizeof(buf), "%s-%s", name, counter);
+	if (ret < 0 || ret == sizeof(buf))
+		return -1;
 
-	if (a_count < b_count)
-		return 1;
-	else if (a_count == b_count)
-		return 0;
-	else
+	fd = openat(dirfd, buf, O_RDONLY);
+	if (fd < 0)
 		return -1;
-}
 
-static void
-update_idle_bit(struct top_bit *top_bit)
-{
-	uint32_t reg_val;
+	ret = read(fd, buf, sizeof(buf));
+	close(fd);
+	if (ret <= 0)
+		return -1;
 
-	if (top_bit->bit->reg == INSTDONE_1)
-		reg_val = instdone1;
-	else
-		reg_val = instdone;
+	p = index(buf, '0');
+	if (!p)
+		return -1;
 
-	if ((reg_val & top_bit->bit->bit) == 0)
-		top_bit->count++;
+	return strtoul(p, NULL, 0);
 }
 
-static void
-print_clock(const char *name, int clock) {
-	if (clock == -1)
-		printf("%s clock: unknown", name);
+#define engine_ptr(engines, n) (&engines->engine + (n))
+
+static const char *class_display_name(unsigned int class)
+{
+	switch (class) {
+	case I915_ENGINE_CLASS_RENDER:
+		return "Render/3D";
+	case I915_ENGINE_CLASS_COPY:
+		return "Blitter";
+	case I915_ENGINE_CLASS_VIDEO:
+		return "Video";
+	case I915_ENGINE_CLASS_VIDEO_ENHANCE:
+		return "VideoEnhance";
+	default:
+		return "[unknown]";
+	}
+}
+
+static int engine_cmp(const void *__a, const void *__b)
+{
+	const struct engine *a = (struct engine *)__a;
+	const struct engine *b = (struct engine *)__b;
+
+	if (a->class != b->class)
+		return a->class - b->class;
 	else
-		printf("%s clock: %d Mhz", name, clock);
+		return a->instance - b->instance;
 }
 
-static int
-print_clock_info(struct pci_device *pci_dev)
+static struct engines *discover_engines(void)
 {
-	uint32_t devid = pci_dev->device_id;
-	uint16_t gcfgc;
+	const char *sysfs_root = "/sys/devices/i915/events";
+	struct engines *engines;
+	struct dirent *dent;
+	int ret = 0;
+	DIR *d;
 
-	if (IS_GM45(devid)) {
-		int core_clock = -1;
+	engines = malloc(sizeof(struct engines));
+	if (!engines)
+		return NULL;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+	memset(engines, 0, sizeof(*engines));
 
-		switch (gcfgc & 0xf) {
-		case 8:
-			core_clock = 266;
-			break;
-		case 9:
-			core_clock = 320;
-			break;
-		case 11:
-			core_clock = 400;
-			break;
-		case 13:
-			core_clock = 533;
-			break;
-		}
-		print_clock("core", core_clock);
-	} else if (IS_965(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, sampler_clock = -1;
+	engines->num_engines = 0;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+	d = opendir(sysfs_root);
+	if (!d)
+		return NULL;
 
-		switch (gcfgc & 0xf) {
-		case 2:
-			render_clock = 250; sampler_clock = 267;
-			break;
-		case 3:
-			render_clock = 320; sampler_clock = 333;
-			break;
-		case 4:
-			render_clock = 400; sampler_clock = 444;
-			break;
-		case 5:
-			render_clock = 500; sampler_clock = 533;
+	while ((dent = readdir(d)) != NULL) {
+		const char *endswith = "-busy";
+		const unsigned int endlen = strlen(endswith);
+		struct engine *engine =
+				engine_ptr(engines, engines->num_engines);
+		char buf[256];
+
+		if (dent->d_type != DT_REG)
+			continue;
+
+		if (strlen(dent->d_name) >= sizeof(buf)) {
+			ret = ENAMETOOLONG;
 			break;
 		}
 
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("sampler", sampler_clock);
-	} else if (IS_945(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, display_clock = -1;
+		strcpy(buf, dent->d_name);
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+		/* xxxN-busy */
+		if (strlen(buf) < (endlen + 4))
+			continue;
+		if (strcmp(&buf[strlen(buf) - endlen], endswith))
+			continue;
 
-		switch (gcfgc & 0x7) {
-		case 0:
-			render_clock = 166;
-			break;
-		case 1:
-			render_clock = 200;
-			break;
-		case 3:
-			render_clock = 250;
-			break;
-		case 5:
-			render_clock = 400;
+		memset(engine, 0, sizeof(*engine));
+
+		buf[strlen(buf) - endlen] = 0;
+		engine->name = strdup(buf);
+		if (!engine->name) {
+			ret = errno;
 			break;
 		}
 
-		switch (gcfgc & 0x70) {
-		case 0:
-			display_clock = 200;
-			break;
-		case 4:
-			display_clock = 320;
+		engine->busy.config = get_pmu_config(dirfd(d), engine->name,
+						     "busy");
+		if (engine->busy.config == -1) {
+			ret = ENOENT;
 			break;
 		}
-		if (gcfgc & (1 << 7))
-		    display_clock = 133;
 
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("display", display_clock);
-	} else if (IS_915(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, display_clock = -1;
+		engine->class = (engine->busy.config &
+				 (__I915_PMU_OTHER(0) - 1)) >>
+				I915_PMU_CLASS_SHIFT;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+		engine->instance = (engine->busy.config >>
+				    I915_PMU_SAMPLE_BITS) &
+				    ((1 << I915_PMU_SAMPLE_INSTANCE_BITS) - 1);
 
-		switch (gcfgc & 0x7) {
-		case 0:
-			render_clock = 160;
-			break;
-		case 1:
-			render_clock = 190;
-			break;
-		case 4:
-			render_clock = 333;
+		ret = snprintf(buf, sizeof(buf), "%s/%u",
+			       class_display_name(engine->class),
+			       engine->instance);
+		if (ret < 0 || ret == sizeof(buf)) {
+			ret = ENOBUFS;
 			break;
 		}
-		if (gcfgc & (1 << 13))
-		    render_clock = 133;
+		ret = 0;
 
-		switch (gcfgc & 0x70) {
-		case 0:
-			display_clock = 190;
+		engine->display_name = strdup(buf);
+		if (!engine->display_name) {
+			ret = errno;
 			break;
-		case 4:
-			display_clock = 333;
+		}
+
+		engines->num_engines++;
+		engines = realloc(engines, sizeof(struct engines) +
+				  engines->num_engines * sizeof(struct engine));
+		if (!engines) {
+			ret = errno;
 			break;
 		}
-		if (gcfgc & (1 << 7))
-		    display_clock = 133;
+	}
+
+	if (ret) {
+		free(engines);
+		errno = ret;
 
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("display", display_clock);
+		return NULL;
 	}
 
+	qsort(engine_ptr(engines, 0), engines->num_engines,
+	      sizeof(struct engine), engine_cmp);
+
+	engines->root = d;
 
-	printf("\n");
-	return -1;
+	return engines;
 }
 
-#define STATS_LEN (20)
-#define PERCENTAGE_BAR_END	(79 - STATS_LEN)
+static int
+filename_to_buf(const char *filename, char *buf, unsigned int bufsize)
+{
+	int fd, err;
+	ssize_t ret;
 
-static void
-print_percentage_bar(float percent, int cur_line_len)
+	fd = open(filename, O_RDONLY);
+	if (fd < 0)
+		return -1;
+
+	ret = read(fd, buf, bufsize - 1);
+	err = errno;
+	close(fd);
+	if (ret < 1) {
+		errno = ret < 0 ? err : ENOMSG;
+
+		return -1;
+	}
+
+	if (ret > 1 && buf[ret - 1] == '\n')
+		buf[ret - 1] = '\0';
+	else
+		buf[ret] = '\0';
+
+	return 0;
+}
+
+static uint64_t filename_to_u64(const char *filename, int base)
 {
-	int bar_avail_len = (PERCENTAGE_BAR_END - cur_line_len - 1) * 8;
-	int bar_len = bar_avail_len * (percent + .5) / 100.0;
-	int i;
+	char buf[64], *b;
 
-	for (i = bar_len; i >= 8; i -= 8) {
-		printf("%s", bars[8]);
-		cur_line_len++;
+	if (filename_to_buf(filename, buf, sizeof(buf)))
+		return 0;
+
+	/*
+	 * Handle both single integer and key=value formats by skipping
+	 * leading non-digits.
+	 */
+	b = buf;
+	while (*b && !isdigit(*b))
+		b++;
+
+	return strtoull(b, NULL, base);
+}
+
+static double filename_to_double(const char *filename)
+{
+	char *oldlocale;
+	char buf[80];
+	double v;
+
+	if (filename_to_buf(filename, buf, sizeof(buf)))
+		return 0;
+
+	oldlocale = setlocale(LC_ALL, "C");
+	v = strtod(buf, NULL);
+	setlocale(LC_ALL, oldlocale);
+
+	return v;
+}
+
+#define RAPL_ROOT "/sys/devices/power/"
+#define RAPL_EVENT "/sys/devices/power/events/"
+
+static uint64_t rapl_type_id(void)
+{
+	return filename_to_u64(RAPL_ROOT "type", 10);
+}
+
+static uint64_t rapl_gpu_power(void)
+{
+	return filename_to_u64(RAPL_EVENT "energy-gpu", 0);
+}
+
+static double rapl_gpu_power_scale(void)
+{
+	return filename_to_double(RAPL_EVENT "energy-gpu.scale");
+}
+
+static const char *rapl_gpu_power_unit(void)
+{
+	char buf[32];
+
+	if (filename_to_buf(RAPL_EVENT "energy-gpu.unit",
+			    buf, sizeof(buf)) == 0)
+		if (!strcmp(buf, "Joules"))
+			return strdup("Watts");
+		else
+			return strdup(buf);
+	else
+		return NULL;
+}
+
+#define IMC_ROOT "/sys/devices/uncore_imc/"
+#define IMC_EVENT "/sys/devices/uncore_imc/events/"
+
+static uint64_t imc_type_id(void)
+{
+	return filename_to_u64(IMC_ROOT "type", 10);
+}
+
+static uint64_t imc_data_reads(void)
+{
+	return filename_to_u64(IMC_EVENT "data_reads", 0);
+}
+
+static double imc_data_reads_scale(void)
+{
+	return filename_to_double(IMC_EVENT "data_reads.scale");
+}
+
+static const char *imc_data_reads_unit(void)
+{
+	char buf[32];
+
+	if (filename_to_buf(IMC_EVENT "data_reads.unit", buf, sizeof(buf)) == 0)
+		return strdup(buf);
+	else
+		return NULL;
+}
+
+static uint64_t imc_data_writes(void)
+{
+	return filename_to_u64(IMC_EVENT "data_writes", 0);
+}
+
+static double imc_data_writes_scale(void)
+{
+	return filename_to_double(IMC_EVENT "data_writes.scale");
+}
+
+static const char *imc_data_writes_unit(void)
+{
+	char buf[32];
+
+	if (filename_to_buf(IMC_EVENT "data_writes.unit",
+			    buf, sizeof(buf)) == 0)
+		return strdup(buf);
+	else
+		return NULL;
+}
+
+#define _open_pmu(cnt, pmu, fd) \
+({ \
+	int fd__; \
+\
+	fd__ = perf_i915_open_group((pmu)->config, (fd)); \
+	if (fd__ >= 0) { \
+		if ((fd) == -1) \
+			(fd) = fd__; \
+		(pmu)->present = true; \
+		(pmu)->idx = (cnt)++; \
+	} \
+\
+	fd__; \
+})
+
+#define _open_imc(cnt, pmu, fd) \
+({ \
+	int fd__; \
+\
+	fd__ = igt_perf_open_group(imc_type_id(), (pmu)->config, (fd)); \
+	if (fd__ >= 0) { \
+		if ((fd) == -1) \
+			(fd) = fd__; \
+		(pmu)->present = true; \
+		(pmu)->idx = (cnt)++; \
+	} \
+\
+	fd__; \
+})
+
+static int pmu_init(struct engines *engines)
+{
+	unsigned int i;
+	int fd;
+
+	engines->fd = -1;
+	engines->num_counters = 0;
+
+	engines->irq.config = I915_PMU_INTERRUPTS;
+	fd = _open_pmu(engines->num_counters, &engines->irq, engines->fd);
+	if (fd < 0)
+		return -1;
+
+	engines->freq_req.config = I915_PMU_REQUESTED_FREQUENCY;
+	_open_pmu(engines->num_counters, &engines->freq_req, engines->fd);
+
+	engines->freq_act.config = I915_PMU_ACTUAL_FREQUENCY;
+	_open_pmu(engines->num_counters, &engines->freq_act, engines->fd);
+
+	engines->rc6.config = I915_PMU_RC6_RESIDENCY;
+	_open_pmu(engines->num_counters, &engines->rc6, engines->fd);
+
+	for (i = 0; i < engines->num_engines; i++) {
+		struct engine *engine = engine_ptr(engines, i);
+		struct {
+			struct pmu_counter *pmu;
+			const char *counter;
+		} *cnt, counters[] = {
+			{ .pmu = &engine->busy, .counter = "busy" },
+			{ .pmu = &engine->wait, .counter = "wait" },
+			{ .pmu = &engine->sema, .counter = "sema" },
+			{ .pmu = NULL, .counter = NULL },
+		};
+
+		for (cnt = counters; cnt->pmu; cnt++) {
+			if (!cnt->pmu->config)
+				cnt->pmu->config =
+					get_pmu_config(dirfd(engines->root),
+						       engine->name,
+						       cnt->counter);
+			fd = _open_pmu(engines->num_counters, cnt->pmu,
+				       engines->fd);
+			if (fd >= 0)
+				engine->num_counters++;
+		}
 	}
-	if (i) {
-		printf("%s", bars[i]);
-		cur_line_len++;
+
+	engines->rapl_fd = -1;
+	if (rapl_type_id()) {
+		engines->rapl_scale = rapl_gpu_power_scale();
+		engines->rapl_unit = rapl_gpu_power_unit();
+		if (!engines->rapl_unit)
+			return -1;
+
+		engines->rapl.config = rapl_gpu_power();
+		if (!engines->rapl.config)
+			return -1;
+
+		engines->rapl_fd = igt_perf_open(rapl_type_id(),
+						 engines->rapl.config);
+		if (engines->rapl_fd < 0)
+			return -1;
+
+		engines->rapl.present = true;
 	}
 
-	/* NB: We can't use a field width with utf8 so we manually
-	* guarantee a field with of 45 chars for any bar. */
-	printf("%*s", PERCENTAGE_BAR_END - cur_line_len, "");
-}
+	engines->imc_fd = -1;
+	if (imc_type_id()) {
+		unsigned int num = 0;
 
-struct ring {
-	const char *name;
-	uint32_t mmio;
-	int head, tail, size;
-	uint64_t full;
-	int idle;
-};
+		engines->imc_reads_scale = imc_data_reads_scale();
+		engines->imc_writes_scale = imc_data_writes_scale();
+
+		engines->imc_reads_unit = imc_data_reads_unit();
+		if (!engines->imc_reads_unit)
+			return -1;
+
+		engines->imc_writes_unit = imc_data_writes_unit();
+		if (!engines->imc_writes_unit)
+			return -1;
+
+		engines->imc_reads.config = imc_data_reads();
+		if (!engines->imc_reads.config)
+			return -1;
+
+		engines->imc_writes.config = imc_data_writes();
+		if (!engines->imc_writes.config)
+			return -1;
+
+		fd = _open_imc(num, &engines->imc_reads, engines->imc_fd);
+		if (fd < 0)
+			return -1;
+		fd = _open_imc(num, &engines->imc_writes, engines->imc_fd);
+		if (fd < 0)
+			return -1;
+
+		engines->imc_reads.present = true;
+		engines->imc_writes.present = true;
+	}
+
+	return 0;
+}
 
-static uint32_t ring_read(struct ring *ring, uint32_t reg)
+static uint64_t pmu_read_multi(int fd, unsigned int num, uint64_t *val)
 {
-	return INREG(ring->mmio + reg);
+	uint64_t buf[2 + num];
+	unsigned int i;
+	ssize_t len;
+
+	memset(buf, 0, sizeof(buf));
+
+	len = read(fd, buf, sizeof(buf));
+	assert(len == sizeof(buf));
+
+	for (i = 0; i < num; i++)
+		val[i] = buf[2 + i];
+
+	return buf[1];
 }
 
-static void ring_init(struct ring *ring)
+static double __pmu_calc(struct pmu_pair *p, double d, double t, double s)
 {
-	ring->size = (((ring_read(ring, RING_LEN) & RING_NR_PAGES) >> 12) + 1) * 4096;
+	double v;
+
+	v = p->cur - p->prev;
+	v /= d;
+	v /= t;
+	v *= s;
+
+	if (s == 100.0 && v > 100.0)
+		v = 100.0;
+
+	return v;
 }
 
-static void ring_reset(struct ring *ring)
+static void fill_str(char *buf, unsigned int bufsz, char c, unsigned int num)
 {
-	ring->idle = ring->full = 0;
+	unsigned int i;
+
+	for (i = 0; i < num && i < (bufsz - 1); i++)
+		*buf++ = c;
+
+	*buf = 0;
 }
 
-static void ring_sample(struct ring *ring)
+static void pmu_calc(struct pmu_counter *cnt,
+		     char *buf, unsigned int bufsz,
+		     unsigned int width, unsigned width_dec,
+		     double d, double t, double s)
 {
-	int full;
+	double val;
+	int len;
+
+	assert(bufsz >= (width + width_dec + 1));
+
+	if (!cnt->present) {
+		fill_str(buf, bufsz, '-', width + width_dec);
+		return;
+	}
 
-	if (!ring->size)
+	val = __pmu_calc(&cnt->val, d, t, s);
+
+	len = snprintf(buf, bufsz, "%*.*f", width + width_dec, width_dec, val);
+	if (len < 0 || len == bufsz) {
+		fill_str(buf, bufsz, 'X', width + width_dec);
 		return;
+	}
+}
+
+static uint64_t __pmu_read_single(int fd, uint64_t *ts)
+{
+	uint64_t data[2] = { };
+	ssize_t len;
 
-	ring->head = ring_read(ring, RING_HEAD) & HEAD_ADDR;
-	ring->tail = ring_read(ring, RING_TAIL) & TAIL_ADDR;
+	len = read(fd, data, sizeof(data));
+	assert(len == sizeof(data));
 
-	if (ring->tail == ring->head)
-		ring->idle++;
+	if (ts)
+		*ts = data[1];
 
-	full = ring->tail - ring->head;
-	if (full < 0)
-		full += ring->size;
-	ring->full += full;
+	return data[0];
 }
 
-static void ring_print_header(FILE *out, struct ring *ring)
+static uint64_t pmu_read_single(int fd)
 {
-    fprintf(out, "%.6s%%\tops\t",
-            ring->name
-          );
+	return __pmu_read_single(fd, NULL);
 }
 
-static void ring_print(struct ring *ring, unsigned long samples_per_sec)
+static void __update_sample(struct pmu_counter *counter, uint64_t val)
 {
-	int percent_busy, len;
+	counter->val.prev = counter->val.cur;
+	counter->val.cur = val;
+}
 
-	if (!ring->size)
-		return;
+static void update_sample(struct pmu_counter *counter, uint64_t *val)
+{
+	if (counter->present)
+		__update_sample(counter, val[counter->idx]);
+}
+
+static void pmu_sample(struct engines *engines)
+{
+	const int num_val = engines->num_counters;
+	uint64_t val[2 + num_val];
+	unsigned int i;
+
+	engines->ts.prev = engines->ts.cur;
+
+	if (engines->rapl_fd >= 0)
+		__update_sample(&engines->rapl,
+				pmu_read_single(engines->rapl_fd));
+
+	if (engines->imc_fd >= 0) {
+		pmu_read_multi(engines->imc_fd, 2, val);
+		update_sample(&engines->imc_reads, val);
+		update_sample(&engines->imc_writes, val);
+	}
 
-	percent_busy = 100 - 100 * ring->idle / samples_per_sec;
+	engines->ts.cur = pmu_read_multi(engines->fd, num_val, val);
 
-	len = printf("%25s busy: %3d%%: ", ring->name, percent_busy);
-	print_percentage_bar (percent_busy, len);
-	printf("%24s space: %d/%d\n",
-		   ring->name,
-		   (int)(ring->full / samples_per_sec),
-		   ring->size);
+	update_sample(&engines->freq_req, val);
+	update_sample(&engines->freq_act, val);
+	update_sample(&engines->irq, val);
+	update_sample(&engines->rc6, val);
+
+	for (i = 0; i < engines->num_engines; i++) {
+		struct engine *engine = engine_ptr(engines, i);
+
+		update_sample(&engine->busy, val);
+		update_sample(&engine->sema, val);
+		update_sample(&engine->wait, val);
+	}
 }
 
-static void ring_log(struct ring *ring, unsigned long samples_per_sec,
-		FILE *output)
+static const char *bars[] = { " ", "▏", "▎", "▍", "▌", "▋", "▊", "▉", "█" };
+
+static void
+print_percentage_bar(double percent, int max_len)
 {
-	if (ring->size)
-		fprintf(output, "%3d\t%d\t",
-			(int)(100 - 100 * ring->idle / samples_per_sec),
-			(int)(ring->full / samples_per_sec));
-	else
-		fprintf(output, "-1\t-1\t");
+	int bar_len = percent * (8 * (max_len - 2)) / 100.0;
+	int i;
+
+	putchar('|');
+
+	for (i = bar_len; i >= 8; i -= 8)
+		printf("%s", bars[8]);
+	if (i)
+		printf("%s", bars[i]);
+
+	for (i = 0; i < (max_len - 2 - (bar_len + 7) / 8); i++)
+		putchar(' ');
+
+	putchar('|');
 }
 
+#define DEFAULT_PERIOD_MS (1000)
+
 static void
 usage(const char *appname)
 {
 	printf("intel_gpu_top - Display a top-like summary of Intel GPU usage\n"
-			"\n"
-			"usage: %s [parameters]\n"
-			"\n"
-			"The following parameters apply:\n"
-			"[-s <samples>]       samples per seconds (default %d)\n"
-			"[-e <command>]       command to profile\n"
-			"[-o <file>]          output statistics to file. If file is '-',"
-			"                     run in batch mode and output statistics to stdio only \n"
-			"[-h]                 show this help screen\n"
-			"\n",
-			appname,
-			SAMPLES_PER_SEC
-		  );
-	return;
+		"\n"
+		"Usage: %s [parameters]\n"
+		"\n"
+		"\tThe following parameters are optional:\n"
+		"\t[-s <samples>]       refresh period in ms (default %ums)\n"
+		"\t[-h]                 show this help text\n"
+		"\n",
+		appname, DEFAULT_PERIOD_MS);
 }
 
 int main(int argc, char **argv)
 {
-	uint32_t devid;
-	struct pci_device *pci_dev;
-	struct ring render_ring = {
-		.name = "render",
-		.mmio = 0x2030,
-	}, bsd_ring = {
-		.name = "bitstream",
-		.mmio = 0x4030,
-	}, bsd6_ring = {
-		.name = "bitstream",
-		.mmio = 0x12030,
-	}, blt_ring = {
-		.name = "blitter",
-		.mmio = 0x22030,
-	};
-	int i, ch;
-	int samples_per_sec = SAMPLES_PER_SEC;
-	FILE *output = NULL;
-	double elapsed_time=0;
-	int print_headers=1;
-	pid_t child_pid=-1;
-	int child_stat;
-	char *cmd=NULL;
-	int interactive=1;
-
-	/* Parse options? */
-	while ((ch = getopt(argc, argv, "s:o:e:h")) != -1) {
+	unsigned int period_us = DEFAULT_PERIOD_MS * 1000;
+	int con_w = -1, con_h = -1;
+	struct engines *engines;
+	unsigned int i;
+	int ret, ch;
+
+	/* Parse options */
+	while ((ch = getopt(argc, argv, "s:h")) != -1) {
 		switch (ch) {
-		case 'e': cmd = strdup(optarg);
-			break;
-		case 's': samples_per_sec = atoi(optarg);
-			if (samples_per_sec < 100) {
-				fprintf(stderr, "Error: samples per second must be >= 100\n");
-				exit(1);
-			}
-			break;
-		case 'o':
-			if (!strcmp(optarg, "-")) {
-				/* Running in non-interactive mode */
-				interactive = 0;
-				output = stdout;
-			}
-			else
-				output = fopen(optarg, "w");
-			if (!output)
-			{
-				perror("fopen");
-				exit(1);
-			}
+		case 's':
+			period_us = atoi(optarg) * 1000;
 			break;
 		case 'h':
 			usage(argv[0]);
 			exit(0);
-			break;
 		default:
-			fprintf(stderr, "Invalid flag %c!\n", (char)optopt);
+			fprintf(stderr, "Invalid option %c!\n", (char)optopt);
 			usage(argv[0]);
 			exit(1);
-			break;
 		}
 	}
 
-	pci_dev = intel_get_pci_device();
-	devid = pci_dev->device_id;
-	intel_mmio_use_pci_bar(pci_dev);
-	init_instdone_definitions(devid);
-
-	/* Do we have a command to run? */
-	if (cmd != NULL) {
-		if (output) {
-			fprintf(output, "# Profiling: %s\n", cmd);
-			fflush(output);
-		}
-		child_pid = fork();
-		if (child_pid < 0) {
-			perror("fork");
-			exit(1);
-		}
-		else if (child_pid == 0) {
-			int res;
-			res = system(cmd);
-			if (res < 0)
-				perror("running command");
-			if (output) {
-				fflush(output);
-				fprintf(output, "# %s exited with status %d\n", cmd, res);
-				fflush(output);
-			}
-			free(cmd);
-			exit(0);
-		} else {
-			free(cmd);
-		}
+	engines = discover_engines();
+	if (!engines) {
+		fprintf(stderr,
+			"Failed to detect engines! (%s)\n(Kernel 4.16 or newer is required for i915 PMU support.)\n",
+			strerror(errno));
+		return 1;
 	}
 
-	for (i = 0; i < num_instdone_bits; i++) {
-		top_bits[i].bit = &instdone_bits[i];
-		top_bits[i].count = 0;
-		top_bits_sorted[i] = &top_bits[i];
+	ret = pmu_init(engines);
+	if (ret) {
+		fprintf(stderr,
+			"Failed to initialize PMU! (%s)\n", strerror(errno));
+		return 1;
 	}
 
-	/* Grab access to the registers */
-	intel_register_access_init(pci_dev, 0, -1);
+	pmu_sample(engines);
 
-	ring_init(&render_ring);
-	if (IS_GEN4(devid) || IS_GEN5(devid))
-		ring_init(&bsd_ring);
-	if (IS_GEN6(devid) || IS_GEN7(devid)) {
-		ring_init(&bsd6_ring);
-		ring_init(&blt_ring);
-	}
+	for (;;) {
+		double t;
+#define BUFSZ 16
+		char freq[BUFSZ];
+		char fact[BUFSZ];
+		char irq[BUFSZ];
+		char rc6[BUFSZ];
+		char power[BUFSZ];
+		char reads[BUFSZ];
+		char writes[BUFSZ];
+		struct winsize ws;
+		int lines = 0;
 
-	/* Initialize GPU stats */
-	if (HAS_STATS_REGS(devid)) {
-		for (i = 0; i < STATS_COUNT; i++) {
-			uint32_t stats_high, stats_low, stats_high_2;
+		/* Update terminal size. */
+		if (ioctl(0, TIOCGWINSZ, &ws) != -1) {
+			con_w = ws.ws_col;
+			con_h = ws.ws_row;
+		}
 
-			do {
-				stats_high = INREG(stats_regs[i] + 4);
-				stats_low = INREG(stats_regs[i]);
-				stats_high_2 = INREG(stats_regs[i] + 4);
-			} while (stats_high != stats_high_2);
+		pmu_sample(engines);
+		t = (double)(engines->ts.cur - engines->ts.prev) / 1e9;
 
-			last_stats[i] = (uint64_t)stats_high << 32 |
-				stats_low;
-		}
-	}
+		printf("\033[H\033[J");
 
-	for (;;) {
-		int j;
-		unsigned long long t1, ti, tf, t2;
-		unsigned long long def_sleep = 1000000 / samples_per_sec;
-		unsigned long long last_samples_per_sec = samples_per_sec;
-		unsigned short int max_lines;
-		struct winsize ws;
-		char clear_screen[] = {0x1b, '[', 'H',
-				       0x1b, '[', 'J',
-				       0x0};
-		int percent;
-		int len;
-
-		t1 = gettime();
-
-		ring_reset(&render_ring);
-		ring_reset(&bsd_ring);
-		ring_reset(&bsd6_ring);
-		ring_reset(&blt_ring);
-
-		for (i = 0; i < samples_per_sec; i++) {
-			long long interval;
-			ti = gettime();
-			if (IS_965(devid)) {
-				instdone = INREG(INSTDONE_I965);
-				instdone1 = INREG(INSTDONE_1);
-			} else
-				instdone = INREG(INSTDONE);
-
-			for (j = 0; j < num_instdone_bits; j++)
-				update_idle_bit(&top_bits[j]);
-
-			ring_sample(&render_ring);
-			ring_sample(&bsd_ring);
-			ring_sample(&bsd6_ring);
-			ring_sample(&blt_ring);
-
-			tf = gettime();
-			if (tf - t1 >= 1000000) {
-				/* We are out of sync, bail out */
-				last_samples_per_sec = i+1;
-				break;
-			}
-			interval = def_sleep - (tf - ti);
-			if (interval > 0)
-				usleep(interval);
-		}
+		pmu_calc(&engines->freq_req, freq, BUFSZ, 4, 0, 1.0, t, 1);
+		pmu_calc(&engines->freq_act, fact, BUFSZ, 4, 0, 1.0, t, 1);
+		pmu_calc(&engines->irq, irq, BUFSZ, 8, 0, 1.0, t, 1);
+		pmu_calc(&engines->rc6, rc6, BUFSZ, 3, 0, 1e9, t, 100);
+		pmu_calc(&engines->rapl, power, BUFSZ, 4, 2, 1.0, t,
+			 engines->rapl_scale);
+		pmu_calc(&engines->imc_reads, reads, BUFSZ, 6, 0, 1.0, t,
+			 engines->imc_reads_scale);
+		pmu_calc(&engines->imc_writes, writes, BUFSZ, 6, 0, 1.0, t,
+			 engines->imc_writes_scale);
 
-		if (HAS_STATS_REGS(devid)) {
-			for (i = 0; i < STATS_COUNT; i++) {
-				uint32_t stats_high, stats_low, stats_high_2;
+		if (lines++ < con_h)
+			printf("intel-gpu-top - %s/%s MHz;  %s%% RC6; %s %s; %s irqs/s\n",
+			       fact, freq, rc6, power, engines->rapl_unit, irq);
 
-				do {
-					stats_high = INREG(stats_regs[i] + 4);
-					stats_low = INREG(stats_regs[i]);
-					stats_high_2 = INREG(stats_regs[i] + 4);
-				} while (stats_high != stats_high_2);
+		if (lines++ < con_h)
+			printf("\n");
 
-				stats[i] = (uint64_t)stats_high << 32 |
-					stats_low;
-			}
-		}
+		if (engines->imc_fd) {
+			if (lines++ < con_h)
+				printf("      IMC reads:   %s %s/s\n",
+				       reads, engines->imc_reads_unit);
+
+			if (lines++ < con_h)
+				printf("     IMC writes:   %s %s/s\n",
+				       writes, engines->imc_writes_unit);
 
-		qsort(top_bits_sorted, num_instdone_bits,
-		      sizeof(struct top_bit *), top_bits_sort);
-
-		/* Limit the number of lines printed to the terminal height so the
-		 * most important info (at the top) will stay on screen. */
-		max_lines = -1;
-		if (ioctl(0, TIOCGWINSZ, &ws) != -1)
-			max_lines = ws.ws_row - 6; /* exclude header lines */
-		if (max_lines >= num_instdone_bits)
-			max_lines = num_instdone_bits;
-
-		t2 = gettime();
-		elapsed_time += (t2 - t1) / 1000000.0;
-
-		if (interactive) {
-			printf("%s", clear_screen);
-			print_clock_info(pci_dev);
-
-			ring_print(&render_ring, last_samples_per_sec);
-			ring_print(&bsd_ring, last_samples_per_sec);
-			ring_print(&bsd6_ring, last_samples_per_sec);
-			ring_print(&blt_ring, last_samples_per_sec);
-
-			printf("\n%30s  %s\n", "task", "percent busy");
-			for (i = 0; i < max_lines; i++) {
-				if (top_bits_sorted[i]->count > 0) {
-					percent = (top_bits_sorted[i]->count * 100) /
-						last_samples_per_sec;
-					len = printf("%30s: %3d%%: ",
-							 top_bits_sorted[i]->bit->name,
-							 percent);
-					print_percentage_bar (percent, len);
-				} else {
-					printf("%*s", PERCENTAGE_BAR_END, "");
-				}
-
-				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-					printf("%13s: %llu (%lld/sec)",
-						   stats_reg_names[i],
-						   (long long)stats[i],
-						   (long long)(stats[i] - last_stats[i]));
-					last_stats[i] = stats[i];
-				} else {
-					if (!top_bits_sorted[i]->count)
-						break;
-				}
+			if (++lines < con_h)
 				printf("\n");
-			}
 		}
-		if (output) {
-			/* Print headers for columns at first run */
-			if (print_headers) {
-				fprintf(output, "# time\t");
-				ring_print_header(output, &render_ring);
-				ring_print_header(output, &bsd_ring);
-				ring_print_header(output, &bsd6_ring);
-				ring_print_header(output, &blt_ring);
-				for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
-					if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-						fprintf(output, "%.6s\t",
-							   stats_reg_names[i]
-							   );
-					}
-					if (!top_bits[i].count)
-						continue;
-				}
-				fprintf(output, "\n");
-				print_headers = 0;
-			}
 
-			/* Print statistics */
-			fprintf(output, "%.2f\t", elapsed_time);
-			ring_log(&render_ring, last_samples_per_sec, output);
-			ring_log(&bsd_ring, last_samples_per_sec, output);
-			ring_log(&bsd6_ring, last_samples_per_sec, output);
-			ring_log(&blt_ring, last_samples_per_sec, output);
-
-			for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
-				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-					fprintf(output, "%"PRIu64"\t",
-						   stats[i] - last_stats[i]);
-					last_stats[i] = stats[i];
-				}
-					if (!top_bits[i].count)
-						continue;
-			}
-			fprintf(output, "\n");
-			fflush(output);
-		}
+		for (i = 0; i < engines->num_engines; i++) {
+			struct engine *engine = engine_ptr(engines, i);
 
-		for (i = 0; i < num_instdone_bits; i++) {
-			top_bits_sorted[i]->count = 0;
+			if (engine->num_counters && lines < con_h) {
+				const char *a = "          ENGINE      BUSY ";
+				const char *b = " MI_SEMA MI_WAIT";
 
-			if (i < STATS_COUNT)
-				last_stats[i] = stats[i];
+				printf("\033[7m%s%*s%s\033[0m\n",
+				       a,
+				       (int)(con_w - 1 - strlen(a) - strlen(b)),
+				       " ", b);
+				lines++;
+				break;
+			}
 		}
 
-		/* Check if child has gone */
-		if (child_pid > 0) {
-			int res;
-			if ((res = waitpid(child_pid, &child_stat, WNOHANG)) == -1) {
-				perror("waitpid");
-				exit(1);
-			}
-			if (res == 0)
+		for (i = 0; i < engines->num_engines && lines < con_h; i++) {
+			struct engine *engine = engine_ptr(engines, i);
+			unsigned int max_w = con_w - 1;
+			unsigned int len;
+			char sema[BUFSZ];
+			char wait[BUFSZ];
+			char busy[BUFSZ];
+			char buf[128];
+			double val;
+
+			if (!engine->num_counters)
 				continue;
-			if (WIFEXITED(child_stat))
-				break;
+
+			pmu_calc(&engine->sema, sema, BUFSZ, 3, 0, 1e9, t, 100);
+			pmu_calc(&engine->wait, wait, BUFSZ, 3, 0, 1e9, t, 100);
+			len = snprintf(buf, sizeof(buf), "    %s%%    %s%%",
+				       sema, wait);
+
+			pmu_calc(&engine->busy, busy, BUFSZ, 6, 2, 1e9, t,
+				 100);
+			len += printf("%16s %s%% ", engine->display_name, busy);
+
+			val = __pmu_calc(&engine->busy.val, 1e9, t, 100);
+			print_percentage_bar(val, max_w - len);
+
+			printf("%s\n", buf);
+
+			lines++;
 		}
-	}
 
-	fclose(output);
+		if (lines++ < con_h)
+			printf("\n");
+
+		usleep(period_us);
+	}
 
-	intel_register_access_fini();
 	return 0;
 }
diff --git a/tools/meson.build b/tools/meson.build
index bd2d313d5156..a918eeb0bef1 100644
--- a/tools/meson.build
+++ b/tools/meson.build
@@ -23,7 +23,6 @@ tools_progs = [
 	'intel_gpu_frequency',
 	'intel_firmware_decode',
 	'intel_gpu_time',
-	'intel_gpu_top',
 	'intel_gtt',
 	'intel_guc_logger',
 	'intel_infoframes',
@@ -117,6 +116,11 @@ shared_library('intel_aubdump', 'aubdump.c',
 	       name_prefix : '',
 	       install : true)
 
+executable('intel_gpu_top', 'intel_gpu_top.c',
+	   install : true,
+	   install_rpath : rpathdir,
+	   dependencies : tool_deps + [ lib_igt_perf ])
+
 conf_data = configuration_data()
 conf_data.set('prefix', prefix)
 conf_data.set('exec_prefix', '${prefix}')
-- 
2.14.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [igt-dev] [PATCH i-g-t v5] intel-gpu-top: Rewrite the tool to be safe to use
@ 2018-04-04  9:48   ` Tvrtko Ursulin
  0 siblings, 0 replies; 57+ messages in thread
From: Tvrtko Ursulin @ 2018-04-04  9:48 UTC (permalink / raw)
  To: igt-dev; +Cc: Tvrtko Ursulin, Eero Tamminen, Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

intel-gpu-top is a dangerous tool which can hang machines due to unsafe mmio
register access. This patch rewrites it to use only PMU.

Only overall command streamer busyness and GPU global data such as power
and frequencies are included in this new version.

For access to more GPU functional unit level data, an OA metric based tool
like gpu-top should be used instead.

v2:
 * Sort engines by class and instance.
 * Do not wait for one sampling period to display something on screen.
 * Move code out of the asserts. (Rinat Ibragimov)
 * Continuously adapt to terminal size. (Rinat Ibragimov)

v3:
 * Change layout and precision of some fields. (Chris Wilson)
 Eero Tamminen:
 * Use more user friendly engine names.
 * Don't error out if a counter is missing.
 * Add IMC read/write bandwidth.
 * Report minimum required kernel version.

v4:
 * Really support 4.16 by skipping missing engines.
 * Simpler and less hacky float printing.
 * Preserve copyright header. (Antonio Argenziano)
 * Simplify engines_ptr macro. (Rinat Ibragimov)

v5:
 * Get RAPL unit from sysfs.
 * Consolidate sysfs paths with a macro.
 * Tidy error handling by carrying over and reporting errno.
 * Check against console height on all prints.
 * More readable minimum kernel version message. (Eero Tamminen)
 * Column banner for per engine stats. (Eero Tamminen)

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: Petri Latvala <petri.latvala@intel.com>
Cc: Eero Tamminen <eero.t.tamminen@intel.com>
Cc: Rinat Ibragimov <ibragimovrinat@mail.ru>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> # v1
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> # v0.5
---
 lib/igt_perf.c        |    6 +
 lib/igt_perf.h        |    1 +
 tools/Makefile.am     |    2 +
 tools/intel_gpu_top.c | 1250 +++++++++++++++++++++++++++----------------------
 tools/meson.build     |    6 +-
 5 files changed, 707 insertions(+), 558 deletions(-)

diff --git a/lib/igt_perf.c b/lib/igt_perf.c
index 99d82ea51c9b..e3dec2cc29c7 100644
--- a/lib/igt_perf.c
+++ b/lib/igt_perf.c
@@ -69,3 +69,9 @@ int igt_perf_open(uint64_t type, uint64_t config)
 	return _perf_open(type, config, -1,
 			  PERF_FORMAT_TOTAL_TIME_ENABLED);
 }
+
+int igt_perf_open_group(uint64_t type, uint64_t config, int group)
+{
+	return _perf_open(type, config, group,
+			  PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_GROUP);
+}
diff --git a/lib/igt_perf.h b/lib/igt_perf.h
index 614ea5d23fa6..e00718f4769a 100644
--- a/lib/igt_perf.h
+++ b/lib/igt_perf.h
@@ -55,5 +55,6 @@ uint64_t i915_type_id(void);
 int perf_i915_open(uint64_t config);
 int perf_i915_open_group(uint64_t config, int group);
 int igt_perf_open(uint64_t type, uint64_t config);
+int igt_perf_open_group(uint64_t type, uint64_t config, int group);
 
 #endif /* I915_PERF_H */
diff --git a/tools/Makefile.am b/tools/Makefile.am
index 09b6dbcc3ece..a0b016ddd7ff 100644
--- a/tools/Makefile.am
+++ b/tools/Makefile.am
@@ -28,6 +28,8 @@ intel_aubdump_la_LDFLAGS = -module -avoid-version -no-undefined
 intel_aubdump_la_SOURCES = aubdump.c
 intel_aubdump_la_LIBADD = $(top_builddir)/lib/libintel_tools.la -ldl
 
+intel_gpu_top_LDADD = $(top_builddir)/lib/libigt_perf.la
+
 bin_SCRIPTS = intel_aubdump
 CLEANFILES = $(bin_SCRIPTS)
 
diff --git a/tools/intel_gpu_top.c b/tools/intel_gpu_top.c
index 098e6ce3ff86..577ae1269b13 100644
--- a/tools/intel_gpu_top.c
+++ b/tools/intel_gpu_top.c
@@ -1,6 +1,5 @@
 /*
- * Copyright © 2007 Intel Corporation
- * Copyright © 2011 Intel Corporation
+ * Copyright © 2007-2018 Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -24,695 +23,832 @@
  * Authors:
  *    Eric Anholt <eric@anholt.net>
  *    Eugeni Dodonov <eugeni.dodonov@intel.com>
- *
  */
 
-#include "config.h"
-
-#include <inttypes.h>
-#include <unistd.h>
-#include <stdlib.h>
 #include <stdio.h>
-#include <err.h>
-#include <sys/ioctl.h>
-#include <sys/time.h>
-#include <sys/wait.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <stdint.h>
+#include <assert.h>
 #include <string.h>
-#ifdef HAVE_TERMIOS_H
-#include <termios.h>
-#endif
-#include "intel_io.h"
-#include "instdone.h"
-#include "intel_reg.h"
-#include "intel_chipset.h"
-#include "drmtest.h"
-
-#define  FORCEWAKE	    0xA18C
-#define  FORCEWAKE_ACK	    0x130090
-
-#define SAMPLES_PER_SEC             10000
-#define SAMPLES_TO_PERCENT_RATIO    (SAMPLES_PER_SEC / 100)
-
-#define MAX_NUM_TOP_BITS            100
-
-#define HAS_STATS_REGS(devid)		IS_965(devid)
-
-struct top_bit {
-	struct instdone_bit *bit;
-	int count;
-} top_bits[MAX_NUM_TOP_BITS];
-struct top_bit *top_bits_sorted[MAX_NUM_TOP_BITS];
-
-static uint32_t instdone, instdone1;
-
-static const char *bars[] = {
-	" ",
-	"▏",
-	"▎",
-	"▍",
-	"▌",
-	"▋",
-	"▊",
-	"▉",
-	"█"
-};
+#include <ctype.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <sys/ioctl.h>
+#include <errno.h>
+#include <math.h>
+#include <locale.h>
+
+#include "igt_perf.h"
 
-enum stats_counts {
-	IA_VERTICES,
-	IA_PRIMITIVES,
-	VS_INVOCATION,
-	GS_INVOCATION,
-	GS_PRIMITIVES,
-	CL_INVOCATION,
-	CL_PRIMITIVES,
-	PS_INVOCATION,
-	PS_DEPTH,
-	STATS_COUNT
+struct pmu_pair {
+	uint64_t cur;
+	uint64_t prev;
 };
 
-const uint32_t stats_regs[STATS_COUNT] = {
-	IA_VERTICES_COUNT_QW,
-	IA_PRIMITIVES_COUNT_QW,
-	VS_INVOCATION_COUNT_QW,
-	GS_INVOCATION_COUNT_QW,
-	GS_PRIMITIVES_COUNT_QW,
-	CL_INVOCATION_COUNT_QW,
-	CL_PRIMITIVES_COUNT_QW,
-	PS_INVOCATION_COUNT_QW,
-	PS_DEPTH_COUNT_QW,
+struct pmu_counter {
+	bool present;
+	uint64_t config;
+	unsigned int idx;
+	struct pmu_pair val;
 };
 
-const char *stats_reg_names[STATS_COUNT] = {
-	"vert fetch",
-	"prim fetch",
-	"VS invocations",
-	"GS invocations",
-	"GS prims",
-	"CL invocations",
-	"CL prims",
-	"PS invocations",
-	"PS depth pass",
+struct engine {
+	const char *name;
+	const char *display_name;
+
+	unsigned int class;
+	unsigned int instance;
+
+	unsigned int num_counters;
+
+	struct pmu_counter busy;
+	struct pmu_counter wait;
+	struct pmu_counter sema;
 };
 
-uint64_t stats[STATS_COUNT];
-uint64_t last_stats[STATS_COUNT];
+struct engines {
+	unsigned int num_engines;
+	unsigned int num_counters;
+	DIR *root;
+	int fd;
+	struct pmu_pair ts;
+
+	int rapl_fd;
+	double rapl_scale;
+	const char *rapl_unit;
+
+	int imc_fd;
+	double imc_reads_scale;
+	const char *imc_reads_unit;
+	double imc_writes_scale;
+	const char *imc_writes_unit;
+
+	struct pmu_counter freq_req;
+	struct pmu_counter freq_act;
+	struct pmu_counter irq;
+	struct pmu_counter rc6;
+	struct pmu_counter rapl;
+	struct pmu_counter imc_reads;
+	struct pmu_counter imc_writes;
+
+	struct engine engine;
+};
 
-static unsigned long
-gettime(void)
+static uint64_t
+get_pmu_config(int dirfd, const char *name, const char *counter)
 {
-    struct timeval t;
-    gettimeofday(&t, NULL);
-    return (t.tv_usec + (t.tv_sec * 1000000));
-}
+	char buf[128], *p;
+	int fd, ret;
 
-static int
-top_bits_sort(const void *a, const void *b)
-{
-	struct top_bit * const *bit_a = a;
-	struct top_bit * const *bit_b = b;
-	int a_count = (*bit_a)->count;
-	int b_count = (*bit_b)->count;
+	ret = snprintf(buf, sizeof(buf), "%s-%s", name, counter);
+	if (ret < 0 || ret == sizeof(buf))
+		return -1;
 
-	if (a_count < b_count)
-		return 1;
-	else if (a_count == b_count)
-		return 0;
-	else
+	fd = openat(dirfd, buf, O_RDONLY);
+	if (fd < 0)
 		return -1;
-}
 
-static void
-update_idle_bit(struct top_bit *top_bit)
-{
-	uint32_t reg_val;
+	ret = read(fd, buf, sizeof(buf));
+	close(fd);
+	if (ret <= 0)
+		return -1;
 
-	if (top_bit->bit->reg == INSTDONE_1)
-		reg_val = instdone1;
-	else
-		reg_val = instdone;
+	p = index(buf, '0');
+	if (!p)
+		return -1;
 
-	if ((reg_val & top_bit->bit->bit) == 0)
-		top_bit->count++;
+	return strtoul(p, NULL, 0);
 }
 
-static void
-print_clock(const char *name, int clock) {
-	if (clock == -1)
-		printf("%s clock: unknown", name);
+#define engine_ptr(engines, n) (&engines->engine + (n))
+
+static const char *class_display_name(unsigned int class)
+{
+	switch (class) {
+	case I915_ENGINE_CLASS_RENDER:
+		return "Render/3D";
+	case I915_ENGINE_CLASS_COPY:
+		return "Blitter";
+	case I915_ENGINE_CLASS_VIDEO:
+		return "Video";
+	case I915_ENGINE_CLASS_VIDEO_ENHANCE:
+		return "VideoEnhance";
+	default:
+		return "[unknown]";
+	}
+}
+
+static int engine_cmp(const void *__a, const void *__b)
+{
+	const struct engine *a = (struct engine *)__a;
+	const struct engine *b = (struct engine *)__b;
+
+	if (a->class != b->class)
+		return a->class - b->class;
 	else
-		printf("%s clock: %d Mhz", name, clock);
+		return a->instance - b->instance;
 }
 
-static int
-print_clock_info(struct pci_device *pci_dev)
+static struct engines *discover_engines(void)
 {
-	uint32_t devid = pci_dev->device_id;
-	uint16_t gcfgc;
+	const char *sysfs_root = "/sys/devices/i915/events";
+	struct engines *engines;
+	struct dirent *dent;
+	int ret = 0;
+	DIR *d;
 
-	if (IS_GM45(devid)) {
-		int core_clock = -1;
+	engines = malloc(sizeof(struct engines));
+	if (!engines)
+		return NULL;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+	memset(engines, 0, sizeof(*engines));
 
-		switch (gcfgc & 0xf) {
-		case 8:
-			core_clock = 266;
-			break;
-		case 9:
-			core_clock = 320;
-			break;
-		case 11:
-			core_clock = 400;
-			break;
-		case 13:
-			core_clock = 533;
-			break;
-		}
-		print_clock("core", core_clock);
-	} else if (IS_965(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, sampler_clock = -1;
+	engines->num_engines = 0;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+	d = opendir(sysfs_root);
+	if (!d)
+		return NULL;
 
-		switch (gcfgc & 0xf) {
-		case 2:
-			render_clock = 250; sampler_clock = 267;
-			break;
-		case 3:
-			render_clock = 320; sampler_clock = 333;
-			break;
-		case 4:
-			render_clock = 400; sampler_clock = 444;
-			break;
-		case 5:
-			render_clock = 500; sampler_clock = 533;
+	while ((dent = readdir(d)) != NULL) {
+		const char *endswith = "-busy";
+		const unsigned int endlen = strlen(endswith);
+		struct engine *engine =
+				engine_ptr(engines, engines->num_engines);
+		char buf[256];
+
+		if (dent->d_type != DT_REG)
+			continue;
+
+		if (strlen(dent->d_name) >= sizeof(buf)) {
+			ret = ENAMETOOLONG;
 			break;
 		}
 
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("sampler", sampler_clock);
-	} else if (IS_945(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, display_clock = -1;
+		strcpy(buf, dent->d_name);
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+		/* xxxN-busy */
+		if (strlen(buf) < (endlen + 4))
+			continue;
+		if (strcmp(&buf[strlen(buf) - endlen], endswith))
+			continue;
 
-		switch (gcfgc & 0x7) {
-		case 0:
-			render_clock = 166;
-			break;
-		case 1:
-			render_clock = 200;
-			break;
-		case 3:
-			render_clock = 250;
-			break;
-		case 5:
-			render_clock = 400;
+		memset(engine, 0, sizeof(*engine));
+
+		buf[strlen(buf) - endlen] = 0;
+		engine->name = strdup(buf);
+		if (!engine->name) {
+			ret = errno;
 			break;
 		}
 
-		switch (gcfgc & 0x70) {
-		case 0:
-			display_clock = 200;
-			break;
-		case 4:
-			display_clock = 320;
+		engine->busy.config = get_pmu_config(dirfd(d), engine->name,
+						     "busy");
+		if (engine->busy.config == -1) {
+			ret = ENOENT;
 			break;
 		}
-		if (gcfgc & (1 << 7))
-		    display_clock = 133;
 
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("display", display_clock);
-	} else if (IS_915(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, display_clock = -1;
+		engine->class = (engine->busy.config &
+				 (__I915_PMU_OTHER(0) - 1)) >>
+				I915_PMU_CLASS_SHIFT;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+		engine->instance = (engine->busy.config >>
+				    I915_PMU_SAMPLE_BITS) &
+				    ((1 << I915_PMU_SAMPLE_INSTANCE_BITS) - 1);
 
-		switch (gcfgc & 0x7) {
-		case 0:
-			render_clock = 160;
-			break;
-		case 1:
-			render_clock = 190;
-			break;
-		case 4:
-			render_clock = 333;
+		ret = snprintf(buf, sizeof(buf), "%s/%u",
+			       class_display_name(engine->class),
+			       engine->instance);
+		if (ret < 0 || ret == sizeof(buf)) {
+			ret = ENOBUFS;
 			break;
 		}
-		if (gcfgc & (1 << 13))
-		    render_clock = 133;
+		ret = 0;
 
-		switch (gcfgc & 0x70) {
-		case 0:
-			display_clock = 190;
+		engine->display_name = strdup(buf);
+		if (!engine->display_name) {
+			ret = errno;
 			break;
-		case 4:
-			display_clock = 333;
+		}
+
+		engines->num_engines++;
+		engines = realloc(engines, sizeof(struct engines) +
+				  engines->num_engines * sizeof(struct engine));
+		if (!engines) {
+			ret = errno;
 			break;
 		}
-		if (gcfgc & (1 << 7))
-		    display_clock = 133;
+	}
+
+	if (ret) {
+		free(engines);
+		errno = ret;
 
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("display", display_clock);
+		return NULL;
 	}
 
+	qsort(engine_ptr(engines, 0), engines->num_engines,
+	      sizeof(struct engine), engine_cmp);
+
+	engines->root = d;
 
-	printf("\n");
-	return -1;
+	return engines;
 }
 
-#define STATS_LEN (20)
-#define PERCENTAGE_BAR_END	(79 - STATS_LEN)
+static int
+filename_to_buf(const char *filename, char *buf, unsigned int bufsize)
+{
+	int fd, err;
+	ssize_t ret;
 
-static void
-print_percentage_bar(float percent, int cur_line_len)
+	fd = open(filename, O_RDONLY);
+	if (fd < 0)
+		return -1;
+
+	ret = read(fd, buf, bufsize - 1);
+	err = errno;
+	close(fd);
+	if (ret < 1) {
+		errno = ret < 0 ? err : ENOMSG;
+
+		return -1;
+	}
+
+	if (ret > 1 && buf[ret - 1] == '\n')
+		buf[ret - 1] = '\0';
+	else
+		buf[ret] = '\0';
+
+	return 0;
+}
+
+static uint64_t filename_to_u64(const char *filename, int base)
 {
-	int bar_avail_len = (PERCENTAGE_BAR_END - cur_line_len - 1) * 8;
-	int bar_len = bar_avail_len * (percent + .5) / 100.0;
-	int i;
+	char buf[64], *b;
 
-	for (i = bar_len; i >= 8; i -= 8) {
-		printf("%s", bars[8]);
-		cur_line_len++;
+	if (filename_to_buf(filename, buf, sizeof(buf)))
+		return 0;
+
+	/*
+	 * Handle both single integer and key=value formats by skipping
+	 * leading non-digits.
+	 */
+	b = buf;
+	while (*b && !isdigit(*b))
+		b++;
+
+	return strtoull(b, NULL, base);
+}
+
+static double filename_to_double(const char *filename)
+{
+	char *oldlocale;
+	char buf[80];
+	double v;
+
+	if (filename_to_buf(filename, buf, sizeof(buf)))
+		return 0;
+
+	oldlocale = setlocale(LC_ALL, "C");
+	v = strtod(buf, NULL);
+	setlocale(LC_ALL, oldlocale);
+
+	return v;
+}
+
+#define RAPL_ROOT "/sys/devices/power/"
+#define RAPL_EVENT "/sys/devices/power/events/"
+
+static uint64_t rapl_type_id(void)
+{
+	return filename_to_u64(RAPL_ROOT "type", 10);
+}
+
+static uint64_t rapl_gpu_power(void)
+{
+	return filename_to_u64(RAPL_EVENT "energy-gpu", 0);
+}
+
+static double rapl_gpu_power_scale(void)
+{
+	return filename_to_double(RAPL_EVENT "energy-gpu.scale");
+}
+
+static const char *rapl_gpu_power_unit(void)
+{
+	char buf[32];
+
+	if (filename_to_buf(RAPL_EVENT "energy-gpu.unit",
+			    buf, sizeof(buf)) == 0)
+		if (!strcmp(buf, "Joules"))
+			return strdup("Watts");
+		else
+			return strdup(buf);
+	else
+		return NULL;
+}
+
+#define IMC_ROOT "/sys/devices/uncore_imc/"
+#define IMC_EVENT "/sys/devices/uncore_imc/events/"
+
+static uint64_t imc_type_id(void)
+{
+	return filename_to_u64(IMC_ROOT "type", 10);
+}
+
+static uint64_t imc_data_reads(void)
+{
+	return filename_to_u64(IMC_EVENT "data_reads", 0);
+}
+
+static double imc_data_reads_scale(void)
+{
+	return filename_to_double(IMC_EVENT "data_reads.scale");
+}
+
+static const char *imc_data_reads_unit(void)
+{
+	char buf[32];
+
+	if (filename_to_buf(IMC_EVENT "data_reads.unit", buf, sizeof(buf)) == 0)
+		return strdup(buf);
+	else
+		return NULL;
+}
+
+static uint64_t imc_data_writes(void)
+{
+	return filename_to_u64(IMC_EVENT "data_writes", 0);
+}
+
+static double imc_data_writes_scale(void)
+{
+	return filename_to_double(IMC_EVENT "data_writes.scale");
+}
+
+static const char *imc_data_writes_unit(void)
+{
+	char buf[32];
+
+	if (filename_to_buf(IMC_EVENT "data_writes.unit",
+			    buf, sizeof(buf)) == 0)
+		return strdup(buf);
+	else
+		return NULL;
+}
+
+#define _open_pmu(cnt, pmu, fd) \
+({ \
+	int fd__; \
+\
+	fd__ = perf_i915_open_group((pmu)->config, (fd)); \
+	if (fd__ >= 0) { \
+		if ((fd) == -1) \
+			(fd) = fd__; \
+		(pmu)->present = true; \
+		(pmu)->idx = (cnt)++; \
+	} \
+\
+	fd__; \
+})
+
+#define _open_imc(cnt, pmu, fd) \
+({ \
+	int fd__; \
+\
+	fd__ = igt_perf_open_group(imc_type_id(), (pmu)->config, (fd)); \
+	if (fd__ >= 0) { \
+		if ((fd) == -1) \
+			(fd) = fd__; \
+		(pmu)->present = true; \
+		(pmu)->idx = (cnt)++; \
+	} \
+\
+	fd__; \
+})
+
+static int pmu_init(struct engines *engines)
+{
+	unsigned int i;
+	int fd;
+
+	engines->fd = -1;
+	engines->num_counters = 0;
+
+	engines->irq.config = I915_PMU_INTERRUPTS;
+	fd = _open_pmu(engines->num_counters, &engines->irq, engines->fd);
+	if (fd < 0)
+		return -1;
+
+	engines->freq_req.config = I915_PMU_REQUESTED_FREQUENCY;
+	_open_pmu(engines->num_counters, &engines->freq_req, engines->fd);
+
+	engines->freq_act.config = I915_PMU_ACTUAL_FREQUENCY;
+	_open_pmu(engines->num_counters, &engines->freq_act, engines->fd);
+
+	engines->rc6.config = I915_PMU_RC6_RESIDENCY;
+	_open_pmu(engines->num_counters, &engines->rc6, engines->fd);
+
+	for (i = 0; i < engines->num_engines; i++) {
+		struct engine *engine = engine_ptr(engines, i);
+		struct {
+			struct pmu_counter *pmu;
+			const char *counter;
+		} *cnt, counters[] = {
+			{ .pmu = &engine->busy, .counter = "busy" },
+			{ .pmu = &engine->wait, .counter = "wait" },
+			{ .pmu = &engine->sema, .counter = "sema" },
+			{ .pmu = NULL, .counter = NULL },
+		};
+
+		for (cnt = counters; cnt->pmu; cnt++) {
+			if (!cnt->pmu->config)
+				cnt->pmu->config =
+					get_pmu_config(dirfd(engines->root),
+						       engine->name,
+						       cnt->counter);
+			fd = _open_pmu(engines->num_counters, cnt->pmu,
+				       engines->fd);
+			if (fd >= 0)
+				engine->num_counters++;
+		}
 	}
-	if (i) {
-		printf("%s", bars[i]);
-		cur_line_len++;
+
+	engines->rapl_fd = -1;
+	if (rapl_type_id()) {
+		engines->rapl_scale = rapl_gpu_power_scale();
+		engines->rapl_unit = rapl_gpu_power_unit();
+		if (!engines->rapl_unit)
+			return -1;
+
+		engines->rapl.config = rapl_gpu_power();
+		if (!engines->rapl.config)
+			return -1;
+
+		engines->rapl_fd = igt_perf_open(rapl_type_id(),
+						 engines->rapl.config);
+		if (engines->rapl_fd < 0)
+			return -1;
+
+		engines->rapl.present = true;
 	}
 
-	/* NB: We can't use a field width with utf8 so we manually
-	* guarantee a field with of 45 chars for any bar. */
-	printf("%*s", PERCENTAGE_BAR_END - cur_line_len, "");
-}
+	engines->imc_fd = -1;
+	if (imc_type_id()) {
+		unsigned int num = 0;
 
-struct ring {
-	const char *name;
-	uint32_t mmio;
-	int head, tail, size;
-	uint64_t full;
-	int idle;
-};
+		engines->imc_reads_scale = imc_data_reads_scale();
+		engines->imc_writes_scale = imc_data_writes_scale();
+
+		engines->imc_reads_unit = imc_data_reads_unit();
+		if (!engines->imc_reads_unit)
+			return -1;
+
+		engines->imc_writes_unit = imc_data_writes_unit();
+		if (!engines->imc_writes_unit)
+			return -1;
+
+		engines->imc_reads.config = imc_data_reads();
+		if (!engines->imc_reads.config)
+			return -1;
+
+		engines->imc_writes.config = imc_data_writes();
+		if (!engines->imc_writes.config)
+			return -1;
+
+		fd = _open_imc(num, &engines->imc_reads, engines->imc_fd);
+		if (fd < 0)
+			return -1;
+		fd = _open_imc(num, &engines->imc_writes, engines->imc_fd);
+		if (fd < 0)
+			return -1;
+
+		engines->imc_reads.present = true;
+		engines->imc_writes.present = true;
+	}
+
+	return 0;
+}
 
-static uint32_t ring_read(struct ring *ring, uint32_t reg)
+static uint64_t pmu_read_multi(int fd, unsigned int num, uint64_t *val)
 {
-	return INREG(ring->mmio + reg);
+	uint64_t buf[2 + num];
+	unsigned int i;
+	ssize_t len;
+
+	memset(buf, 0, sizeof(buf));
+
+	len = read(fd, buf, sizeof(buf));
+	assert(len == sizeof(buf));
+
+	for (i = 0; i < num; i++)
+		val[i] = buf[2 + i];
+
+	return buf[1];
 }
 
-static void ring_init(struct ring *ring)
+static double __pmu_calc(struct pmu_pair *p, double d, double t, double s)
 {
-	ring->size = (((ring_read(ring, RING_LEN) & RING_NR_PAGES) >> 12) + 1) * 4096;
+	double v;
+
+	v = p->cur - p->prev;
+	v /= d;
+	v /= t;
+	v *= s;
+
+	if (s == 100.0 && v > 100.0)
+		v = 100.0;
+
+	return v;
 }
 
-static void ring_reset(struct ring *ring)
+static void fill_str(char *buf, unsigned int bufsz, char c, unsigned int num)
 {
-	ring->idle = ring->full = 0;
+	unsigned int i;
+
+	for (i = 0; i < num && i < (bufsz - 1); i++)
+		*buf++ = c;
+
+	*buf = 0;
 }
 
-static void ring_sample(struct ring *ring)
+static void pmu_calc(struct pmu_counter *cnt,
+		     char *buf, unsigned int bufsz,
+		     unsigned int width, unsigned width_dec,
+		     double d, double t, double s)
 {
-	int full;
+	double val;
+	int len;
+
+	assert(bufsz >= (width + width_dec + 1));
+
+	if (!cnt->present) {
+		fill_str(buf, bufsz, '-', width + width_dec);
+		return;
+	}
 
-	if (!ring->size)
+	val = __pmu_calc(&cnt->val, d, t, s);
+
+	len = snprintf(buf, bufsz, "%*.*f", width + width_dec, width_dec, val);
+	if (len < 0 || len == bufsz) {
+		fill_str(buf, bufsz, 'X', width + width_dec);
 		return;
+	}
+}
+
+static uint64_t __pmu_read_single(int fd, uint64_t *ts)
+{
+	uint64_t data[2] = { };
+	ssize_t len;
 
-	ring->head = ring_read(ring, RING_HEAD) & HEAD_ADDR;
-	ring->tail = ring_read(ring, RING_TAIL) & TAIL_ADDR;
+	len = read(fd, data, sizeof(data));
+	assert(len == sizeof(data));
 
-	if (ring->tail == ring->head)
-		ring->idle++;
+	if (ts)
+		*ts = data[1];
 
-	full = ring->tail - ring->head;
-	if (full < 0)
-		full += ring->size;
-	ring->full += full;
+	return data[0];
 }
 
-static void ring_print_header(FILE *out, struct ring *ring)
+static uint64_t pmu_read_single(int fd)
 {
-    fprintf(out, "%.6s%%\tops\t",
-            ring->name
-          );
+	return __pmu_read_single(fd, NULL);
 }
 
-static void ring_print(struct ring *ring, unsigned long samples_per_sec)
+static void __update_sample(struct pmu_counter *counter, uint64_t val)
 {
-	int percent_busy, len;
+	counter->val.prev = counter->val.cur;
+	counter->val.cur = val;
+}
 
-	if (!ring->size)
-		return;
+static void update_sample(struct pmu_counter *counter, uint64_t *val)
+{
+	if (counter->present)
+		__update_sample(counter, val[counter->idx]);
+}
+
+static void pmu_sample(struct engines *engines)
+{
+	const int num_val = engines->num_counters;
+	uint64_t val[2 + num_val];
+	unsigned int i;
+
+	engines->ts.prev = engines->ts.cur;
+
+	if (engines->rapl_fd >= 0)
+		__update_sample(&engines->rapl,
+				pmu_read_single(engines->rapl_fd));
+
+	if (engines->imc_fd >= 0) {
+		pmu_read_multi(engines->imc_fd, 2, val);
+		update_sample(&engines->imc_reads, val);
+		update_sample(&engines->imc_writes, val);
+	}
 
-	percent_busy = 100 - 100 * ring->idle / samples_per_sec;
+	engines->ts.cur = pmu_read_multi(engines->fd, num_val, val);
 
-	len = printf("%25s busy: %3d%%: ", ring->name, percent_busy);
-	print_percentage_bar (percent_busy, len);
-	printf("%24s space: %d/%d\n",
-		   ring->name,
-		   (int)(ring->full / samples_per_sec),
-		   ring->size);
+	update_sample(&engines->freq_req, val);
+	update_sample(&engines->freq_act, val);
+	update_sample(&engines->irq, val);
+	update_sample(&engines->rc6, val);
+
+	for (i = 0; i < engines->num_engines; i++) {
+		struct engine *engine = engine_ptr(engines, i);
+
+		update_sample(&engine->busy, val);
+		update_sample(&engine->sema, val);
+		update_sample(&engine->wait, val);
+	}
 }
 
-static void ring_log(struct ring *ring, unsigned long samples_per_sec,
-		FILE *output)
+static const char *bars[] = { " ", "▏", "▎", "▍", "▌", "▋", "▊", "▉", "█" };
+
+static void
+print_percentage_bar(double percent, int max_len)
 {
-	if (ring->size)
-		fprintf(output, "%3d\t%d\t",
-			(int)(100 - 100 * ring->idle / samples_per_sec),
-			(int)(ring->full / samples_per_sec));
-	else
-		fprintf(output, "-1\t-1\t");
+	int bar_len = percent * (8 * (max_len - 2)) / 100.0;
+	int i;
+
+	putchar('|');
+
+	for (i = bar_len; i >= 8; i -= 8)
+		printf("%s", bars[8]);
+	if (i)
+		printf("%s", bars[i]);
+
+	for (i = 0; i < (max_len - 2 - (bar_len + 7) / 8); i++)
+		putchar(' ');
+
+	putchar('|');
 }
 
+#define DEFAULT_PERIOD_MS (1000)
+
 static void
 usage(const char *appname)
 {
 	printf("intel_gpu_top - Display a top-like summary of Intel GPU usage\n"
-			"\n"
-			"usage: %s [parameters]\n"
-			"\n"
-			"The following parameters apply:\n"
-			"[-s <samples>]       samples per seconds (default %d)\n"
-			"[-e <command>]       command to profile\n"
-			"[-o <file>]          output statistics to file. If file is '-',"
-			"                     run in batch mode and output statistics to stdio only \n"
-			"[-h]                 show this help screen\n"
-			"\n",
-			appname,
-			SAMPLES_PER_SEC
-		  );
-	return;
+		"\n"
+		"Usage: %s [parameters]\n"
+		"\n"
+		"\tThe following parameters are optional:\n"
+		"\t[-s <samples>]       refresh period in ms (default %ums)\n"
+		"\t[-h]                 show this help text\n"
+		"\n",
+		appname, DEFAULT_PERIOD_MS);
 }
 
 int main(int argc, char **argv)
 {
-	uint32_t devid;
-	struct pci_device *pci_dev;
-	struct ring render_ring = {
-		.name = "render",
-		.mmio = 0x2030,
-	}, bsd_ring = {
-		.name = "bitstream",
-		.mmio = 0x4030,
-	}, bsd6_ring = {
-		.name = "bitstream",
-		.mmio = 0x12030,
-	}, blt_ring = {
-		.name = "blitter",
-		.mmio = 0x22030,
-	};
-	int i, ch;
-	int samples_per_sec = SAMPLES_PER_SEC;
-	FILE *output = NULL;
-	double elapsed_time=0;
-	int print_headers=1;
-	pid_t child_pid=-1;
-	int child_stat;
-	char *cmd=NULL;
-	int interactive=1;
-
-	/* Parse options? */
-	while ((ch = getopt(argc, argv, "s:o:e:h")) != -1) {
+	unsigned int period_us = DEFAULT_PERIOD_MS * 1000;
+	int con_w = -1, con_h = -1;
+	struct engines *engines;
+	unsigned int i;
+	int ret, ch;
+
+	/* Parse options */
+	while ((ch = getopt(argc, argv, "s:h")) != -1) {
 		switch (ch) {
-		case 'e': cmd = strdup(optarg);
-			break;
-		case 's': samples_per_sec = atoi(optarg);
-			if (samples_per_sec < 100) {
-				fprintf(stderr, "Error: samples per second must be >= 100\n");
-				exit(1);
-			}
-			break;
-		case 'o':
-			if (!strcmp(optarg, "-")) {
-				/* Running in non-interactive mode */
-				interactive = 0;
-				output = stdout;
-			}
-			else
-				output = fopen(optarg, "w");
-			if (!output)
-			{
-				perror("fopen");
-				exit(1);
-			}
+		case 's':
+			period_us = atoi(optarg) * 1000;
 			break;
 		case 'h':
 			usage(argv[0]);
 			exit(0);
-			break;
 		default:
-			fprintf(stderr, "Invalid flag %c!\n", (char)optopt);
+			fprintf(stderr, "Invalid option %c!\n", (char)optopt);
 			usage(argv[0]);
 			exit(1);
-			break;
 		}
 	}
 
-	pci_dev = intel_get_pci_device();
-	devid = pci_dev->device_id;
-	intel_mmio_use_pci_bar(pci_dev);
-	init_instdone_definitions(devid);
-
-	/* Do we have a command to run? */
-	if (cmd != NULL) {
-		if (output) {
-			fprintf(output, "# Profiling: %s\n", cmd);
-			fflush(output);
-		}
-		child_pid = fork();
-		if (child_pid < 0) {
-			perror("fork");
-			exit(1);
-		}
-		else if (child_pid == 0) {
-			int res;
-			res = system(cmd);
-			if (res < 0)
-				perror("running command");
-			if (output) {
-				fflush(output);
-				fprintf(output, "# %s exited with status %d\n", cmd, res);
-				fflush(output);
-			}
-			free(cmd);
-			exit(0);
-		} else {
-			free(cmd);
-		}
+	engines = discover_engines();
+	if (!engines) {
+		fprintf(stderr,
+			"Failed to detect engines! (%s)\n(Kernel 4.16 or newer is required for i915 PMU support.)\n",
+			strerror(errno));
+		return 1;
 	}
 
-	for (i = 0; i < num_instdone_bits; i++) {
-		top_bits[i].bit = &instdone_bits[i];
-		top_bits[i].count = 0;
-		top_bits_sorted[i] = &top_bits[i];
+	ret = pmu_init(engines);
+	if (ret) {
+		fprintf(stderr,
+			"Failed to initialize PMU! (%s)\n", strerror(errno));
+		return 1;
 	}
 
-	/* Grab access to the registers */
-	intel_register_access_init(pci_dev, 0, -1);
+	pmu_sample(engines);
 
-	ring_init(&render_ring);
-	if (IS_GEN4(devid) || IS_GEN5(devid))
-		ring_init(&bsd_ring);
-	if (IS_GEN6(devid) || IS_GEN7(devid)) {
-		ring_init(&bsd6_ring);
-		ring_init(&blt_ring);
-	}
+	for (;;) {
+		double t;
+#define BUFSZ 16
+		char freq[BUFSZ];
+		char fact[BUFSZ];
+		char irq[BUFSZ];
+		char rc6[BUFSZ];
+		char power[BUFSZ];
+		char reads[BUFSZ];
+		char writes[BUFSZ];
+		struct winsize ws;
+		int lines = 0;
 
-	/* Initialize GPU stats */
-	if (HAS_STATS_REGS(devid)) {
-		for (i = 0; i < STATS_COUNT; i++) {
-			uint32_t stats_high, stats_low, stats_high_2;
+		/* Update terminal size. */
+		if (ioctl(0, TIOCGWINSZ, &ws) != -1) {
+			con_w = ws.ws_col;
+			con_h = ws.ws_row;
+		}
 
-			do {
-				stats_high = INREG(stats_regs[i] + 4);
-				stats_low = INREG(stats_regs[i]);
-				stats_high_2 = INREG(stats_regs[i] + 4);
-			} while (stats_high != stats_high_2);
+		pmu_sample(engines);
+		t = (double)(engines->ts.cur - engines->ts.prev) / 1e9;
 
-			last_stats[i] = (uint64_t)stats_high << 32 |
-				stats_low;
-		}
-	}
+		printf("\033[H\033[J");
 
-	for (;;) {
-		int j;
-		unsigned long long t1, ti, tf, t2;
-		unsigned long long def_sleep = 1000000 / samples_per_sec;
-		unsigned long long last_samples_per_sec = samples_per_sec;
-		unsigned short int max_lines;
-		struct winsize ws;
-		char clear_screen[] = {0x1b, '[', 'H',
-				       0x1b, '[', 'J',
-				       0x0};
-		int percent;
-		int len;
-
-		t1 = gettime();
-
-		ring_reset(&render_ring);
-		ring_reset(&bsd_ring);
-		ring_reset(&bsd6_ring);
-		ring_reset(&blt_ring);
-
-		for (i = 0; i < samples_per_sec; i++) {
-			long long interval;
-			ti = gettime();
-			if (IS_965(devid)) {
-				instdone = INREG(INSTDONE_I965);
-				instdone1 = INREG(INSTDONE_1);
-			} else
-				instdone = INREG(INSTDONE);
-
-			for (j = 0; j < num_instdone_bits; j++)
-				update_idle_bit(&top_bits[j]);
-
-			ring_sample(&render_ring);
-			ring_sample(&bsd_ring);
-			ring_sample(&bsd6_ring);
-			ring_sample(&blt_ring);
-
-			tf = gettime();
-			if (tf - t1 >= 1000000) {
-				/* We are out of sync, bail out */
-				last_samples_per_sec = i+1;
-				break;
-			}
-			interval = def_sleep - (tf - ti);
-			if (interval > 0)
-				usleep(interval);
-		}
+		pmu_calc(&engines->freq_req, freq, BUFSZ, 4, 0, 1.0, t, 1);
+		pmu_calc(&engines->freq_act, fact, BUFSZ, 4, 0, 1.0, t, 1);
+		pmu_calc(&engines->irq, irq, BUFSZ, 8, 0, 1.0, t, 1);
+		pmu_calc(&engines->rc6, rc6, BUFSZ, 3, 0, 1e9, t, 100);
+		pmu_calc(&engines->rapl, power, BUFSZ, 4, 2, 1.0, t,
+			 engines->rapl_scale);
+		pmu_calc(&engines->imc_reads, reads, BUFSZ, 6, 0, 1.0, t,
+			 engines->imc_reads_scale);
+		pmu_calc(&engines->imc_writes, writes, BUFSZ, 6, 0, 1.0, t,
+			 engines->imc_writes_scale);
 
-		if (HAS_STATS_REGS(devid)) {
-			for (i = 0; i < STATS_COUNT; i++) {
-				uint32_t stats_high, stats_low, stats_high_2;
+		if (lines++ < con_h)
+			printf("intel-gpu-top - %s/%s MHz;  %s%% RC6; %s %s; %s irqs/s\n",
+			       fact, freq, rc6, power, engines->rapl_unit, irq);
 
-				do {
-					stats_high = INREG(stats_regs[i] + 4);
-					stats_low = INREG(stats_regs[i]);
-					stats_high_2 = INREG(stats_regs[i] + 4);
-				} while (stats_high != stats_high_2);
+		if (lines++ < con_h)
+			printf("\n");
 
-				stats[i] = (uint64_t)stats_high << 32 |
-					stats_low;
-			}
-		}
+		if (engines->imc_fd) {
+			if (lines++ < con_h)
+				printf("      IMC reads:   %s %s/s\n",
+				       reads, engines->imc_reads_unit);
+
+			if (lines++ < con_h)
+				printf("     IMC writes:   %s %s/s\n",
+				       writes, engines->imc_writes_unit);
 
-		qsort(top_bits_sorted, num_instdone_bits,
-		      sizeof(struct top_bit *), top_bits_sort);
-
-		/* Limit the number of lines printed to the terminal height so the
-		 * most important info (at the top) will stay on screen. */
-		max_lines = -1;
-		if (ioctl(0, TIOCGWINSZ, &ws) != -1)
-			max_lines = ws.ws_row - 6; /* exclude header lines */
-		if (max_lines >= num_instdone_bits)
-			max_lines = num_instdone_bits;
-
-		t2 = gettime();
-		elapsed_time += (t2 - t1) / 1000000.0;
-
-		if (interactive) {
-			printf("%s", clear_screen);
-			print_clock_info(pci_dev);
-
-			ring_print(&render_ring, last_samples_per_sec);
-			ring_print(&bsd_ring, last_samples_per_sec);
-			ring_print(&bsd6_ring, last_samples_per_sec);
-			ring_print(&blt_ring, last_samples_per_sec);
-
-			printf("\n%30s  %s\n", "task", "percent busy");
-			for (i = 0; i < max_lines; i++) {
-				if (top_bits_sorted[i]->count > 0) {
-					percent = (top_bits_sorted[i]->count * 100) /
-						last_samples_per_sec;
-					len = printf("%30s: %3d%%: ",
-							 top_bits_sorted[i]->bit->name,
-							 percent);
-					print_percentage_bar (percent, len);
-				} else {
-					printf("%*s", PERCENTAGE_BAR_END, "");
-				}
-
-				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-					printf("%13s: %llu (%lld/sec)",
-						   stats_reg_names[i],
-						   (long long)stats[i],
-						   (long long)(stats[i] - last_stats[i]));
-					last_stats[i] = stats[i];
-				} else {
-					if (!top_bits_sorted[i]->count)
-						break;
-				}
+			if (++lines < con_h)
 				printf("\n");
-			}
 		}
-		if (output) {
-			/* Print headers for columns at first run */
-			if (print_headers) {
-				fprintf(output, "# time\t");
-				ring_print_header(output, &render_ring);
-				ring_print_header(output, &bsd_ring);
-				ring_print_header(output, &bsd6_ring);
-				ring_print_header(output, &blt_ring);
-				for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
-					if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-						fprintf(output, "%.6s\t",
-							   stats_reg_names[i]
-							   );
-					}
-					if (!top_bits[i].count)
-						continue;
-				}
-				fprintf(output, "\n");
-				print_headers = 0;
-			}
 
-			/* Print statistics */
-			fprintf(output, "%.2f\t", elapsed_time);
-			ring_log(&render_ring, last_samples_per_sec, output);
-			ring_log(&bsd_ring, last_samples_per_sec, output);
-			ring_log(&bsd6_ring, last_samples_per_sec, output);
-			ring_log(&blt_ring, last_samples_per_sec, output);
-
-			for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
-				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-					fprintf(output, "%"PRIu64"\t",
-						   stats[i] - last_stats[i]);
-					last_stats[i] = stats[i];
-				}
-					if (!top_bits[i].count)
-						continue;
-			}
-			fprintf(output, "\n");
-			fflush(output);
-		}
+		for (i = 0; i < engines->num_engines; i++) {
+			struct engine *engine = engine_ptr(engines, i);
 
-		for (i = 0; i < num_instdone_bits; i++) {
-			top_bits_sorted[i]->count = 0;
+			if (engine->num_counters && lines < con_h) {
+				const char *a = "          ENGINE      BUSY ";
+				const char *b = " MI_SEMA MI_WAIT";
 
-			if (i < STATS_COUNT)
-				last_stats[i] = stats[i];
+				printf("\033[7m%s%*s%s\033[0m\n",
+				       a,
+				       (int)(con_w - 1 - strlen(a) - strlen(b)),
+				       " ", b);
+				lines++;
+				break;
+			}
 		}
 
-		/* Check if child has gone */
-		if (child_pid > 0) {
-			int res;
-			if ((res = waitpid(child_pid, &child_stat, WNOHANG)) == -1) {
-				perror("waitpid");
-				exit(1);
-			}
-			if (res == 0)
+		for (i = 0; i < engines->num_engines && lines < con_h; i++) {
+			struct engine *engine = engine_ptr(engines, i);
+			unsigned int max_w = con_w - 1;
+			unsigned int len;
+			char sema[BUFSZ];
+			char wait[BUFSZ];
+			char busy[BUFSZ];
+			char buf[128];
+			double val;
+
+			if (!engine->num_counters)
 				continue;
-			if (WIFEXITED(child_stat))
-				break;
+
+			pmu_calc(&engine->sema, sema, BUFSZ, 3, 0, 1e9, t, 100);
+			pmu_calc(&engine->wait, wait, BUFSZ, 3, 0, 1e9, t, 100);
+			len = snprintf(buf, sizeof(buf), "    %s%%    %s%%",
+				       sema, wait);
+
+			pmu_calc(&engine->busy, busy, BUFSZ, 6, 2, 1e9, t,
+				 100);
+			len += printf("%16s %s%% ", engine->display_name, busy);
+
+			val = __pmu_calc(&engine->busy.val, 1e9, t, 100);
+			print_percentage_bar(val, max_w - len);
+
+			printf("%s\n", buf);
+
+			lines++;
 		}
-	}
 
-	fclose(output);
+		if (lines++ < con_h)
+			printf("\n");
+
+		usleep(period_us);
+	}
 
-	intel_register_access_fini();
 	return 0;
 }
diff --git a/tools/meson.build b/tools/meson.build
index bd2d313d5156..a918eeb0bef1 100644
--- a/tools/meson.build
+++ b/tools/meson.build
@@ -23,7 +23,6 @@ tools_progs = [
 	'intel_gpu_frequency',
 	'intel_firmware_decode',
 	'intel_gpu_time',
-	'intel_gpu_top',
 	'intel_gtt',
 	'intel_guc_logger',
 	'intel_infoframes',
@@ -117,6 +116,11 @@ shared_library('intel_aubdump', 'aubdump.c',
 	       name_prefix : '',
 	       install : true)
 
+executable('intel_gpu_top', 'intel_gpu_top.c',
+	   install : true,
+	   install_rpath : rpathdir,
+	   dependencies : tool_deps + [ lib_igt_perf ])
+
 conf_data = configuration_data()
 conf_data.set('prefix', prefix)
 conf_data.set('exec_prefix', '${prefix}')
-- 
2.14.1

_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [igt-dev] [PATCH i-g-t v2] intel-gpu-top: Rewrite the tool to be safe to use
  2018-04-03 17:18           ` Tvrtko Ursulin
@ 2018-04-04 12:15             ` Eero Tamminen
  -1 siblings, 0 replies; 57+ messages in thread
From: Eero Tamminen @ 2018-04-04 12:15 UTC (permalink / raw)
  To: Tvrtko Ursulin, Tvrtko Ursulin, igt-dev; +Cc: Intel-gfx

Hi,

I've now tested v5 with old Ubuntu kernel on KBL, and with latest
drm-tip kernel on SNB, HSW, BYT, BSW and BDW GT & GT3.


Generic test results
--------------------

* Tool works on all of them

* The new error messages and headings look good

* Idle IMC read amounts correspond to expected values on SNB & HSW.
   The much smaller values on BDW & SKL are due to FBC (how well
   it compresses, naturally depends on screen content).


BYT & BSW
---------

* IMC, power usage and actual(?) freq values are missing.

-> You can get actual freq by polling CAGF register, represented by:
	/sys/class/drm/card0/gt_act_freq_mhz

Normally i915 driver maps uncore power usage to GPU power usage,
but BYT is missing that (and ram power usage).  However, RAPL
does report package & core values...


Suggestions
-----------

Maybe on platforms where RAPL doesn't report "uncore" power usage,
you could just deduct RAPL reported "core" power consumption from
the "package" power consumption, and report that as "GPU" power
usage?  (Or do that in i915 directly)


You need also to either update the manual, or implement -o and -e
options for the new version of intel_gpu_top.  CSV output of all
the reported values would be nice.

You might mention in manual as an example how to calculate
idle screen update bandwidth, and that it's impacted by:
- PSR (panel self refresh, depends on display supporting it):
   /sys/kernel/debug/dri/0/i915_edp_psr_status
- FBC (frame buffer compression, enabled on newer GENs)
   /sys/kernel/debug/dri/0/i915_fbc_status
- end-to-end RBC (render buffer compression, requires modifiers
   support i.e. GEN9+ GPU and X & Mesa with DRI3 v1.2 [1] support)


	- Eero

[1] Requires building latest git versions of Mesa, libxcb, X server
and few other things, and adding this to X server config:
-------------------------------
Section "ServerFlags"
     Option "Debug" "dmabuf_capable"
EndSection
-------------------------------


On 03.04.2018 20:18, Tvrtko Ursulin wrote:
> On 03/04/2018 15:06, Eero Tamminen wrote:
>> On 03.04.2018 12:36, Tvrtko Ursulin wrote:
>>> On 29/03/2018 15:30, Eero Tamminen wrote:
[...]
>>>> Old tool showed also GPU system memory interface (GAM) busyness.
>>>> That was valuable info, and reasonably accurate for stable loads.
>>>>
>>>> Could this tool show also either that information (preferred), or
>>>> bandwidth utilized by GPU/CPU/display?
>>>>
>>>> (Latest kernels offer GPU memory bandwidth usage through perf
>>>> "uncore_imc" "data_reads" & "data_writes" counters.)
>>>
>>> Excellent suggestion and I've added IMC data_reads and data_writes to 
>>> the tool.
>>
>> Thanks, it looks fine too.  I'm just wondering about the numbers
>> it's reporting on SKL GT2...
>>
>> AFAIK IMC counters are for uncore, so I thought that they should
>> correspond to GTI (memory interface to outside of GPU) read and
>> write HW counter values.  While it seemed in some cases quite close,
>> in some cases it showed a lot smaller (2/3) value than expected.
>>
>> I can understand why reads are sometimes larger, because I think
>> uncore will include also display engine display content reads.
>>
>> However, I don't see how uncore writes could be considerably smaller
>> than the GTI interface write amount.
>>
>> (GTI interface reports the expected value which corresponds directly
>> to what my test application is doing (64x blended FullHD layer writes).)
>>
>> Idle machine read amounts are also much smaller (60-65MB/s) than what
>> I think display update read should be (1920*1080*4*60Hz = 475MiB/s).
>>
>> Any ideas for these two discrepancies?
> 
> I'm afraid I am not familiar with the uncore IMC, but we could always 
> approach its authors?
> 
>>>> Is "wait" value supposed to be IO-wait for given engine interface?
>>>>
>>>> I never saw that change from 0%, although IO-wait in top jumped
>>>> from 0 to 20-30% with my test GPU load.
>>>
>>> No, that is time spent in MI_WAIT_FOR_EVENT.
>>
>> Could you add that info to the UI?
>>
>> E.g. just have "MI" on top of the "wait" column.
> 
> Like a full header strip? Yeah makes sense, I'll add it.
> 
>>  > I think not very used in current codebase.
>>
>> What are you using to validate that it reports the correct value?
> 
> That would be igt/tests/perf_pmu/event-wait-rcs0.
> 
>>
>>>> HW specific test results
>>>> ------------------------
>>>>
>>>> BYT:
>>>> * Reports "Failed to initialize PMU!" although old intel_gpu_top
>>>>    works fine.
>>>>
>>>> HSW GT2,  BDW GT3, SKL GT2 and KBL GT3e seems to work fine except
>>>> for the "wait" value.
>>>>
>>>> I never saw blitter engine to do anything, but that's because
>>>> modesetting uses just 3D pipeline, and because I couldn't get
>>>> Intel DDX to work with rest of latest git version of X / 3D stack.
>>>
>>> Thank you for testing this so thoroughly - this was really invaluable 
>>> since I don't have access to such a number of platforms. I've tried to 
>>> fix all this in the latest version.
>>
>> Machines are currently running tests, I'll check these tomorrow.
> 
> Thanks!
> 
>>
>>>> Kernel version support
>>>> ----------------------
>>>>
>>>> My HW specific testing above was with drm-tip kernel, but I did one 
>>>> test
>>>> also with Ubuntu 16.04 v4.4 kernel (which includes v4.6 or v4.8 i915 
>>>> backport) on KBL.  For that, the tool reported:
>>>> "Failed to detect engines!"
>>>>
>>>> Although the previous intel_gpu_top works fine with that kernel 
>>>> version.
>>>>
>>>> Same happens also with Ubuntu 17.04 v4.13 kernel.
>>>>
>>>>
>>>> -> If new version needs a certain kernel version, it should tell
>>>>     which version is required.
>>>
>>> Yep, at least 4.16 is needed so I have added this info to the error 
>>> message.
>>
>> IMHO the message is a bit ambivalent:
>>      Failed to detect engines! Kernel 4.16 or newer?
>>
>> I would suggest checking whether kernel is new enough, and if not:
>>      Kernel X.YY detected, 4.16 or newer required.
> 
> Maybe yeah. I was planning to improve error messages altogether but 
> forgot. Will see what improvements make sense.
> 
> Regards,
> 
> Tvrtko

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Intel-gfx] [igt-dev] [PATCH i-g-t v2] intel-gpu-top: Rewrite the tool to be safe to use
@ 2018-04-04 12:15             ` Eero Tamminen
  0 siblings, 0 replies; 57+ messages in thread
From: Eero Tamminen @ 2018-04-04 12:15 UTC (permalink / raw)
  To: Tvrtko Ursulin, Tvrtko Ursulin, igt-dev; +Cc: Intel-gfx

Hi,

I've now tested v5 with old Ubuntu kernel on KBL, and with latest
drm-tip kernel on SNB, HSW, BYT, BSW and BDW GT & GT3.


Generic test results
--------------------

* Tool works on all of them

* The new error messages and headings look good

* Idle IMC read amounts correspond to expected values on SNB & HSW.
   The much smaller values on BDW & SKL are due to FBC (how well
   it compresses, naturally depends on screen content).


BYT & BSW
---------

* IMC, power usage and actual(?) freq values are missing.

-> You can get actual freq by polling CAGF register, represented by:
	/sys/class/drm/card0/gt_act_freq_mhz

Normally i915 driver maps uncore power usage to GPU power usage,
but BYT is missing that (and ram power usage).  However, RAPL
does report package & core values...


Suggestions
-----------

Maybe on platforms where RAPL doesn't report "uncore" power usage,
you could just deduct RAPL reported "core" power consumption from
the "package" power consumption, and report that as "GPU" power
usage?  (Or do that in i915 directly)


You need also to either update the manual, or implement -o and -e
options for the new version of intel_gpu_top.  CSV output of all
the reported values would be nice.

You might mention in manual as an example how to calculate
idle screen update bandwidth, and that it's impacted by:
- PSR (panel self refresh, depends on display supporting it):
   /sys/kernel/debug/dri/0/i915_edp_psr_status
- FBC (frame buffer compression, enabled on newer GENs)
   /sys/kernel/debug/dri/0/i915_fbc_status
- end-to-end RBC (render buffer compression, requires modifiers
   support i.e. GEN9+ GPU and X & Mesa with DRI3 v1.2 [1] support)


	- Eero

[1] Requires building latest git versions of Mesa, libxcb, X server
and few other things, and adding this to X server config:
-------------------------------
Section "ServerFlags"
     Option "Debug" "dmabuf_capable"
EndSection
-------------------------------


On 03.04.2018 20:18, Tvrtko Ursulin wrote:
> On 03/04/2018 15:06, Eero Tamminen wrote:
>> On 03.04.2018 12:36, Tvrtko Ursulin wrote:
>>> On 29/03/2018 15:30, Eero Tamminen wrote:
[...]
>>>> Old tool showed also GPU system memory interface (GAM) busyness.
>>>> That was valuable info, and reasonably accurate for stable loads.
>>>>
>>>> Could this tool show also either that information (preferred), or
>>>> bandwidth utilized by GPU/CPU/display?
>>>>
>>>> (Latest kernels offer GPU memory bandwidth usage through perf
>>>> "uncore_imc" "data_reads" & "data_writes" counters.)
>>>
>>> Excellent suggestion and I've added IMC data_reads and data_writes to 
>>> the tool.
>>
>> Thanks, it looks fine too.  I'm just wondering about the numbers
>> it's reporting on SKL GT2...
>>
>> AFAIK IMC counters are for uncore, so I thought that they should
>> correspond to GTI (memory interface to outside of GPU) read and
>> write HW counter values.  While it seemed in some cases quite close,
>> in some cases it showed a lot smaller (2/3) value than expected.
>>
>> I can understand why reads are sometimes larger, because I think
>> uncore will include also display engine display content reads.
>>
>> However, I don't see how uncore writes could be considerably smaller
>> than the GTI interface write amount.
>>
>> (GTI interface reports the expected value which corresponds directly
>> to what my test application is doing (64x blended FullHD layer writes).)
>>
>> Idle machine read amounts are also much smaller (60-65MB/s) than what
>> I think display update read should be (1920*1080*4*60Hz = 475MiB/s).
>>
>> Any ideas for these two discrepancies?
> 
> I'm afraid I am not familiar with the uncore IMC, but we could always 
> approach its authors?
> 
>>>> Is "wait" value supposed to be IO-wait for given engine interface?
>>>>
>>>> I never saw that change from 0%, although IO-wait in top jumped
>>>> from 0 to 20-30% with my test GPU load.
>>>
>>> No, that is time spent in MI_WAIT_FOR_EVENT.
>>
>> Could you add that info to the UI?
>>
>> E.g. just have "MI" on top of the "wait" column.
> 
> Like a full header strip? Yeah makes sense, I'll add it.
> 
>>  > I think not very used in current codebase.
>>
>> What are you using to validate that it reports the correct value?
> 
> That would be igt/tests/perf_pmu/event-wait-rcs0.
> 
>>
>>>> HW specific test results
>>>> ------------------------
>>>>
>>>> BYT:
>>>> * Reports "Failed to initialize PMU!" although old intel_gpu_top
>>>>    works fine.
>>>>
>>>> HSW GT2,  BDW GT3, SKL GT2 and KBL GT3e seems to work fine except
>>>> for the "wait" value.
>>>>
>>>> I never saw blitter engine to do anything, but that's because
>>>> modesetting uses just 3D pipeline, and because I couldn't get
>>>> Intel DDX to work with rest of latest git version of X / 3D stack.
>>>
>>> Thank you for testing this so thoroughly - this was really invaluable 
>>> since I don't have access to such a number of platforms. I've tried to 
>>> fix all this in the latest version.
>>
>> Machines are currently running tests, I'll check these tomorrow.
> 
> Thanks!
> 
>>
>>>> Kernel version support
>>>> ----------------------
>>>>
>>>> My HW specific testing above was with drm-tip kernel, but I did one 
>>>> test
>>>> also with Ubuntu 16.04 v4.4 kernel (which includes v4.6 or v4.8 i915 
>>>> backport) on KBL.  For that, the tool reported:
>>>> "Failed to detect engines!"
>>>>
>>>> Although the previous intel_gpu_top works fine with that kernel 
>>>> version.
>>>>
>>>> Same happens also with Ubuntu 17.04 v4.13 kernel.
>>>>
>>>>
>>>> -> If new version needs a certain kernel version, it should tell
>>>>     which version is required.
>>>
>>> Yep, at least 4.16 is needed so I have added this info to the error 
>>> message.
>>
>> IMHO the message is a bit ambivalent:
>>      Failed to detect engines! Kernel 4.16 or newer?
>>
>> I would suggest checking whether kernel is new enough, and if not:
>>      Kernel X.YY detected, 4.16 or newer required.
> 
> Maybe yeah. I was planning to improve error messages altogether but 
> forgot. Will see what improvements make sense.
> 
> Regards,
> 
> Tvrtko

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [igt-dev] [PATCH i-g-t v2] intel-gpu-top: Rewrite the tool to be safe to use
  2018-04-04 12:15             ` [Intel-gfx] " Eero Tamminen
@ 2018-04-04 12:42               ` Tvrtko Ursulin
  -1 siblings, 0 replies; 57+ messages in thread
From: Tvrtko Ursulin @ 2018-04-04 12:42 UTC (permalink / raw)
  To: Eero Tamminen, Tvrtko Ursulin, igt-dev; +Cc: Intel-gfx


Hi,

On 04/04/2018 13:15, Eero Tamminen wrote:
> Hi,
> 
> I've now tested v5 with old Ubuntu kernel on KBL, and with latest
> drm-tip kernel on SNB, HSW, BYT, BSW and BDW GT & GT3.
> 
> 
> Generic test results
> --------------------
> 
> * Tool works on all of them
> 
> * The new error messages and headings look good
> 
> * Idle IMC read amounts correspond to expected values on SNB & HSW.
>    The much smaller values on BDW & SKL are due to FBC (how well
>    it compresses, naturally depends on screen content).

Hm OK, you managed to explain it. Because in the meantime I have 
observed one oddity with write bandwidth on my headless SkullCanyon NUC. 
It idles around 28MiB/s, while when I load it up with some command 
streamer activity it drops to ~11MiB/s. I don't know, but just feels 
suspect. (Read bandwidth goes from ~215MiB/s at idle to ~4.5GiB/s in my 
load case.)

> 
> 
> BYT & BSW
> ---------
> 
> * IMC, power usage and actual(?) freq values are missing.
> 
> -> You can get actual freq by polling CAGF register, represented by:
>      /sys/class/drm/card0/gt_act_freq_mhz

Yep, this is an i915 internal limitation: we cannot expose this for 
consumption via PMU.

> 
> Normally i915 driver maps uncore power usage to GPU power usage,
> but BYT is missing that (and ram power usage).  However, RAPL
> does report package & core values...
> 
> 
> Suggestions
> -----------
> 
> Maybe on platforms where RAPL doesn't report "uncore" power usage,
> you could just deduct RAPL reported "core" power consumption from
> the "package" power consumption, and report that as "GPU" power
> usage?  (Or do that in i915 directly)

What are you referring to as "uncore" in the context of RAPL?

Do I understand correctly that you suggest using "energy-pkg - 
energy-cores" when "energy-gpu" is not available? If the former two are 
both present on BYT and BSW, this sounds okay to me.

> You need also to either update the manual, or implement -o and -e

There is a manual, will do!

> options for the new version of intel_gpu_top.  CSV output of all
> the reported values would be nice.

I would prefer to drop both -o and -e, since this is achievable via perf 
stat. For instance:

perf stat -a -e power/energy-gpu/,i915/rcs0-busy/ -I 1000 -x, <command>

Gives CSV samples once per second.

On the other hand, one argument I can think of for actually implementing 
-o and -e is that we need to do some extra normalization on some i915 
counters, which the perf tool would not do.

I don't have a feeling if anyone is actually using these options. If 
unlikely, we should probably drop them regardless.

> You might mention in manual as an example how to calculate
> idle screen update bandwidth, and that it's impacted by:
> - PSR (panel self refresh, depends on display supporting it):
>    /sys/kernel/debug/dri/0/i915_edp_psr_status
> - FBC (frame buffer compression, enabled on newer GENs)
>    /sys/kernel/debug/dri/0/i915_fbc_status
> - end-to-end RBC (render buffer compression, requires modifiers
>    support i.e. GEN9+ GPU and X & Mesa with DRI3 v1.2 [1] support)

Sounds useful for users, but I am a bit wary of feature creep. In this 
specific example I'd want to push it for follow-up work.

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [Intel-gfx] [igt-dev] [PATCH i-g-t v2] intel-gpu-top: Rewrite the tool to be safe to use
@ 2018-04-04 12:42               ` Tvrtko Ursulin
  0 siblings, 0 replies; 57+ messages in thread
From: Tvrtko Ursulin @ 2018-04-04 12:42 UTC (permalink / raw)
  To: Eero Tamminen, Tvrtko Ursulin, igt-dev; +Cc: Intel-gfx


Hi,

On 04/04/2018 13:15, Eero Tamminen wrote:
> Hi,
> 
> I've now tested v5 with old Ubuntu kernel on KBL, and with latest
> drm-tip kernel on SNB, HSW, BYT, BSW and BDW GT & GT3.
> 
> 
> Generic test results
> --------------------
> 
> * Tool works on all of them
> 
> * The new error messages and headings look good
> 
> * Idle IMC read amounts correspond to expected values on SNB & HSW.
>    The much smaller values on BDW & SKL are due to FBC (how well
>    it compresses, naturally depends on screen content).

Hm OK, you managed to explain it. Because in the meantime I have 
observed one oddity with write bandwidth on my headless SkullCanyon NUC. 
It idles around 28MiB/s, while when I load it up with some command 
streamer activity it drops to ~11MiB/s. I don't know, but it just feels 
suspect. (Read bandwidth goes from ~215MiB/s at idle to ~4.5GiB/s in my 
load case.)

> 
> 
> BYT & BSW
> ---------
> 
> * IMC, power usage and actual(?) freq values are missing.
> 
> -> You can get actual freq by polling CAGF register, represented by:
>      /sys/class/drm/card0/gt_act_freq_mhz

Yep, this is the i915 internal limitation that we cannot expose this for 
consumption from PMU.

> 
> Normally i915 driver maps uncore power usage to GPU power usage,
> but BYT is missing that (and ram power usage).  However, RAPL
> does report package & core values...
> 
> 
> Suggestions
> -----------
> 
> Maybe on platforms where RAPL doesn't report "uncore" power usage,
> you could just deduct RAPL reported "core" power consumption from
> the "package" power consumption, and report that as "GPU" power
> usage?  (Or do that in i915 directly)

What are you referring to as "uncore" in the context of RAPL?

Do I understand correctly that you suggested using "energy-pkg - 
energy-cores" when "energy-gpu" is not available? If the former two are 
both there on BYT and BSW, this sounds okay to me.

> You need also to either update the manual, or implement -o and -e

There is a manual, will do!

> options for the new version of intel_gpu_top.  CSV output of all
> the reported values would be nice.

I would prefer to drop both -o and -e, since this is achievable via perf 
stat. For instance:

perf stat -a -e power/energy-gpu/,i915/rcs0-busy/ -I 1000 -x, <command>

Gives CSV samples once per second.

On the other hand, one argument I can think of for actually implementing 
-o and -e is that we need to do some extra normalization on some i915 
counters which the perf tool would not do.

I don't have a feeling if anyone is actually using these options. If 
unlikely, we should probably drop them regardless.

> You might mention in manual as an example how to calculate
> idle screen update bandwidth, and that it's impacted by:
> - PSR (panel self refresh, depends on display supporting it):
>    /sys/kernel/debug/dri/0/i915_edp_psr_status
> - FBC (frame buffer compression, enabled on newer GENs)
>    /sys/kernel/debug/dri/0/i915_fbc_status
> - end-to-end RBC (render buffer compression, requires modifiers
>    support i.e. GEN9+ GPU and X & Mesa with DRI3 v1.2 [1] support)

Sounds useful for users, but I am a bit wary of feature creep. In this 
specific example I'd want to push it for follow-up work.

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH i-g-t v5] intel-gpu-top: Rewrite the tool to be safe to use
  2018-04-04  9:48   ` [igt-dev] " Tvrtko Ursulin
  (?)
@ 2018-04-04 12:48   ` Rinat Ibragimov
  -1 siblings, 0 replies; 57+ messages in thread
From: Rinat Ibragimov @ 2018-04-04 12:48 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: Eero Tamminen, igt-dev, Intel-gfx

>Среда,  4 апреля 2018, 12:48 +03:00 от Tvrtko Ursulin <tursulin@ursulin.net>:
>
>v5:

Tried this version on Skylake on Linux 4.16, and it works. Engines "busy" metric
behaves as expected. Render/3D goes up when something intensive happens on
a screen. Video goes up when I use VA-API-enabled video players, and increases
when I increase speed there. Blitter is typically at 0%, since I use "modesetting" driver.
When using "intel" driver instead, I can see Blitter metric changing too.
Haven't seen VideoEnhance go over 0%, but that's probably due to the player not
using it at all. Power usage and frequency numbers are also having values I'd expect.

Output looks neat, and overall it works great. Thanks!


---
Rinat
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [igt-dev] ✓ Fi.CI.BAT: success for intel-gpu-top: Rewrite the tool to be safe to use (rev4)
  2018-03-28 18:29 ` [Intel-gfx] " Tvrtko Ursulin
                   ` (10 preceding siblings ...)
  (?)
@ 2018-04-04 13:27 ` Patchwork
  -1 siblings, 0 replies; 57+ messages in thread
From: Patchwork @ 2018-04-04 13:27 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev

== Series Details ==

Series: intel-gpu-top: Rewrite the tool to be safe to use (rev4)
URL   : https://patchwork.freedesktop.org/series/40826/
State : success

== Summary ==

IGT patchset tested on top of latest successful build
cad5fc06f954546042a432202cbe7e5a20fe1132 tests/gem_eio: Add reset and unwedge stress testing

with latest DRM-Tip kernel build CI_DRM_4020
4e6fa0d99f8f drm-tip: 2018y-04m-04d-12h-24m-54s UTC integration manifest

No testlist changes.

---- Known issues:

Test kms_frontbuffer_tracking:
        Subgroup basic:
                fail       -> PASS       (fi-cnl-y3) fdo#103167

fdo#103167 https://bugs.freedesktop.org/show_bug.cgi?id=103167

fi-bdw-5557u     total:285  pass:264  dwarn:0   dfail:0   fail:0   skip:21  time:432s
fi-bdw-gvtdvm    total:285  pass:261  dwarn:0   dfail:0   fail:0   skip:24  time:443s
fi-blb-e6850     total:285  pass:220  dwarn:1   dfail:0   fail:0   skip:64  time:382s
fi-bsw-n3050     total:285  pass:239  dwarn:0   dfail:0   fail:0   skip:46  time:543s
fi-bwr-2160      total:285  pass:180  dwarn:0   dfail:0   fail:0   skip:105 time:299s
fi-bxt-dsi       total:285  pass:255  dwarn:0   dfail:0   fail:0   skip:30  time:517s
fi-bxt-j4205     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:512s
fi-byt-j1900     total:285  pass:250  dwarn:0   dfail:0   fail:0   skip:35  time:527s
fi-byt-n2820     total:285  pass:246  dwarn:0   dfail:0   fail:0   skip:39  time:518s
fi-cfl-8700k     total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:410s
fi-cfl-s3        total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:558s
fi-cfl-u         total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:512s
fi-cnl-y3        total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:583s
fi-elk-e7500     total:285  pass:225  dwarn:1   dfail:0   fail:0   skip:59  time:424s
fi-gdg-551       total:285  pass:176  dwarn:0   dfail:0   fail:1   skip:108 time:316s
fi-glk-1         total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:539s
fi-glk-j4005     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:487s
fi-hsw-4770      total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:406s
fi-ilk-650       total:285  pass:225  dwarn:0   dfail:0   fail:0   skip:60  time:422s
fi-ivb-3520m     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:471s
fi-ivb-3770      total:285  pass:252  dwarn:0   dfail:0   fail:0   skip:33  time:433s
fi-kbl-7500u     total:285  pass:260  dwarn:1   dfail:0   fail:0   skip:24  time:473s
fi-kbl-7567u     total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:464s
fi-kbl-r         total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:510s
fi-pnv-d510      total:285  pass:220  dwarn:1   dfail:0   fail:0   skip:64  time:676s
fi-skl-6260u     total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:439s
fi-skl-6600u     total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:534s
fi-skl-6700k2    total:285  pass:261  dwarn:0   dfail:0   fail:0   skip:24  time:510s
fi-skl-6770hq    total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:504s
fi-skl-guc       total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:432s
fi-skl-gvtdvm    total:285  pass:262  dwarn:0   dfail:0   fail:0   skip:23  time:444s
fi-snb-2520m     total:285  pass:245  dwarn:0   dfail:0   fail:0   skip:40  time:600s
fi-snb-2600      total:285  pass:245  dwarn:0   dfail:0   fail:0   skip:40  time:399s
Blacklisted hosts:
fi-cnl-psr       total:285  pass:256  dwarn:3   dfail:0   fail:0   skip:26  time:522s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1222/issues.html
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [igt-dev] [PATCH i-g-t v2] intel-gpu-top: Rewrite the tool to be safe to use
  2018-04-04 12:42               ` [Intel-gfx] " Tvrtko Ursulin
@ 2018-04-04 14:23                 ` Eero Tamminen
  -1 siblings, 0 replies; 57+ messages in thread
From: Eero Tamminen @ 2018-04-04 14:23 UTC (permalink / raw)
  To: Tvrtko Ursulin, Tvrtko Ursulin, igt-dev; +Cc: Intel-gfx

Hi,

On 04.04.2018 15:42, Tvrtko Ursulin wrote:
> On 04/04/2018 13:15, Eero Tamminen wrote:
>> I've now tested v5 with old Ubuntu kernel on KBL, and with latest
>> drm-tip kernel on SNB, HSW, BYT, BSW and BDW GT & GT3.
>>
>>
>> Generic test results
>> --------------------
>>
>> * Tool works on all of them
>>
>> * The new error messages and headings look good
>>
>> * Idle IMC read amounts correspond to expected values on SNB & HSW.
>>    The much smaller values on BDW & SKL are due to FBC (how well
>>    it compresses, naturally depends on screen content).

Contrary to what I assumed earlier, it's not actually uncore bandwidth.
It's RAM bandwidth, I guess the IMC abbreviation is for something
like Intel/Integrated Memory Controller.

Note that it includes also any memory bandwidth used by CPU, and if
the data fits into LLC, it doesn't show it.

However, knowing whether the CPU uses memory bandwidth is actually a useful
thing because RAM bandwidth is a resource shared with the GPU.  One can
check other tasks' bandwidth usage before launching the GPU task.


> Hm OK, you managed to explain it. Because in the meantime I have 
> observed one oddity with write bandwidth on my headless SkullCanyon NUC. 
> It idles around 28MiB/s,

On an idle machine, write bandwidth usage should be zero.

What is causing the writes?


> while when I load it up with some command 
> streamer activity it drops to ~11MiB/s. I don't know, but just feels 
> suspect. (Read bandwidth goes from ~215MiB/s at idle to ~4.5GiB/s in my 
> load case.)

Is it possible that your test load directly affected whatever task
was causing the writes?  E.g. if the write load and the read load
both use the render pipeline, your read load could slow down
the write load (by "flooding" the render pipeline).

The effect could also be indirect. E.g. read bandwidth usage could eat
part of the write bandwidth, as they aren't completely independent
resources.

Or if your test load is very heavy, it could cause TDP limitation
for the whole device, which could drop other tasks a bit.

I would need to know more about what your write load is, to come up
with a good excuse. ;-)


	- Eero

>> BYT & BSW
>> ---------
>>
>> * IMC, power usage and actual(?) freq values are missing.
>>
>> -> You can get actual freq by polling CAGF register, represented by:
>>      /sys/class/drm/card0/gt_act_freq_mhz
> 
> Yep, this is the i915 internal limitation that we cannot expose this for 
> consumption from PMU.
> 
>>
>> Normally i915 driver maps uncore power usage to GPU power usage,
>> but BYT is missing that (and ram power usage).  However, RAPL
>> does report package & core values...
>>
>>
>> Suggestions
>> -----------
>>
>> Maybe on platforms where RAPL doesn't report "uncore" power usage,
>> you could just deduct RAPL reported "core" power consumption from
>> the "package" power consumption, and report that as "GPU" power
>> usage?  (Or do that in i915 directly)
> 
> What are you referring to as "uncore" in the context of RAPL?
> 
> Do I understood correctly you suggested to use "energy-pkg - 
> energy-cores" when "energy-gpu" is not available? If the former two are 
> there both on on BYT and BSW, this sounds okay to me.
> 
>> You need also to either update the manual, or implement -o and -e
> 
> There is a manual, will do!
> 
>> options for the new version of intel_gpu_top.  CSV output of all
>> the reported values would be nice.
> 
> I would prefer to drop both -o and -e, since this is achievable via perf 
> stat. For instance:
> 
> perf stat -a -e power/energy-gpu/,i915/rcs0-busy/ -I 1000 -x, <command>
> 
> Gives CSV samples once per second.
> 
> On the other hand one argument I can think of to actually do implement 
> -o and -e, is that we need to do some extra normalization on some i915 
> counters perf tool would not do.
> 
> I don't have a feeling if anyone is actually using these options. If 
> unlikely, we should probably drop them regardless.
> 
>> You might mention in manual as an example how to calculate
>> idle screen update bandwidth, and that it's impacted by:
>> - PSR (panel self refresh, depends on display supporting it):
>>    /sys/kernel/debug/dri/0/i915_edp_psr_status
>> - FBC (frame buffer compression, enabled on newer GENs)
>>    /sys/kernel/debug/dri/0/i915_fbc_status
>> - end-to-end RBC (render buffer compression, requires modifiers
>>    support i.e. GEN9+ GPU and X & Mesa with DRI3 v1.2 [1] support)
> 
> Sounds useful for users, but I am a bit wary of feature creep. In this 
> specific example I'd want to push it for follow-up work.
> 
> Regards,
> 
> Tvrtko

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [igt-dev] [PATCH i-g-t v2] intel-gpu-top: Rewrite the tool to be safe to use
@ 2018-04-04 14:23                 ` Eero Tamminen
  0 siblings, 0 replies; 57+ messages in thread
From: Eero Tamminen @ 2018-04-04 14:23 UTC (permalink / raw)
  To: Tvrtko Ursulin, Tvrtko Ursulin, igt-dev; +Cc: Intel-gfx, Tvrtko Ursulin

Hi,

On 04.04.2018 15:42, Tvrtko Ursulin wrote:
> On 04/04/2018 13:15, Eero Tamminen wrote:
>> I've now tested v5 with old Ubuntu kernel on KBL, and with latest
>> drm-tip kernel on SNB, HSW, BYT, BSW and BDW GT & GT3.
>>
>>
>> Generic test results
>> --------------------
>>
>> * Tool works on all of them
>>
>> * The new error messages and headings look good
>>
>> * Idle IMC read amounts correspond to expected values on SNB & HSW.
>>    The much smaller values on BDW & SKL are due to FBC (how well
>>    it compresses, naturally depends on screen content).

Contrary to what I assumed earlier, it's not actually uncore bandwidth.
It's RAM bandwidth, I guess the IMC abbreviation is for something
like Intel/Integrated Memory Controller.

Note that it includes also any memory bandwidth used by CPU, and if
the data fits into LLC, it doesn't show it.

However, knowing whether the CPU uses memory bandwidth is actually a useful
thing because RAM bandwidth is a resource shared with the GPU.  One can
check other tasks' bandwidth usage before launching the GPU task.


> Hm OK, you managed to explain it. Because in the meantime I have 
> observed one oddity with write bandwidth on my headless SkullCanyon NUC. 
> It idles around 28MiB/s,

On an idle machine, write bandwidth usage should be zero.

What is causing the writes?


> while when I load it up with some command 
> streamer activity it drops to ~11MiB/s. I don't know, but just feels 
> suspect. (Read bandwidth goes from ~215MiB/s at idle to ~4.5GiB/s in my 
> load case.)

Is it possible that your test load directly affected whatever task
was causing the writes?  E.g. if the write load and the read load
both use the render pipeline, your read load could slow down
the write load (by "flooding" the render pipeline).

The effect could also be indirect. E.g. read bandwidth usage could eat
part of the write bandwidth, as they aren't completely independent
resources.

Or if your test load is very heavy, it could cause TDP limitation
for the whole device, which could drop other tasks a bit.

I would need to know more about what your write load is, to come up
with a good excuse. ;-)


	- Eero

>> BYT & BSW
>> ---------
>>
>> * IMC, power usage and actual(?) freq values are missing.
>>
>> -> You can get actual freq by polling CAGF register, represented by:
>>      /sys/class/drm/card0/gt_act_freq_mhz
> 
> Yep, this is the i915 internal limitation that we cannot expose this for 
> consumption from PMU.
> 
>>
>> Normally i915 driver maps uncore power usage to GPU power usage,
>> but BYT is missing that (and ram power usage).  However, RAPL
>> does report package & core values...
>>
>>
>> Suggestions
>> -----------
>>
>> Maybe on platforms where RAPL doesn't report "uncore" power usage,
>> you could just deduct RAPL reported "core" power consumption from
>> the "package" power consumption, and report that as "GPU" power
>> usage?  (Or do that in i915 directly)
> 
> What are you referring to as "uncore" in the context of RAPL?
> 
> Do I understood correctly you suggested to use "energy-pkg - 
> energy-cores" when "energy-gpu" is not available? If the former two are 
> there both on on BYT and BSW, this sounds okay to me.
> 
>> You need also to either update the manual, or implement -o and -e
> 
> There is a manual, will do!
> 
>> options for the new version of intel_gpu_top.  CSV output of all
>> the reported values would be nice.
> 
> I would prefer to drop both -o and -e, since this is achievable via perf 
> stat. For instance:
> 
> perf stat -a -e power/energy-gpu/,i915/rcs0-busy/ -I 1000 -x, <command>
> 
> Gives CSV samples once per second.
> 
> On the other hand one argument I can think of to actually do implement 
> -o and -e, is that we need to do some extra normalization on some i915 
> counters perf tool would not do.
> 
> I don't have a feeling if anyone is actually using these options. If 
> unlikely, we should probably drop them regardless.
> 
>> You might mention in manual as an example how to calculate
>> idle screen update bandwidth, and that it's impacted by:
>> - PSR (panel self refresh, depends on display supporting it):
>>    /sys/kernel/debug/dri/0/i915_edp_psr_status
>> - FBC (frame buffer compression, enabled on newer GENs)
>>    /sys/kernel/debug/dri/0/i915_fbc_status
>> - end-to-end RBC (render buffer compression, requires modifiers
>>    support i.e. GEN9+ GPU and X & Mesa with DRI3 v1.2 [1] support)
> 
> Sounds useful for users, but I am a bit wary of feature creep. In this 
> specific example I'd want to push it for follow-up work.
> 
> Regards,
> 
> Tvrtko

_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [igt-dev] [PATCH i-g-t v2] intel-gpu-top: Rewrite the tool to be safe to use
  2018-04-04 14:23                 ` Eero Tamminen
@ 2018-04-04 15:24                   ` Tvrtko Ursulin
  -1 siblings, 0 replies; 57+ messages in thread
From: Tvrtko Ursulin @ 2018-04-04 15:24 UTC (permalink / raw)
  To: Eero Tamminen, Tvrtko Ursulin, igt-dev; +Cc: Intel-gfx


On 04/04/2018 15:23, Eero Tamminen wrote:
> Hi,
> 
> On 04.04.2018 15:42, Tvrtko Ursulin wrote:
>> On 04/04/2018 13:15, Eero Tamminen wrote:
>>> I've now tested v5 with old Ubuntu kernel on KBL, and with latest
>>> drm-tip kernel on SNB, HSW, BYT, BSW and BDW GT & GT3.
>>>
>>>
>>> Generic test results
>>> --------------------
>>>
>>> * Tool works on all of them
>>>
>>> * The new error messages and headings look good
>>>
>>> * Idle IMC read amounts correspond to expected values on SNB & HSW.
>>>    The much smaller values on BDW & SKL are due to FBC (how well
>>>    it compresses, naturally depends on screen content).
> 
> Unlike I assumed earlier, it's not actually uncore bandwidth.
> It's RAM bandwidth, I guess the IMC abbreviation is for something
> like Intel/Integrated Memory Controller.
> 
> Note that it includes also any memory bandwidth used by CPU, and if
> the data fits into LLC, it doesn't show it.
> 
> However, knowing whether CPU uses memory bandwidth is actually useful
> thing because RAM bandwidth is a shared resource with GPU.  One can
> check other tasks bandwidth usage before launching the GPU task.
> 
> 
>> Hm OK, you managed to explain it. Because in the meantime I have 
>> observed one oddity with write bandwidth on my headless SkullCanyon 
>> NUC. It idles around 28MiB/s,
> 
> On idle machine, write bandwidth usage should be zero.
> 
> What is causing the writes?

It was probably caused by some of the kernel debug options I had set. Or 
maybe disabled dynticks. With fewer debug options and dynticks I cannot 
reproduce that any more.

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [igt-dev] [PATCH i-g-t v2] intel-gpu-top: Rewrite the tool to be safe to use
@ 2018-04-04 15:24                   ` Tvrtko Ursulin
  0 siblings, 0 replies; 57+ messages in thread
From: Tvrtko Ursulin @ 2018-04-04 15:24 UTC (permalink / raw)
  To: Eero Tamminen, Tvrtko Ursulin, igt-dev; +Cc: Intel-gfx, Tvrtko Ursulin


On 04/04/2018 15:23, Eero Tamminen wrote:
> Hi,
> 
> On 04.04.2018 15:42, Tvrtko Ursulin wrote:
>> On 04/04/2018 13:15, Eero Tamminen wrote:
>>> I've now tested v5 with old Ubuntu kernel on KBL, and with latest
>>> drm-tip kernel on SNB, HSW, BYT, BSW and BDW GT & GT3.
>>>
>>>
>>> Generic test results
>>> --------------------
>>>
>>> * Tool works on all of them
>>>
>>> * The new error messages and headings look good
>>>
>>> * Idle IMC read amounts correspond to expected values on SNB & HSW.
>>>    The much smaller values on BDW & SKL are due to FBC (how well
>>>    it compresses, naturally depends on screen content).
> 
> Unlike I assumed earlier, it's not actually uncore bandwidth.
> It's RAM bandwidth, I guess the IMC abbreviation is for something
> like Intel/Integrated Memory Controller.
> 
> Note that it includes also any memory bandwidth used by CPU, and if
> the data fits into LLC, it doesn't show it.
> 
> However, knowing whether CPU uses memory bandwidth is actually useful
> thing because RAM bandwidth is a shared resource with GPU.  One can
> check other tasks bandwidth usage before launching the GPU task.
> 
> 
>> Hm OK, you managed to explain it. Because in the meantime I have 
>> observed one oddity with write bandwidth on my headless SkullCanyon 
>> NUC. It idles around 28MiB/s,
> 
> On idle machine, write bandwidth usage should be zero.
> 
> What is causing the writes?

It was probably caused by some of the kernel debug options I had set. Or 
maybe disabled dynticks. With fewer debug options and dynticks I cannot 
reproduce that any more.

Regards,

Tvrtko
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [PATCH i-g-t v6] intel-gpu-top: Rewrite the tool to be safe to use
  2018-04-04  9:48   ` [igt-dev] " Tvrtko Ursulin
@ 2018-04-04 15:26     ` Tvrtko Ursulin
  -1 siblings, 0 replies; 57+ messages in thread
From: Tvrtko Ursulin @ 2018-04-04 15:26 UTC (permalink / raw)
  To: igt-dev; +Cc: Rinat Ibragimov, Eero Tamminen, Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

intel-gpu-top is a dangerous tool which can hang machines due to unsafe mmio
register access. This patch rewrites it to use only PMU.

Only overall command streamer busyness and GPU global data such as power
and frequencies are included in this new version.

For access to more GPU functional unit level data, an OA metric based tool
like gpu-top should be used instead.

v2:
 * Sort engines by class and instance.
 * Do not wait for one sampling period to display something on screen.
 * Move code out of the asserts. (Rinat Ibragimov)
 * Continuously adapt to terminal size. (Rinat Ibragimov)

v3:
 * Change layout and precision of some fields. (Chris Wilson)
 Eero Tamminen:
 * Use more user friendly engine names.
 * Don't error out if a counter is missing.
 * Add IMC read/write bandwidth.
 * Report minimum required kernel version.

v4:
 * Really support 4.16 by skipping missing engines.
 * Simpler and less hacky float printing.
 * Preserve copyright header. (Antonio Argenziano)
 * Simplify engines_ptr macro. (Rinat Ibragimov)

v5:
 * Get RAPL unit from sysfs.
 * Consolidate sysfs paths with a macro.
 * Tidy error handling by carrying over and reporting errno.
 * Check against console height on all prints.
 * More readable minimum kernel version message. (Eero Tamminen)
 * Column banner for per engine stats. (Eero Tamminen)

v6:
 * Man page update. (Eero Tamminen)

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: Petri Latvala <petri.latvala@intel.com>
Cc: Eero Tamminen <eero.t.tamminen@intel.com>
Cc: Rinat Ibragimov <ibragimovrinat@mail.ru>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> # v1
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> # v0.5
---
 lib/igt_perf.c        |    6 +
 lib/igt_perf.h        |    1 +
 man/intel_gpu_top.rst |   41 +-
 tools/Makefile.am     |    2 +
 tools/intel_gpu_top.c | 1250 +++++++++++++++++++++++++++----------------------
 tools/meson.build     |    6 +-
 6 files changed, 719 insertions(+), 587 deletions(-)

diff --git a/lib/igt_perf.c b/lib/igt_perf.c
index 99d82ea51c9b..e3dec2cc29c7 100644
--- a/lib/igt_perf.c
+++ b/lib/igt_perf.c
@@ -69,3 +69,9 @@ int igt_perf_open(uint64_t type, uint64_t config)
 	return _perf_open(type, config, -1,
 			  PERF_FORMAT_TOTAL_TIME_ENABLED);
 }
+
+int igt_perf_open_group(uint64_t type, uint64_t config, int group)
+{
+	return _perf_open(type, config, group,
+			  PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_GROUP);
+}
diff --git a/lib/igt_perf.h b/lib/igt_perf.h
index 614ea5d23fa6..e00718f4769a 100644
--- a/lib/igt_perf.h
+++ b/lib/igt_perf.h
@@ -55,5 +55,6 @@ uint64_t i915_type_id(void);
 int perf_i915_open(uint64_t config);
 int perf_i915_open_group(uint64_t config, int group);
 int igt_perf_open(uint64_t type, uint64_t config);
+int igt_perf_open_group(uint64_t type, uint64_t config, int group);
 
 #endif /* I915_PERF_H */
diff --git a/man/intel_gpu_top.rst b/man/intel_gpu_top.rst
index a5f7175bb1a0..19c712307d28 100644
--- a/man/intel_gpu_top.rst
+++ b/man/intel_gpu_top.rst
@@ -7,9 +7,9 @@ Display a top-like summary of Intel GPU usage
 ---------------------------------------------
 .. include:: defs.rst
 :Author: IGT Developers <igt-dev@lists.freedesktop.org>
-:Date: 2016-03-01
+:Date: 2018-04-04
 :Version: |PACKAGE_STRING|
-:Copyright: 2009,2011,2012,2016 Intel Corporation
+:Copyright: 2009,2011,2012,2016,2018 Intel Corporation
 :Manual section: |MANUAL_SECTION|
 :Manual group: |MANUAL_GROUP|
 
@@ -21,42 +21,25 @@ SYNOPSIS
 DESCRIPTION
 ===========
 
-**intel_gpu_top** is a tool to display usage information of an Intel GPU. It
-requires root privilege to map the graphics device.
+**intel_gpu_top** is a tool to display usage information on Intel GPUs.
+
+The tool gathers data using perf performance counters (PMU) exposed by i915 and other platform drivers like RAPL (power) and Uncore IMC (memory bandwidth).
 
 OPTIONS
 =======
 
--s SAMPLES
-    Number of samples to acquire per second.
-
--o FILE
-    Collect usage statistics to FILE. If file is "-", run non-interactively
-    and output statistics to stdout.
-
--e COMMAND
-    Execute COMMAND to profile, and leave when it is finished. Note that the
-    entire command with all parameters should be included as one parameter.
+-s <ms>
+    Refresh period in milliseconds.
 
 -h
-    Show usage notes.
+    Show help text.
 
-EXAMPLES
-========
-
-intel_gpu_top -o "cairo-trace-gvim.log" -s 100 -e "cairo-perf-trace /tmp/gvim"
-    Run cairo-perf-trace with /tmp/gvim trace, non-interactively, saving the
-    statistics into cairo-trace-gvim.log file, and collecting 100 samples per
-    second.
-
-Note that idle units are not displayed, so an entirely idle GPU will only
-display the ring status and header.
+LIMITATIONS
+===========
 
-BUGS
-====
+* Not all metrics are supported on all platforms. Where a metric is unsupported its value will be replaced by a dashed line.
 
-Some GPUs report some units as busy when they aren't, such that even when idle
-and not hung, it will show up as 100% busy.
+* Non-root access to perf counters is controlled by the *perf_event_paranoid* sysctl.
 
 REPORTING BUGS
 ==============
diff --git a/tools/Makefile.am b/tools/Makefile.am
index 09b6dbcc3ece..a0b016ddd7ff 100644
--- a/tools/Makefile.am
+++ b/tools/Makefile.am
@@ -28,6 +28,8 @@ intel_aubdump_la_LDFLAGS = -module -avoid-version -no-undefined
 intel_aubdump_la_SOURCES = aubdump.c
 intel_aubdump_la_LIBADD = $(top_builddir)/lib/libintel_tools.la -ldl
 
+intel_gpu_top_LDADD = $(top_builddir)/lib/libigt_perf.la
+
 bin_SCRIPTS = intel_aubdump
 CLEANFILES = $(bin_SCRIPTS)
 
diff --git a/tools/intel_gpu_top.c b/tools/intel_gpu_top.c
index 098e6ce3ff86..b923c3cfbe97 100644
--- a/tools/intel_gpu_top.c
+++ b/tools/intel_gpu_top.c
@@ -1,6 +1,5 @@
 /*
- * Copyright © 2007 Intel Corporation
- * Copyright © 2011 Intel Corporation
+ * Copyright © 2007-2018 Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -24,695 +23,832 @@
  * Authors:
  *    Eric Anholt <eric@anholt.net>
  *    Eugeni Dodonov <eugeni.dodonov@intel.com>
- *
  */
 
-#include "config.h"
-
-#include <inttypes.h>
-#include <unistd.h>
-#include <stdlib.h>
 #include <stdio.h>
-#include <err.h>
-#include <sys/ioctl.h>
-#include <sys/time.h>
-#include <sys/wait.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <stdint.h>
+#include <assert.h>
 #include <string.h>
-#ifdef HAVE_TERMIOS_H
-#include <termios.h>
-#endif
-#include "intel_io.h"
-#include "instdone.h"
-#include "intel_reg.h"
-#include "intel_chipset.h"
-#include "drmtest.h"
-
-#define  FORCEWAKE	    0xA18C
-#define  FORCEWAKE_ACK	    0x130090
-
-#define SAMPLES_PER_SEC             10000
-#define SAMPLES_TO_PERCENT_RATIO    (SAMPLES_PER_SEC / 100)
-
-#define MAX_NUM_TOP_BITS            100
-
-#define HAS_STATS_REGS(devid)		IS_965(devid)
-
-struct top_bit {
-	struct instdone_bit *bit;
-	int count;
-} top_bits[MAX_NUM_TOP_BITS];
-struct top_bit *top_bits_sorted[MAX_NUM_TOP_BITS];
-
-static uint32_t instdone, instdone1;
-
-static const char *bars[] = {
-	" ",
-	"▏",
-	"▎",
-	"▍",
-	"▌",
-	"▋",
-	"▊",
-	"▉",
-	"█"
-};
+#include <ctype.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <sys/ioctl.h>
+#include <errno.h>
+#include <math.h>
+#include <locale.h>
+
+#include "igt_perf.h"
 
-enum stats_counts {
-	IA_VERTICES,
-	IA_PRIMITIVES,
-	VS_INVOCATION,
-	GS_INVOCATION,
-	GS_PRIMITIVES,
-	CL_INVOCATION,
-	CL_PRIMITIVES,
-	PS_INVOCATION,
-	PS_DEPTH,
-	STATS_COUNT
+struct pmu_pair {
+	uint64_t cur;
+	uint64_t prev;
 };
 
-const uint32_t stats_regs[STATS_COUNT] = {
-	IA_VERTICES_COUNT_QW,
-	IA_PRIMITIVES_COUNT_QW,
-	VS_INVOCATION_COUNT_QW,
-	GS_INVOCATION_COUNT_QW,
-	GS_PRIMITIVES_COUNT_QW,
-	CL_INVOCATION_COUNT_QW,
-	CL_PRIMITIVES_COUNT_QW,
-	PS_INVOCATION_COUNT_QW,
-	PS_DEPTH_COUNT_QW,
+struct pmu_counter {
+	bool present;
+	uint64_t config;
+	unsigned int idx;
+	struct pmu_pair val;
 };
 
-const char *stats_reg_names[STATS_COUNT] = {
-	"vert fetch",
-	"prim fetch",
-	"VS invocations",
-	"GS invocations",
-	"GS prims",
-	"CL invocations",
-	"CL prims",
-	"PS invocations",
-	"PS depth pass",
+struct engine {
+	const char *name;
+	const char *display_name;
+
+	unsigned int class;
+	unsigned int instance;
+
+	unsigned int num_counters;
+
+	struct pmu_counter busy;
+	struct pmu_counter wait;
+	struct pmu_counter sema;
 };
 
-uint64_t stats[STATS_COUNT];
-uint64_t last_stats[STATS_COUNT];
+struct engines {
+	unsigned int num_engines;
+	unsigned int num_counters;
+	DIR *root;
+	int fd;
+	struct pmu_pair ts;
+
+	int rapl_fd;
+	double rapl_scale;
+	const char *rapl_unit;
+
+	int imc_fd;
+	double imc_reads_scale;
+	const char *imc_reads_unit;
+	double imc_writes_scale;
+	const char *imc_writes_unit;
+
+	struct pmu_counter freq_req;
+	struct pmu_counter freq_act;
+	struct pmu_counter irq;
+	struct pmu_counter rc6;
+	struct pmu_counter rapl;
+	struct pmu_counter imc_reads;
+	struct pmu_counter imc_writes;
+
+	struct engine engine;
+};
 
-static unsigned long
-gettime(void)
+static uint64_t
+get_pmu_config(int dirfd, const char *name, const char *counter)
 {
-    struct timeval t;
-    gettimeofday(&t, NULL);
-    return (t.tv_usec + (t.tv_sec * 1000000));
-}
+	char buf[128], *p;
+	int fd, ret;
 
-static int
-top_bits_sort(const void *a, const void *b)
-{
-	struct top_bit * const *bit_a = a;
-	struct top_bit * const *bit_b = b;
-	int a_count = (*bit_a)->count;
-	int b_count = (*bit_b)->count;
+	ret = snprintf(buf, sizeof(buf), "%s-%s", name, counter);
+	if (ret < 0 || ret == sizeof(buf))
+		return -1;
 
-	if (a_count < b_count)
-		return 1;
-	else if (a_count == b_count)
-		return 0;
-	else
+	fd = openat(dirfd, buf, O_RDONLY);
+	if (fd < 0)
 		return -1;
-}
 
-static void
-update_idle_bit(struct top_bit *top_bit)
-{
-	uint32_t reg_val;
+	ret = read(fd, buf, sizeof(buf));
+	close(fd);
+	if (ret <= 0)
+		return -1;
 
-	if (top_bit->bit->reg == INSTDONE_1)
-		reg_val = instdone1;
-	else
-		reg_val = instdone;
+	p = index(buf, '0');
+	if (!p)
+		return -1;
 
-	if ((reg_val & top_bit->bit->bit) == 0)
-		top_bit->count++;
+	return strtoul(p, NULL, 0);
 }
 
-static void
-print_clock(const char *name, int clock) {
-	if (clock == -1)
-		printf("%s clock: unknown", name);
+#define engine_ptr(engines, n) (&engines->engine + (n))
+
+static const char *class_display_name(unsigned int class)
+{
+	switch (class) {
+	case I915_ENGINE_CLASS_RENDER:
+		return "Render/3D";
+	case I915_ENGINE_CLASS_COPY:
+		return "Blitter";
+	case I915_ENGINE_CLASS_VIDEO:
+		return "Video";
+	case I915_ENGINE_CLASS_VIDEO_ENHANCE:
+		return "VideoEnhance";
+	default:
+		return "[unknown]";
+	}
+}
+
+static int engine_cmp(const void *__a, const void *__b)
+{
+	const struct engine *a = (struct engine *)__a;
+	const struct engine *b = (struct engine *)__b;
+
+	if (a->class != b->class)
+		return a->class - b->class;
 	else
-		printf("%s clock: %d Mhz", name, clock);
+		return a->instance - b->instance;
 }
 
-static int
-print_clock_info(struct pci_device *pci_dev)
+static struct engines *discover_engines(void)
 {
-	uint32_t devid = pci_dev->device_id;
-	uint16_t gcfgc;
+	const char *sysfs_root = "/sys/devices/i915/events";
+	struct engines *engines;
+	struct dirent *dent;
+	int ret = 0;
+	DIR *d;
 
-	if (IS_GM45(devid)) {
-		int core_clock = -1;
+	engines = malloc(sizeof(struct engines));
+	if (!engines)
+		return NULL;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+	memset(engines, 0, sizeof(*engines));
 
-		switch (gcfgc & 0xf) {
-		case 8:
-			core_clock = 266;
-			break;
-		case 9:
-			core_clock = 320;
-			break;
-		case 11:
-			core_clock = 400;
-			break;
-		case 13:
-			core_clock = 533;
-			break;
-		}
-		print_clock("core", core_clock);
-	} else if (IS_965(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, sampler_clock = -1;
+	engines->num_engines = 0;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+	d = opendir(sysfs_root);
+	if (!d)
+		return NULL;
 
-		switch (gcfgc & 0xf) {
-		case 2:
-			render_clock = 250; sampler_clock = 267;
-			break;
-		case 3:
-			render_clock = 320; sampler_clock = 333;
-			break;
-		case 4:
-			render_clock = 400; sampler_clock = 444;
-			break;
-		case 5:
-			render_clock = 500; sampler_clock = 533;
+	while ((dent = readdir(d)) != NULL) {
+		const char *endswith = "-busy";
+		const unsigned int endlen = strlen(endswith);
+		struct engine *engine =
+				engine_ptr(engines, engines->num_engines);
+		char buf[256];
+
+		if (dent->d_type != DT_REG)
+			continue;
+
+		if (strlen(dent->d_name) >= sizeof(buf)) {
+			ret = ENAMETOOLONG;
 			break;
 		}
 
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("sampler", sampler_clock);
-	} else if (IS_945(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, display_clock = -1;
+		strcpy(buf, dent->d_name);
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+		/* xxxN-busy */
+		if (strlen(buf) < (endlen + 4))
+			continue;
+		if (strcmp(&buf[strlen(buf) - endlen], endswith))
+			continue;
 
-		switch (gcfgc & 0x7) {
-		case 0:
-			render_clock = 166;
-			break;
-		case 1:
-			render_clock = 200;
-			break;
-		case 3:
-			render_clock = 250;
-			break;
-		case 5:
-			render_clock = 400;
+		memset(engine, 0, sizeof(*engine));
+
+		buf[strlen(buf) - endlen] = 0;
+		engine->name = strdup(buf);
+		if (!engine->name) {
+			ret = errno;
 			break;
 		}
 
-		switch (gcfgc & 0x70) {
-		case 0:
-			display_clock = 200;
-			break;
-		case 4:
-			display_clock = 320;
+		engine->busy.config = get_pmu_config(dirfd(d), engine->name,
+						     "busy");
+		if (engine->busy.config == -1) {
+			ret = ENOENT;
 			break;
 		}
-		if (gcfgc & (1 << 7))
-		    display_clock = 133;
 
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("display", display_clock);
-	} else if (IS_915(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, display_clock = -1;
+		engine->class = (engine->busy.config &
+				 (__I915_PMU_OTHER(0) - 1)) >>
+				I915_PMU_CLASS_SHIFT;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+		engine->instance = (engine->busy.config >>
+				    I915_PMU_SAMPLE_BITS) &
+				    ((1 << I915_PMU_SAMPLE_INSTANCE_BITS) - 1);
 
-		switch (gcfgc & 0x7) {
-		case 0:
-			render_clock = 160;
-			break;
-		case 1:
-			render_clock = 190;
-			break;
-		case 4:
-			render_clock = 333;
+		ret = snprintf(buf, sizeof(buf), "%s/%u",
+			       class_display_name(engine->class),
+			       engine->instance);
+		if (ret < 0 || ret == sizeof(buf)) {
+			ret = ENOBUFS;
 			break;
 		}
-		if (gcfgc & (1 << 13))
-		    render_clock = 133;
+		ret = 0;
 
-		switch (gcfgc & 0x70) {
-		case 0:
-			display_clock = 190;
+		engine->display_name = strdup(buf);
+		if (!engine->display_name) {
+			ret = errno;
 			break;
-		case 4:
-			display_clock = 333;
+		}
+
+		engines->num_engines++;
+		engines = realloc(engines, sizeof(struct engines) +
+				  engines->num_engines * sizeof(struct engine));
+		if (!engines) {
+			ret = errno;
 			break;
 		}
-		if (gcfgc & (1 << 7))
-		    display_clock = 133;
+	}
+
+	if (ret) {
+		free(engines);
+		errno = ret;
 
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("display", display_clock);
+		return NULL;
 	}
 
+	qsort(engine_ptr(engines, 0), engines->num_engines,
+	      sizeof(struct engine), engine_cmp);
+
+	engines->root = d;
 
-	printf("\n");
-	return -1;
+	return engines;
 }
 
-#define STATS_LEN (20)
-#define PERCENTAGE_BAR_END	(79 - STATS_LEN)
+static int
+filename_to_buf(const char *filename, char *buf, unsigned int bufsize)
+{
+	int fd, err;
+	ssize_t ret;
 
-static void
-print_percentage_bar(float percent, int cur_line_len)
+	fd = open(filename, O_RDONLY);
+	if (fd < 0)
+		return -1;
+
+	ret = read(fd, buf, bufsize - 1);
+	err = errno;
+	close(fd);
+	if (ret < 1) {
+		errno = ret < 0 ? err : ENOMSG;
+
+		return -1;
+	}
+
+	if (ret > 1 && buf[ret - 1] == '\n')
+		buf[ret - 1] = '\0';
+	else
+		buf[ret] = '\0';
+
+	return 0;
+}
+
+static uint64_t filename_to_u64(const char *filename, int base)
 {
-	int bar_avail_len = (PERCENTAGE_BAR_END - cur_line_len - 1) * 8;
-	int bar_len = bar_avail_len * (percent + .5) / 100.0;
-	int i;
+	char buf[64], *b;
 
-	for (i = bar_len; i >= 8; i -= 8) {
-		printf("%s", bars[8]);
-		cur_line_len++;
+	if (filename_to_buf(filename, buf, sizeof(buf)))
+		return 0;
+
+	/*
+	 * Handle both single integer and key=value formats by skipping
+	 * leading non-digits.
+	 */
+	b = buf;
+	while (*b && !isdigit(*b))
+		b++;
+
+	return strtoull(b, NULL, base);
+}
+
+static double filename_to_double(const char *filename)
+{
+	char *oldlocale;
+	char buf[80];
+	double v;
+
+	if (filename_to_buf(filename, buf, sizeof(buf)))
+		return 0;
+
+	oldlocale = setlocale(LC_ALL, "C");
+	v = strtod(buf, NULL);
+	setlocale(LC_ALL, oldlocale);
+
+	return v;
+}
+
+#define RAPL_ROOT "/sys/devices/power/"
+#define RAPL_EVENT "/sys/devices/power/events/"
+
+static uint64_t rapl_type_id(void)
+{
+	return filename_to_u64(RAPL_ROOT "type", 10);
+}
+
+static uint64_t rapl_gpu_power(void)
+{
+	return filename_to_u64(RAPL_EVENT "energy-gpu", 0);
+}
+
+static double rapl_gpu_power_scale(void)
+{
+	return filename_to_double(RAPL_EVENT "energy-gpu.scale");
+}
+
+static const char *rapl_gpu_power_unit(void)
+{
+	char buf[32];
+
+	if (filename_to_buf(RAPL_EVENT "energy-gpu.unit",
+			    buf, sizeof(buf)) == 0)
+		if (!strcmp(buf, "Joules"))
+			return strdup("Watts");
+		else
+			return strdup(buf);
+	else
+		return NULL;
+}
+
+#define IMC_ROOT "/sys/devices/uncore_imc/"
+#define IMC_EVENT "/sys/devices/uncore_imc/events/"
+
+static uint64_t imc_type_id(void)
+{
+	return filename_to_u64(IMC_ROOT "type", 10);
+}
+
+static uint64_t imc_data_reads(void)
+{
+	return filename_to_u64(IMC_EVENT "data_reads", 0);
+}
+
+static double imc_data_reads_scale(void)
+{
+	return filename_to_double(IMC_EVENT "data_reads.scale");
+}
+
+static const char *imc_data_reads_unit(void)
+{
+	char buf[32];
+
+	if (filename_to_buf(IMC_EVENT "data_reads.unit", buf, sizeof(buf)) == 0)
+		return strdup(buf);
+	else
+		return NULL;
+}
+
+static uint64_t imc_data_writes(void)
+{
+	return filename_to_u64(IMC_EVENT "data_writes", 0);
+}
+
+static double imc_data_writes_scale(void)
+{
+	return filename_to_double(IMC_EVENT "data_writes.scale");
+}
+
+static const char *imc_data_writes_unit(void)
+{
+	char buf[32];
+
+	if (filename_to_buf(IMC_EVENT "data_writes.unit",
+			    buf, sizeof(buf)) == 0)
+		return strdup(buf);
+	else
+		return NULL;
+}
+
+#define _open_pmu(cnt, pmu, fd) \
+({ \
+	int fd__; \
+\
+	fd__ = perf_i915_open_group((pmu)->config, (fd)); \
+	if (fd__ >= 0) { \
+		if ((fd) == -1) \
+			(fd) = fd__; \
+		(pmu)->present = true; \
+		(pmu)->idx = (cnt)++; \
+	} \
+\
+	fd__; \
+})
+
+#define _open_imc(cnt, pmu, fd) \
+({ \
+	int fd__; \
+\
+	fd__ = igt_perf_open_group(imc_type_id(), (pmu)->config, (fd)); \
+	if (fd__ >= 0) { \
+		if ((fd) == -1) \
+			(fd) = fd__; \
+		(pmu)->present = true; \
+		(pmu)->idx = (cnt)++; \
+	} \
+\
+	fd__; \
+})
+
+static int pmu_init(struct engines *engines)
+{
+	unsigned int i;
+	int fd;
+
+	engines->fd = -1;
+	engines->num_counters = 0;
+
+	engines->irq.config = I915_PMU_INTERRUPTS;
+	fd = _open_pmu(engines->num_counters, &engines->irq, engines->fd);
+	if (fd < 0)
+		return -1;
+
+	engines->freq_req.config = I915_PMU_REQUESTED_FREQUENCY;
+	_open_pmu(engines->num_counters, &engines->freq_req, engines->fd);
+
+	engines->freq_act.config = I915_PMU_ACTUAL_FREQUENCY;
+	_open_pmu(engines->num_counters, &engines->freq_act, engines->fd);
+
+	engines->rc6.config = I915_PMU_RC6_RESIDENCY;
+	_open_pmu(engines->num_counters, &engines->rc6, engines->fd);
+
+	for (i = 0; i < engines->num_engines; i++) {
+		struct engine *engine = engine_ptr(engines, i);
+		struct {
+			struct pmu_counter *pmu;
+			const char *counter;
+		} *cnt, counters[] = {
+			{ .pmu = &engine->busy, .counter = "busy" },
+			{ .pmu = &engine->wait, .counter = "wait" },
+			{ .pmu = &engine->sema, .counter = "sema" },
+			{ .pmu = NULL, .counter = NULL },
+		};
+
+		for (cnt = counters; cnt->pmu; cnt++) {
+			if (!cnt->pmu->config)
+				cnt->pmu->config =
+					get_pmu_config(dirfd(engines->root),
+						       engine->name,
+						       cnt->counter);
+			fd = _open_pmu(engines->num_counters, cnt->pmu,
+				       engines->fd);
+			if (fd >= 0)
+				engine->num_counters++;
+		}
 	}
-	if (i) {
-		printf("%s", bars[i]);
-		cur_line_len++;
+
+	engines->rapl_fd = -1;
+	if (rapl_type_id()) {
+		engines->rapl_scale = rapl_gpu_power_scale();
+		engines->rapl_unit = rapl_gpu_power_unit();
+		if (!engines->rapl_unit)
+			return -1;
+
+		engines->rapl.config = rapl_gpu_power();
+		if (!engines->rapl.config)
+			return -1;
+
+		engines->rapl_fd = igt_perf_open(rapl_type_id(),
+						 engines->rapl.config);
+		if (engines->rapl_fd < 0)
+			return -1;
+
+		engines->rapl.present = true;
 	}
 
-	/* NB: We can't use a field width with utf8 so we manually
-	* guarantee a field with of 45 chars for any bar. */
-	printf("%*s", PERCENTAGE_BAR_END - cur_line_len, "");
-}
+	engines->imc_fd = -1;
+	if (imc_type_id()) {
+		unsigned int num = 0;
 
-struct ring {
-	const char *name;
-	uint32_t mmio;
-	int head, tail, size;
-	uint64_t full;
-	int idle;
-};
+		engines->imc_reads_scale = imc_data_reads_scale();
+		engines->imc_writes_scale = imc_data_writes_scale();
+
+		engines->imc_reads_unit = imc_data_reads_unit();
+		if (!engines->imc_reads_unit)
+			return -1;
+
+		engines->imc_writes_unit = imc_data_writes_unit();
+		if (!engines->imc_writes_unit)
+			return -1;
+
+		engines->imc_reads.config = imc_data_reads();
+		if (!engines->imc_reads.config)
+			return -1;
+
+		engines->imc_writes.config = imc_data_writes();
+		if (!engines->imc_writes.config)
+			return -1;
+
+		fd = _open_imc(num, &engines->imc_reads, engines->imc_fd);
+		if (fd < 0)
+			return -1;
+		fd = _open_imc(num, &engines->imc_writes, engines->imc_fd);
+		if (fd < 0)
+			return -1;
+
+		engines->imc_reads.present = true;
+		engines->imc_writes.present = true;
+	}
+
+	return 0;
+}
 
-static uint32_t ring_read(struct ring *ring, uint32_t reg)
+static uint64_t pmu_read_multi(int fd, unsigned int num, uint64_t *val)
 {
-	return INREG(ring->mmio + reg);
+	uint64_t buf[2 + num];
+	unsigned int i;
+	ssize_t len;
+
+	memset(buf, 0, sizeof(buf));
+
+	len = read(fd, buf, sizeof(buf));
+	assert(len == sizeof(buf));
+
+	for (i = 0; i < num; i++)
+		val[i] = buf[2 + i];
+
+	return buf[1];
 }
 
-static void ring_init(struct ring *ring)
+static double __pmu_calc(struct pmu_pair *p, double d, double t, double s)
 {
-	ring->size = (((ring_read(ring, RING_LEN) & RING_NR_PAGES) >> 12) + 1) * 4096;
+	double v;
+
+	v = p->cur - p->prev;
+	v /= d;
+	v /= t;
+	v *= s;
+
+	if (s == 100.0 && v > 100.0)
+		v = 100.0;
+
+	return v;
 }
 
-static void ring_reset(struct ring *ring)
+static void fill_str(char *buf, unsigned int bufsz, char c, unsigned int num)
 {
-	ring->idle = ring->full = 0;
+	unsigned int i;
+
+	for (i = 0; i < num && i < (bufsz - 1); i++)
+		*buf++ = c;
+
+	*buf = 0;
 }
 
-static void ring_sample(struct ring *ring)
+static void pmu_calc(struct pmu_counter *cnt,
+		     char *buf, unsigned int bufsz,
+		     unsigned int width, unsigned width_dec,
+		     double d, double t, double s)
 {
-	int full;
+	double val;
+	int len;
+
+	assert(bufsz >= (width + width_dec + 1));
+
+	if (!cnt->present) {
+		fill_str(buf, bufsz, '-', width + width_dec);
+		return;
+	}
 
-	if (!ring->size)
+	val = __pmu_calc(&cnt->val, d, t, s);
+
+	len = snprintf(buf, bufsz, "%*.*f", width + width_dec, width_dec, val);
+	if (len < 0 || len == bufsz) {
+		fill_str(buf, bufsz, 'X', width + width_dec);
 		return;
+	}
+}
+
+static uint64_t __pmu_read_single(int fd, uint64_t *ts)
+{
+	uint64_t data[2] = { };
+	ssize_t len;
 
-	ring->head = ring_read(ring, RING_HEAD) & HEAD_ADDR;
-	ring->tail = ring_read(ring, RING_TAIL) & TAIL_ADDR;
+	len = read(fd, data, sizeof(data));
+	assert(len == sizeof(data));
 
-	if (ring->tail == ring->head)
-		ring->idle++;
+	if (ts)
+		*ts = data[1];
 
-	full = ring->tail - ring->head;
-	if (full < 0)
-		full += ring->size;
-	ring->full += full;
+	return data[0];
 }
 
-static void ring_print_header(FILE *out, struct ring *ring)
+static uint64_t pmu_read_single(int fd)
 {
-    fprintf(out, "%.6s%%\tops\t",
-            ring->name
-          );
+	return __pmu_read_single(fd, NULL);
 }
 
-static void ring_print(struct ring *ring, unsigned long samples_per_sec)
+static void __update_sample(struct pmu_counter *counter, uint64_t val)
 {
-	int percent_busy, len;
+	counter->val.prev = counter->val.cur;
+	counter->val.cur = val;
+}
 
-	if (!ring->size)
-		return;
+static void update_sample(struct pmu_counter *counter, uint64_t *val)
+{
+	if (counter->present)
+		__update_sample(counter, val[counter->idx]);
+}
+
+static void pmu_sample(struct engines *engines)
+{
+	const int num_val = engines->num_counters;
+	uint64_t val[2 + num_val];
+	unsigned int i;
+
+	engines->ts.prev = engines->ts.cur;
+
+	if (engines->rapl_fd >= 0)
+		__update_sample(&engines->rapl,
+				pmu_read_single(engines->rapl_fd));
+
+	if (engines->imc_fd >= 0) {
+		pmu_read_multi(engines->imc_fd, 2, val);
+		update_sample(&engines->imc_reads, val);
+		update_sample(&engines->imc_writes, val);
+	}
 
-	percent_busy = 100 - 100 * ring->idle / samples_per_sec;
+	engines->ts.cur = pmu_read_multi(engines->fd, num_val, val);
 
-	len = printf("%25s busy: %3d%%: ", ring->name, percent_busy);
-	print_percentage_bar (percent_busy, len);
-	printf("%24s space: %d/%d\n",
-		   ring->name,
-		   (int)(ring->full / samples_per_sec),
-		   ring->size);
+	update_sample(&engines->freq_req, val);
+	update_sample(&engines->freq_act, val);
+	update_sample(&engines->irq, val);
+	update_sample(&engines->rc6, val);
+
+	for (i = 0; i < engines->num_engines; i++) {
+		struct engine *engine = engine_ptr(engines, i);
+
+		update_sample(&engine->busy, val);
+		update_sample(&engine->sema, val);
+		update_sample(&engine->wait, val);
+	}
 }
 
-static void ring_log(struct ring *ring, unsigned long samples_per_sec,
-		FILE *output)
+static const char *bars[] = { " ", "▏", "▎", "▍", "▌", "▋", "▊", "▉", "█" };
+
+static void
+print_percentage_bar(double percent, int max_len)
 {
-	if (ring->size)
-		fprintf(output, "%3d\t%d\t",
-			(int)(100 - 100 * ring->idle / samples_per_sec),
-			(int)(ring->full / samples_per_sec));
-	else
-		fprintf(output, "-1\t-1\t");
+	int bar_len = percent * (8 * (max_len - 2)) / 100.0;
+	int i;
+
+	putchar('|');
+
+	for (i = bar_len; i >= 8; i -= 8)
+		printf("%s", bars[8]);
+	if (i)
+		printf("%s", bars[i]);
+
+	for (i = 0; i < (max_len - 2 - (bar_len + 7) / 8); i++)
+		putchar(' ');
+
+	putchar('|');
 }
 
+#define DEFAULT_PERIOD_MS (1000)
+
 static void
 usage(const char *appname)
 {
 	printf("intel_gpu_top - Display a top-like summary of Intel GPU usage\n"
-			"\n"
-			"usage: %s [parameters]\n"
-			"\n"
-			"The following parameters apply:\n"
-			"[-s <samples>]       samples per seconds (default %d)\n"
-			"[-e <command>]       command to profile\n"
-			"[-o <file>]          output statistics to file. If file is '-',"
-			"                     run in batch mode and output statistics to stdio only \n"
-			"[-h]                 show this help screen\n"
-			"\n",
-			appname,
-			SAMPLES_PER_SEC
-		  );
-	return;
+		"\n"
+		"Usage: %s [parameters]\n"
+		"\n"
+		"\tThe following parameters are optional:\n\n"
+		"\t[-s <ms>]       Refresh period in milliseconds (default %ums).\n"
+		"\t[-h]            Show this help text.\n"
+		"\n",
+		appname, DEFAULT_PERIOD_MS);
 }
 
 int main(int argc, char **argv)
 {
-	uint32_t devid;
-	struct pci_device *pci_dev;
-	struct ring render_ring = {
-		.name = "render",
-		.mmio = 0x2030,
-	}, bsd_ring = {
-		.name = "bitstream",
-		.mmio = 0x4030,
-	}, bsd6_ring = {
-		.name = "bitstream",
-		.mmio = 0x12030,
-	}, blt_ring = {
-		.name = "blitter",
-		.mmio = 0x22030,
-	};
-	int i, ch;
-	int samples_per_sec = SAMPLES_PER_SEC;
-	FILE *output = NULL;
-	double elapsed_time=0;
-	int print_headers=1;
-	pid_t child_pid=-1;
-	int child_stat;
-	char *cmd=NULL;
-	int interactive=1;
-
-	/* Parse options? */
-	while ((ch = getopt(argc, argv, "s:o:e:h")) != -1) {
+	unsigned int period_us = DEFAULT_PERIOD_MS * 1000;
+	int con_w = -1, con_h = -1;
+	struct engines *engines;
+	unsigned int i;
+	int ret, ch;
+
+	/* Parse options */
+	while ((ch = getopt(argc, argv, "s:h")) != -1) {
 		switch (ch) {
-		case 'e': cmd = strdup(optarg);
-			break;
-		case 's': samples_per_sec = atoi(optarg);
-			if (samples_per_sec < 100) {
-				fprintf(stderr, "Error: samples per second must be >= 100\n");
-				exit(1);
-			}
-			break;
-		case 'o':
-			if (!strcmp(optarg, "-")) {
-				/* Running in non-interactive mode */
-				interactive = 0;
-				output = stdout;
-			}
-			else
-				output = fopen(optarg, "w");
-			if (!output)
-			{
-				perror("fopen");
-				exit(1);
-			}
+		case 's':
+			period_us = atoi(optarg) * 1000;
 			break;
 		case 'h':
 			usage(argv[0]);
 			exit(0);
-			break;
 		default:
-			fprintf(stderr, "Invalid flag %c!\n", (char)optopt);
+			fprintf(stderr, "Invalid option %c!\n", (char)optopt);
 			usage(argv[0]);
 			exit(1);
-			break;
 		}
 	}
 
-	pci_dev = intel_get_pci_device();
-	devid = pci_dev->device_id;
-	intel_mmio_use_pci_bar(pci_dev);
-	init_instdone_definitions(devid);
-
-	/* Do we have a command to run? */
-	if (cmd != NULL) {
-		if (output) {
-			fprintf(output, "# Profiling: %s\n", cmd);
-			fflush(output);
-		}
-		child_pid = fork();
-		if (child_pid < 0) {
-			perror("fork");
-			exit(1);
-		}
-		else if (child_pid == 0) {
-			int res;
-			res = system(cmd);
-			if (res < 0)
-				perror("running command");
-			if (output) {
-				fflush(output);
-				fprintf(output, "# %s exited with status %d\n", cmd, res);
-				fflush(output);
-			}
-			free(cmd);
-			exit(0);
-		} else {
-			free(cmd);
-		}
+	engines = discover_engines();
+	if (!engines) {
+		fprintf(stderr,
+			"Failed to detect engines! (%s)\n(Kernel 4.16 or newer is required for i915 PMU support.)\n",
+			strerror(errno));
+		return 1;
 	}
 
-	for (i = 0; i < num_instdone_bits; i++) {
-		top_bits[i].bit = &instdone_bits[i];
-		top_bits[i].count = 0;
-		top_bits_sorted[i] = &top_bits[i];
+	ret = pmu_init(engines);
+	if (ret) {
+		fprintf(stderr,
+			"Failed to initialize PMU! (%s)\n", strerror(errno));
+		return 1;
 	}
 
-	/* Grab access to the registers */
-	intel_register_access_init(pci_dev, 0, -1);
+	pmu_sample(engines);
 
-	ring_init(&render_ring);
-	if (IS_GEN4(devid) || IS_GEN5(devid))
-		ring_init(&bsd_ring);
-	if (IS_GEN6(devid) || IS_GEN7(devid)) {
-		ring_init(&bsd6_ring);
-		ring_init(&blt_ring);
-	}
+	for (;;) {
+		double t;
+#define BUFSZ 16
+		char freq[BUFSZ];
+		char fact[BUFSZ];
+		char irq[BUFSZ];
+		char rc6[BUFSZ];
+		char power[BUFSZ];
+		char reads[BUFSZ];
+		char writes[BUFSZ];
+		struct winsize ws;
+		int lines = 0;
 
-	/* Initialize GPU stats */
-	if (HAS_STATS_REGS(devid)) {
-		for (i = 0; i < STATS_COUNT; i++) {
-			uint32_t stats_high, stats_low, stats_high_2;
+		/* Update terminal size. */
+		if (ioctl(0, TIOCGWINSZ, &ws) != -1) {
+			con_w = ws.ws_col;
+			con_h = ws.ws_row;
+		}
 
-			do {
-				stats_high = INREG(stats_regs[i] + 4);
-				stats_low = INREG(stats_regs[i]);
-				stats_high_2 = INREG(stats_regs[i] + 4);
-			} while (stats_high != stats_high_2);
+		pmu_sample(engines);
+		t = (double)(engines->ts.cur - engines->ts.prev) / 1e9;
 
-			last_stats[i] = (uint64_t)stats_high << 32 |
-				stats_low;
-		}
-	}
+		printf("\033[H\033[J");
 
-	for (;;) {
-		int j;
-		unsigned long long t1, ti, tf, t2;
-		unsigned long long def_sleep = 1000000 / samples_per_sec;
-		unsigned long long last_samples_per_sec = samples_per_sec;
-		unsigned short int max_lines;
-		struct winsize ws;
-		char clear_screen[] = {0x1b, '[', 'H',
-				       0x1b, '[', 'J',
-				       0x0};
-		int percent;
-		int len;
-
-		t1 = gettime();
-
-		ring_reset(&render_ring);
-		ring_reset(&bsd_ring);
-		ring_reset(&bsd6_ring);
-		ring_reset(&blt_ring);
-
-		for (i = 0; i < samples_per_sec; i++) {
-			long long interval;
-			ti = gettime();
-			if (IS_965(devid)) {
-				instdone = INREG(INSTDONE_I965);
-				instdone1 = INREG(INSTDONE_1);
-			} else
-				instdone = INREG(INSTDONE);
-
-			for (j = 0; j < num_instdone_bits; j++)
-				update_idle_bit(&top_bits[j]);
-
-			ring_sample(&render_ring);
-			ring_sample(&bsd_ring);
-			ring_sample(&bsd6_ring);
-			ring_sample(&blt_ring);
-
-			tf = gettime();
-			if (tf - t1 >= 1000000) {
-				/* We are out of sync, bail out */
-				last_samples_per_sec = i+1;
-				break;
-			}
-			interval = def_sleep - (tf - ti);
-			if (interval > 0)
-				usleep(interval);
-		}
+		pmu_calc(&engines->freq_req, freq, BUFSZ, 4, 0, 1.0, t, 1);
+		pmu_calc(&engines->freq_act, fact, BUFSZ, 4, 0, 1.0, t, 1);
+		pmu_calc(&engines->irq, irq, BUFSZ, 8, 0, 1.0, t, 1);
+		pmu_calc(&engines->rc6, rc6, BUFSZ, 3, 0, 1e9, t, 100);
+		pmu_calc(&engines->rapl, power, BUFSZ, 4, 2, 1.0, t,
+			 engines->rapl_scale);
+		pmu_calc(&engines->imc_reads, reads, BUFSZ, 6, 0, 1.0, t,
+			 engines->imc_reads_scale);
+		pmu_calc(&engines->imc_writes, writes, BUFSZ, 6, 0, 1.0, t,
+			 engines->imc_writes_scale);
 
-		if (HAS_STATS_REGS(devid)) {
-			for (i = 0; i < STATS_COUNT; i++) {
-				uint32_t stats_high, stats_low, stats_high_2;
+		if (lines++ < con_h)
+			printf("intel-gpu-top - %s/%s MHz;  %s%% RC6; %s %s; %s irqs/s\n",
+			       fact, freq, rc6, power, engines->rapl_unit, irq);
 
-				do {
-					stats_high = INREG(stats_regs[i] + 4);
-					stats_low = INREG(stats_regs[i]);
-					stats_high_2 = INREG(stats_regs[i] + 4);
-				} while (stats_high != stats_high_2);
+		if (lines++ < con_h)
+			printf("\n");
 
-				stats[i] = (uint64_t)stats_high << 32 |
-					stats_low;
-			}
-		}
+		if (engines->imc_fd) {
+			if (lines++ < con_h)
+				printf("      IMC reads:   %s %s/s\n",
+				       reads, engines->imc_reads_unit);
+
+			if (lines++ < con_h)
+				printf("     IMC writes:   %s %s/s\n",
+				       writes, engines->imc_writes_unit);
 
-		qsort(top_bits_sorted, num_instdone_bits,
-		      sizeof(struct top_bit *), top_bits_sort);
-
-		/* Limit the number of lines printed to the terminal height so the
-		 * most important info (at the top) will stay on screen. */
-		max_lines = -1;
-		if (ioctl(0, TIOCGWINSZ, &ws) != -1)
-			max_lines = ws.ws_row - 6; /* exclude header lines */
-		if (max_lines >= num_instdone_bits)
-			max_lines = num_instdone_bits;
-
-		t2 = gettime();
-		elapsed_time += (t2 - t1) / 1000000.0;
-
-		if (interactive) {
-			printf("%s", clear_screen);
-			print_clock_info(pci_dev);
-
-			ring_print(&render_ring, last_samples_per_sec);
-			ring_print(&bsd_ring, last_samples_per_sec);
-			ring_print(&bsd6_ring, last_samples_per_sec);
-			ring_print(&blt_ring, last_samples_per_sec);
-
-			printf("\n%30s  %s\n", "task", "percent busy");
-			for (i = 0; i < max_lines; i++) {
-				if (top_bits_sorted[i]->count > 0) {
-					percent = (top_bits_sorted[i]->count * 100) /
-						last_samples_per_sec;
-					len = printf("%30s: %3d%%: ",
-							 top_bits_sorted[i]->bit->name,
-							 percent);
-					print_percentage_bar (percent, len);
-				} else {
-					printf("%*s", PERCENTAGE_BAR_END, "");
-				}
-
-				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-					printf("%13s: %llu (%lld/sec)",
-						   stats_reg_names[i],
-						   (long long)stats[i],
-						   (long long)(stats[i] - last_stats[i]));
-					last_stats[i] = stats[i];
-				} else {
-					if (!top_bits_sorted[i]->count)
-						break;
-				}
+			if (++lines < con_h)
 				printf("\n");
-			}
 		}
-		if (output) {
-			/* Print headers for columns at first run */
-			if (print_headers) {
-				fprintf(output, "# time\t");
-				ring_print_header(output, &render_ring);
-				ring_print_header(output, &bsd_ring);
-				ring_print_header(output, &bsd6_ring);
-				ring_print_header(output, &blt_ring);
-				for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
-					if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-						fprintf(output, "%.6s\t",
-							   stats_reg_names[i]
-							   );
-					}
-					if (!top_bits[i].count)
-						continue;
-				}
-				fprintf(output, "\n");
-				print_headers = 0;
-			}
 
-			/* Print statistics */
-			fprintf(output, "%.2f\t", elapsed_time);
-			ring_log(&render_ring, last_samples_per_sec, output);
-			ring_log(&bsd_ring, last_samples_per_sec, output);
-			ring_log(&bsd6_ring, last_samples_per_sec, output);
-			ring_log(&blt_ring, last_samples_per_sec, output);
-
-			for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
-				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-					fprintf(output, "%"PRIu64"\t",
-						   stats[i] - last_stats[i]);
-					last_stats[i] = stats[i];
-				}
-					if (!top_bits[i].count)
-						continue;
-			}
-			fprintf(output, "\n");
-			fflush(output);
-		}
+		for (i = 0; i < engines->num_engines; i++) {
+			struct engine *engine = engine_ptr(engines, i);
 
-		for (i = 0; i < num_instdone_bits; i++) {
-			top_bits_sorted[i]->count = 0;
+			if (engine->num_counters && lines < con_h) {
+				const char *a = "          ENGINE      BUSY ";
+				const char *b = " MI_SEMA MI_WAIT";
 
-			if (i < STATS_COUNT)
-				last_stats[i] = stats[i];
+				printf("\033[7m%s%*s%s\033[0m\n",
+				       a,
+				       (int)(con_w - 1 - strlen(a) - strlen(b)),
+				       " ", b);
+				lines++;
+				break;
+			}
 		}
 
-		/* Check if child has gone */
-		if (child_pid > 0) {
-			int res;
-			if ((res = waitpid(child_pid, &child_stat, WNOHANG)) == -1) {
-				perror("waitpid");
-				exit(1);
-			}
-			if (res == 0)
+		for (i = 0; i < engines->num_engines && lines < con_h; i++) {
+			struct engine *engine = engine_ptr(engines, i);
+			unsigned int max_w = con_w - 1;
+			unsigned int len;
+			char sema[BUFSZ];
+			char wait[BUFSZ];
+			char busy[BUFSZ];
+			char buf[128];
+			double val;
+
+			if (!engine->num_counters)
 				continue;
-			if (WIFEXITED(child_stat))
-				break;
+
+			pmu_calc(&engine->sema, sema, BUFSZ, 3, 0, 1e9, t, 100);
+			pmu_calc(&engine->wait, wait, BUFSZ, 3, 0, 1e9, t, 100);
+			len = snprintf(buf, sizeof(buf), "    %s%%    %s%%",
+				       sema, wait);
+
+			pmu_calc(&engine->busy, busy, BUFSZ, 6, 2, 1e9, t,
+				 100);
+			len += printf("%16s %s%% ", engine->display_name, busy);
+
+			val = __pmu_calc(&engine->busy.val, 1e9, t, 100);
+			print_percentage_bar(val, max_w - len);
+
+			printf("%s\n", buf);
+
+			lines++;
 		}
-	}
 
-	fclose(output);
+		if (lines++ < con_h)
+			printf("\n");
+
+		usleep(period_us);
+	}
 
-	intel_register_access_fini();
 	return 0;
 }
diff --git a/tools/meson.build b/tools/meson.build
index bd2d313d5156..a918eeb0bef1 100644
--- a/tools/meson.build
+++ b/tools/meson.build
@@ -23,7 +23,6 @@ tools_progs = [
 	'intel_gpu_frequency',
 	'intel_firmware_decode',
 	'intel_gpu_time',
-	'intel_gpu_top',
 	'intel_gtt',
 	'intel_guc_logger',
 	'intel_infoframes',
@@ -117,6 +116,11 @@ shared_library('intel_aubdump', 'aubdump.c',
 	       name_prefix : '',
 	       install : true)
 
+executable('intel_gpu_top', 'intel_gpu_top.c',
+	   install : true,
+	   install_rpath : rpathdir,
+	   dependencies : tool_deps + [ lib_igt_perf ])
+
 conf_data = configuration_data()
 conf_data.set('prefix', prefix)
 conf_data.set('exec_prefix', '${prefix}')
-- 
2.14.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [igt-dev] [PATCH i-g-t v6] intel-gpu-top: Rewrite the tool to be safe to use
@ 2018-04-04 15:26     ` Tvrtko Ursulin
  0 siblings, 0 replies; 57+ messages in thread
From: Tvrtko Ursulin @ 2018-04-04 15:26 UTC (permalink / raw)
  To: igt-dev; +Cc: Tvrtko Ursulin, Eero Tamminen, Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

intel-gpu-top is a dangerous tool which can hang machines due to unsafe mmio
register access. This patch rewrites it to use only PMU.

Only overall command streamer busyness and GPU global data such as power
and frequencies are included in this new version.

For access to more GPU functional unit level data, an OA metric based tool
like gpu-top should be used instead.

v2:
 * Sort engines by class and instance.
 * Do not wait for one sampling period to display something on screen.
 * Move code out of the asserts. (Rinat Ibragimov)
 * Continuously adapt to terminal size. (Rinat Ibragimov)

v3:
 * Change layout and precision of some fields. (Chris Wilson)
 Eero Tamminen:
 * Use more user friendly engine names.
 * Don't error out if a counter is missing.
 * Add IMC read/write bandwidth.
 * Report minimum required kernel version.

v4:
 * Really support 4.16 by skipping missing engines.
 * Simpler and less hacky float printing.
 * Preserve copyright header. (Antonio Argenziano)
 * Simplify engines_ptr macro. (Rinat Ibragimov)

v5:
 * Get RAPL unit from sysfs.
 * Consolidate sysfs paths with a macro.
 * Tidy error handling by carrying over and reporting errno.
 * Check against console height on all prints.
 * More readable minimum kernel version message. (Eero Tamminen)
 * Column banner for per engine stats. (Eero Tamminen)

v6:
 * Man page update. (Eero Tamminen)

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: Petri Latvala <petri.latvala@intel.com>
Cc: Eero Tamminen <eero.t.tamminen@intel.com>
Cc: Rinat Ibragimov <ibragimovrinat@mail.ru>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> # v1
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> # v0.5
---
 lib/igt_perf.c        |    6 +
 lib/igt_perf.h        |    1 +
 man/intel_gpu_top.rst |   41 +-
 tools/Makefile.am     |    2 +
 tools/intel_gpu_top.c | 1250 +++++++++++++++++++++++++++----------------------
 tools/meson.build     |    6 +-
 6 files changed, 719 insertions(+), 587 deletions(-)

diff --git a/lib/igt_perf.c b/lib/igt_perf.c
index 99d82ea51c9b..e3dec2cc29c7 100644
--- a/lib/igt_perf.c
+++ b/lib/igt_perf.c
@@ -69,3 +69,9 @@ int igt_perf_open(uint64_t type, uint64_t config)
 	return _perf_open(type, config, -1,
 			  PERF_FORMAT_TOTAL_TIME_ENABLED);
 }
+
+int igt_perf_open_group(uint64_t type, uint64_t config, int group)
+{
+	return _perf_open(type, config, group,
+			  PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_GROUP);
+}
diff --git a/lib/igt_perf.h b/lib/igt_perf.h
index 614ea5d23fa6..e00718f4769a 100644
--- a/lib/igt_perf.h
+++ b/lib/igt_perf.h
@@ -55,5 +55,6 @@ uint64_t i915_type_id(void);
 int perf_i915_open(uint64_t config);
 int perf_i915_open_group(uint64_t config, int group);
 int igt_perf_open(uint64_t type, uint64_t config);
+int igt_perf_open_group(uint64_t type, uint64_t config, int group);
 
 #endif /* I915_PERF_H */
diff --git a/man/intel_gpu_top.rst b/man/intel_gpu_top.rst
index a5f7175bb1a0..19c712307d28 100644
--- a/man/intel_gpu_top.rst
+++ b/man/intel_gpu_top.rst
@@ -7,9 +7,9 @@ Display a top-like summary of Intel GPU usage
 ---------------------------------------------
 .. include:: defs.rst
 :Author: IGT Developers <igt-dev@lists.freedesktop.org>
-:Date: 2016-03-01
+:Date: 2018-04-04
 :Version: |PACKAGE_STRING|
-:Copyright: 2009,2011,2012,2016 Intel Corporation
+:Copyright: 2009,2011,2012,2016,2018 Intel Corporation
 :Manual section: |MANUAL_SECTION|
 :Manual group: |MANUAL_GROUP|
 
@@ -21,42 +21,25 @@ SYNOPSIS
 DESCRIPTION
 ===========
 
-**intel_gpu_top** is a tool to display usage information of an Intel GPU. It
-requires root privilege to map the graphics device.
+**intel_gpu_top** is a tool to display usage information on Intel GPUs.
+
+The tool gathers data using perf performance counters (PMU) exposed by i915 and other platform drivers like RAPL (power) and Uncore IMC (memory bandwidth).
 
 OPTIONS
 =======
 
--s SAMPLES
-    Number of samples to acquire per second.
-
--o FILE
-    Collect usage statistics to FILE. If file is "-", run non-interactively
-    and output statistics to stdout.
-
--e COMMAND
-    Execute COMMAND to profile, and leave when it is finished. Note that the
-    entire command with all parameters should be included as one parameter.
+-s <ms>
+    Refresh period in milliseconds.
 
 -h
-    Show usage notes.
+    Show help text.
 
-EXAMPLES
-========
-
-intel_gpu_top -o "cairo-trace-gvim.log" -s 100 -e "cairo-perf-trace /tmp/gvim"
-    Run cairo-perf-trace with /tmp/gvim trace, non-interactively, saving the
-    statistics into cairo-trace-gvim.log file, and collecting 100 samples per
-    second.
-
-Note that idle units are not displayed, so an entirely idle GPU will only
-display the ring status and header.
+LIMITATIONS
+===========
 
-BUGS
-====
+* Not all metrics are supported on all platforms. Where a metric is unsupported its value will be replaced by a dashed line.
 
-Some GPUs report some units as busy when they aren't, such that even when idle
-and not hung, it will show up as 100% busy.
+* Non-root access to perf counters is controlled by the *perf_event_paranoid* sysctl.
 
 REPORTING BUGS
 ==============
diff --git a/tools/Makefile.am b/tools/Makefile.am
index 09b6dbcc3ece..a0b016ddd7ff 100644
--- a/tools/Makefile.am
+++ b/tools/Makefile.am
@@ -28,6 +28,8 @@ intel_aubdump_la_LDFLAGS = -module -avoid-version -no-undefined
 intel_aubdump_la_SOURCES = aubdump.c
 intel_aubdump_la_LIBADD = $(top_builddir)/lib/libintel_tools.la -ldl
 
+intel_gpu_top_LDADD = $(top_builddir)/lib/libigt_perf.la
+
 bin_SCRIPTS = intel_aubdump
 CLEANFILES = $(bin_SCRIPTS)
 
diff --git a/tools/intel_gpu_top.c b/tools/intel_gpu_top.c
index 098e6ce3ff86..b923c3cfbe97 100644
--- a/tools/intel_gpu_top.c
+++ b/tools/intel_gpu_top.c
@@ -1,6 +1,5 @@
 /*
- * Copyright © 2007 Intel Corporation
- * Copyright © 2011 Intel Corporation
+ * Copyright © 2007-2018 Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -24,695 +23,832 @@
  * Authors:
  *    Eric Anholt <eric@anholt.net>
  *    Eugeni Dodonov <eugeni.dodonov@intel.com>
- *
  */
 
-#include "config.h"
-
-#include <inttypes.h>
-#include <unistd.h>
-#include <stdlib.h>
 #include <stdio.h>
-#include <err.h>
-#include <sys/ioctl.h>
-#include <sys/time.h>
-#include <sys/wait.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <stdint.h>
+#include <assert.h>
 #include <string.h>
-#ifdef HAVE_TERMIOS_H
-#include <termios.h>
-#endif
-#include "intel_io.h"
-#include "instdone.h"
-#include "intel_reg.h"
-#include "intel_chipset.h"
-#include "drmtest.h"
-
-#define  FORCEWAKE	    0xA18C
-#define  FORCEWAKE_ACK	    0x130090
-
-#define SAMPLES_PER_SEC             10000
-#define SAMPLES_TO_PERCENT_RATIO    (SAMPLES_PER_SEC / 100)
-
-#define MAX_NUM_TOP_BITS            100
-
-#define HAS_STATS_REGS(devid)		IS_965(devid)
-
-struct top_bit {
-	struct instdone_bit *bit;
-	int count;
-} top_bits[MAX_NUM_TOP_BITS];
-struct top_bit *top_bits_sorted[MAX_NUM_TOP_BITS];
-
-static uint32_t instdone, instdone1;
-
-static const char *bars[] = {
-	" ",
-	"▏",
-	"▎",
-	"▍",
-	"▌",
-	"▋",
-	"▊",
-	"▉",
-	"█"
-};
+#include <ctype.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <sys/ioctl.h>
+#include <errno.h>
+#include <math.h>
+#include <locale.h>
+
+#include "igt_perf.h"
 
-enum stats_counts {
-	IA_VERTICES,
-	IA_PRIMITIVES,
-	VS_INVOCATION,
-	GS_INVOCATION,
-	GS_PRIMITIVES,
-	CL_INVOCATION,
-	CL_PRIMITIVES,
-	PS_INVOCATION,
-	PS_DEPTH,
-	STATS_COUNT
+struct pmu_pair {
+	uint64_t cur;
+	uint64_t prev;
 };
 
-const uint32_t stats_regs[STATS_COUNT] = {
-	IA_VERTICES_COUNT_QW,
-	IA_PRIMITIVES_COUNT_QW,
-	VS_INVOCATION_COUNT_QW,
-	GS_INVOCATION_COUNT_QW,
-	GS_PRIMITIVES_COUNT_QW,
-	CL_INVOCATION_COUNT_QW,
-	CL_PRIMITIVES_COUNT_QW,
-	PS_INVOCATION_COUNT_QW,
-	PS_DEPTH_COUNT_QW,
+struct pmu_counter {
+	bool present;
+	uint64_t config;
+	unsigned int idx;
+	struct pmu_pair val;
 };
 
-const char *stats_reg_names[STATS_COUNT] = {
-	"vert fetch",
-	"prim fetch",
-	"VS invocations",
-	"GS invocations",
-	"GS prims",
-	"CL invocations",
-	"CL prims",
-	"PS invocations",
-	"PS depth pass",
+struct engine {
+	const char *name;
+	const char *display_name;
+
+	unsigned int class;
+	unsigned int instance;
+
+	unsigned int num_counters;
+
+	struct pmu_counter busy;
+	struct pmu_counter wait;
+	struct pmu_counter sema;
 };
 
-uint64_t stats[STATS_COUNT];
-uint64_t last_stats[STATS_COUNT];
+struct engines {
+	unsigned int num_engines;
+	unsigned int num_counters;
+	DIR *root;
+	int fd;
+	struct pmu_pair ts;
+
+	int rapl_fd;
+	double rapl_scale;
+	const char *rapl_unit;
+
+	int imc_fd;
+	double imc_reads_scale;
+	const char *imc_reads_unit;
+	double imc_writes_scale;
+	const char *imc_writes_unit;
+
+	struct pmu_counter freq_req;
+	struct pmu_counter freq_act;
+	struct pmu_counter irq;
+	struct pmu_counter rc6;
+	struct pmu_counter rapl;
+	struct pmu_counter imc_reads;
+	struct pmu_counter imc_writes;
+
+	struct engine engine;
+};
 
-static unsigned long
-gettime(void)
+static uint64_t
+get_pmu_config(int dirfd, const char *name, const char *counter)
 {
-    struct timeval t;
-    gettimeofday(&t, NULL);
-    return (t.tv_usec + (t.tv_sec * 1000000));
-}
+	char buf[128], *p;
+	int fd, ret;
 
-static int
-top_bits_sort(const void *a, const void *b)
-{
-	struct top_bit * const *bit_a = a;
-	struct top_bit * const *bit_b = b;
-	int a_count = (*bit_a)->count;
-	int b_count = (*bit_b)->count;
+	ret = snprintf(buf, sizeof(buf), "%s-%s", name, counter);
+	if (ret < 0 || ret == sizeof(buf))
+		return -1;
 
-	if (a_count < b_count)
-		return 1;
-	else if (a_count == b_count)
-		return 0;
-	else
+	fd = openat(dirfd, buf, O_RDONLY);
+	if (fd < 0)
 		return -1;
-}
 
-static void
-update_idle_bit(struct top_bit *top_bit)
-{
-	uint32_t reg_val;
+	ret = read(fd, buf, sizeof(buf));
+	close(fd);
+	if (ret <= 0)
+		return -1;
 
-	if (top_bit->bit->reg == INSTDONE_1)
-		reg_val = instdone1;
-	else
-		reg_val = instdone;
+	p = index(buf, '0');
+	if (!p)
+		return -1;
 
-	if ((reg_val & top_bit->bit->bit) == 0)
-		top_bit->count++;
+	return strtoul(p, NULL, 0);
 }
 
-static void
-print_clock(const char *name, int clock) {
-	if (clock == -1)
-		printf("%s clock: unknown", name);
+#define engine_ptr(engines, n) (&engines->engine + (n))
+
+static const char *class_display_name(unsigned int class)
+{
+	switch (class) {
+	case I915_ENGINE_CLASS_RENDER:
+		return "Render/3D";
+	case I915_ENGINE_CLASS_COPY:
+		return "Blitter";
+	case I915_ENGINE_CLASS_VIDEO:
+		return "Video";
+	case I915_ENGINE_CLASS_VIDEO_ENHANCE:
+		return "VideoEnhance";
+	default:
+		return "[unknown]";
+	}
+}
+
+static int engine_cmp(const void *__a, const void *__b)
+{
+	const struct engine *a = (struct engine *)__a;
+	const struct engine *b = (struct engine *)__b;
+
+	if (a->class != b->class)
+		return a->class - b->class;
 	else
-		printf("%s clock: %d Mhz", name, clock);
+		return a->instance - b->instance;
 }
 
-static int
-print_clock_info(struct pci_device *pci_dev)
+static struct engines *discover_engines(void)
 {
-	uint32_t devid = pci_dev->device_id;
-	uint16_t gcfgc;
+	const char *sysfs_root = "/sys/devices/i915/events";
+	struct engines *engines;
+	struct dirent *dent;
+	int ret = 0;
+	DIR *d;
 
-	if (IS_GM45(devid)) {
-		int core_clock = -1;
+	engines = malloc(sizeof(struct engines));
+	if (!engines)
+		return NULL;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+	memset(engines, 0, sizeof(*engines));
 
-		switch (gcfgc & 0xf) {
-		case 8:
-			core_clock = 266;
-			break;
-		case 9:
-			core_clock = 320;
-			break;
-		case 11:
-			core_clock = 400;
-			break;
-		case 13:
-			core_clock = 533;
-			break;
-		}
-		print_clock("core", core_clock);
-	} else if (IS_965(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, sampler_clock = -1;
+	engines->num_engines = 0;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+	d = opendir(sysfs_root);
+	if (!d)
+		return NULL;
 
-		switch (gcfgc & 0xf) {
-		case 2:
-			render_clock = 250; sampler_clock = 267;
-			break;
-		case 3:
-			render_clock = 320; sampler_clock = 333;
-			break;
-		case 4:
-			render_clock = 400; sampler_clock = 444;
-			break;
-		case 5:
-			render_clock = 500; sampler_clock = 533;
+	while ((dent = readdir(d)) != NULL) {
+		const char *endswith = "-busy";
+		const unsigned int endlen = strlen(endswith);
+		struct engine *engine =
+				engine_ptr(engines, engines->num_engines);
+		char buf[256];
+
+		if (dent->d_type != DT_REG)
+			continue;
+
+		if (strlen(dent->d_name) >= sizeof(buf)) {
+			ret = ENAMETOOLONG;
 			break;
 		}
 
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("sampler", sampler_clock);
-	} else if (IS_945(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, display_clock = -1;
+		strcpy(buf, dent->d_name);
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+		/* xxxN-busy */
+		if (strlen(buf) < (endlen + 4))
+			continue;
+		if (strcmp(&buf[strlen(buf) - endlen], endswith))
+			continue;
 
-		switch (gcfgc & 0x7) {
-		case 0:
-			render_clock = 166;
-			break;
-		case 1:
-			render_clock = 200;
-			break;
-		case 3:
-			render_clock = 250;
-			break;
-		case 5:
-			render_clock = 400;
+		memset(engine, 0, sizeof(*engine));
+
+		buf[strlen(buf) - endlen] = 0;
+		engine->name = strdup(buf);
+		if (!engine->name) {
+			ret = errno;
 			break;
 		}
 
-		switch (gcfgc & 0x70) {
-		case 0:
-			display_clock = 200;
-			break;
-		case 4:
-			display_clock = 320;
+		engine->busy.config = get_pmu_config(dirfd(d), engine->name,
+						     "busy");
+		if (engine->busy.config == -1) {
+			ret = ENOENT;
 			break;
 		}
-		if (gcfgc & (1 << 7))
-		    display_clock = 133;
 
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("display", display_clock);
-	} else if (IS_915(devid) && IS_MOBILE(devid)) {
-		int render_clock = -1, display_clock = -1;
+		engine->class = (engine->busy.config &
+				 (__I915_PMU_OTHER(0) - 1)) >>
+				I915_PMU_CLASS_SHIFT;
 
-		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
+		engine->instance = (engine->busy.config >>
+				    I915_PMU_SAMPLE_BITS) &
+				    ((1 << I915_PMU_SAMPLE_INSTANCE_BITS) - 1);
 
-		switch (gcfgc & 0x7) {
-		case 0:
-			render_clock = 160;
-			break;
-		case 1:
-			render_clock = 190;
-			break;
-		case 4:
-			render_clock = 333;
+		ret = snprintf(buf, sizeof(buf), "%s/%u",
+			       class_display_name(engine->class),
+			       engine->instance);
+		if (ret < 0 || ret == sizeof(buf)) {
+			ret = ENOBUFS;
 			break;
 		}
-		if (gcfgc & (1 << 13))
-		    render_clock = 133;
+		ret = 0;
 
-		switch (gcfgc & 0x70) {
-		case 0:
-			display_clock = 190;
+		engine->display_name = strdup(buf);
+		if (!engine->display_name) {
+			ret = errno;
 			break;
-		case 4:
-			display_clock = 333;
+		}
+
+		engines->num_engines++;
+		engines = realloc(engines, sizeof(struct engines) +
+				  engines->num_engines * sizeof(struct engine));
+		if (!engines) {
+			ret = errno;
 			break;
 		}
-		if (gcfgc & (1 << 7))
-		    display_clock = 133;
+	}
+
+	if (ret) {
+		free(engines);
+		errno = ret;
 
-		print_clock("render", render_clock);
-		printf("  ");
-		print_clock("display", display_clock);
+		return NULL;
 	}
 
+	qsort(engine_ptr(engines, 0), engines->num_engines,
+	      sizeof(struct engine), engine_cmp);
+
+	engines->root = d;
 
-	printf("\n");
-	return -1;
+	return engines;
 }
 
-#define STATS_LEN (20)
-#define PERCENTAGE_BAR_END	(79 - STATS_LEN)
+static int
+filename_to_buf(const char *filename, char *buf, unsigned int bufsize)
+{
+	int fd, err;
+	ssize_t ret;
 
-static void
-print_percentage_bar(float percent, int cur_line_len)
+	fd = open(filename, O_RDONLY);
+	if (fd < 0)
+		return -1;
+
+	ret = read(fd, buf, bufsize - 1);
+	err = errno;
+	close(fd);
+	if (ret < 1) {
+		errno = ret < 0 ? err : ENOMSG;
+
+		return -1;
+	}
+
+	if (ret > 1 && buf[ret - 1] == '\n')
+		buf[ret - 1] = '\0';
+	else
+		buf[ret] = '\0';
+
+	return 0;
+}
+
+static uint64_t filename_to_u64(const char *filename, int base)
 {
-	int bar_avail_len = (PERCENTAGE_BAR_END - cur_line_len - 1) * 8;
-	int bar_len = bar_avail_len * (percent + .5) / 100.0;
-	int i;
+	char buf[64], *b;
 
-	for (i = bar_len; i >= 8; i -= 8) {
-		printf("%s", bars[8]);
-		cur_line_len++;
+	if (filename_to_buf(filename, buf, sizeof(buf)))
+		return 0;
+
+	/*
+	 * Handle both single integer and key=value formats by skipping
+	 * leading non-digits.
+	 */
+	b = buf;
+	while (*b && !isdigit(*b))
+		b++;
+
+	return strtoull(b, NULL, base);
+}
+
+static double filename_to_double(const char *filename)
+{
+	char *oldlocale;
+	char buf[80];
+	double v;
+
+	if (filename_to_buf(filename, buf, sizeof(buf)))
+		return 0;
+
+	oldlocale = setlocale(LC_ALL, "C");
+	v = strtod(buf, NULL);
+	setlocale(LC_ALL, oldlocale);
+
+	return v;
+}
+
+#define RAPL_ROOT "/sys/devices/power/"
+#define RAPL_EVENT "/sys/devices/power/events/"
+
+static uint64_t rapl_type_id(void)
+{
+	return filename_to_u64(RAPL_ROOT "type", 10);
+}
+
+static uint64_t rapl_gpu_power(void)
+{
+	return filename_to_u64(RAPL_EVENT "energy-gpu", 0);
+}
+
+static double rapl_gpu_power_scale(void)
+{
+	return filename_to_double(RAPL_EVENT "energy-gpu.scale");
+}
+
+static const char *rapl_gpu_power_unit(void)
+{
+	char buf[32];
+
+	if (filename_to_buf(RAPL_EVENT "energy-gpu.unit",
+			    buf, sizeof(buf)) == 0)
+		if (!strcmp(buf, "Joules"))
+			return strdup("Watts");
+		else
+			return strdup(buf);
+	else
+		return NULL;
+}
+
+#define IMC_ROOT "/sys/devices/uncore_imc/"
+#define IMC_EVENT "/sys/devices/uncore_imc/events/"
+
+static uint64_t imc_type_id(void)
+{
+	return filename_to_u64(IMC_ROOT "type", 10);
+}
+
+static uint64_t imc_data_reads(void)
+{
+	return filename_to_u64(IMC_EVENT "data_reads", 0);
+}
+
+static double imc_data_reads_scale(void)
+{
+	return filename_to_double(IMC_EVENT "data_reads.scale");
+}
+
+static const char *imc_data_reads_unit(void)
+{
+	char buf[32];
+
+	if (filename_to_buf(IMC_EVENT "data_reads.unit", buf, sizeof(buf)) == 0)
+		return strdup(buf);
+	else
+		return NULL;
+}
+
+static uint64_t imc_data_writes(void)
+{
+	return filename_to_u64(IMC_EVENT "data_writes", 0);
+}
+
+static double imc_data_writes_scale(void)
+{
+	return filename_to_double(IMC_EVENT "data_writes.scale");
+}
+
+static const char *imc_data_writes_unit(void)
+{
+	char buf[32];
+
+	if (filename_to_buf(IMC_EVENT "data_writes.unit",
+			    buf, sizeof(buf)) == 0)
+		return strdup(buf);
+	else
+		return NULL;
+}
+
+#define _open_pmu(cnt, pmu, fd) \
+({ \
+	int fd__; \
+\
+	fd__ = perf_i915_open_group((pmu)->config, (fd)); \
+	if (fd__ >= 0) { \
+		if ((fd) == -1) \
+			(fd) = fd__; \
+		(pmu)->present = true; \
+		(pmu)->idx = (cnt)++; \
+	} \
+\
+	fd__; \
+})
+
+#define _open_imc(cnt, pmu, fd) \
+({ \
+	int fd__; \
+\
+	fd__ = igt_perf_open_group(imc_type_id(), (pmu)->config, (fd)); \
+	if (fd__ >= 0) { \
+		if ((fd) == -1) \
+			(fd) = fd__; \
+		(pmu)->present = true; \
+		(pmu)->idx = (cnt)++; \
+	} \
+\
+	fd__; \
+})
+
+static int pmu_init(struct engines *engines)
+{
+	unsigned int i;
+	int fd;
+
+	engines->fd = -1;
+	engines->num_counters = 0;
+
+	engines->irq.config = I915_PMU_INTERRUPTS;
+	fd = _open_pmu(engines->num_counters, &engines->irq, engines->fd);
+	if (fd < 0)
+		return -1;
+
+	engines->freq_req.config = I915_PMU_REQUESTED_FREQUENCY;
+	_open_pmu(engines->num_counters, &engines->freq_req, engines->fd);
+
+	engines->freq_act.config = I915_PMU_ACTUAL_FREQUENCY;
+	_open_pmu(engines->num_counters, &engines->freq_act, engines->fd);
+
+	engines->rc6.config = I915_PMU_RC6_RESIDENCY;
+	_open_pmu(engines->num_counters, &engines->rc6, engines->fd);
+
+	for (i = 0; i < engines->num_engines; i++) {
+		struct engine *engine = engine_ptr(engines, i);
+		struct {
+			struct pmu_counter *pmu;
+			const char *counter;
+		} *cnt, counters[] = {
+			{ .pmu = &engine->busy, .counter = "busy" },
+			{ .pmu = &engine->wait, .counter = "wait" },
+			{ .pmu = &engine->sema, .counter = "sema" },
+			{ .pmu = NULL, .counter = NULL },
+		};
+
+		for (cnt = counters; cnt->pmu; cnt++) {
+			if (!cnt->pmu->config)
+				cnt->pmu->config =
+					get_pmu_config(dirfd(engines->root),
+						       engine->name,
+						       cnt->counter);
+			fd = _open_pmu(engines->num_counters, cnt->pmu,
+				       engines->fd);
+			if (fd >= 0)
+				engine->num_counters++;
+		}
 	}
-	if (i) {
-		printf("%s", bars[i]);
-		cur_line_len++;
+
+	engines->rapl_fd = -1;
+	if (rapl_type_id()) {
+		engines->rapl_scale = rapl_gpu_power_scale();
+		engines->rapl_unit = rapl_gpu_power_unit();
+		if (!engines->rapl_unit)
+			return -1;
+
+		engines->rapl.config = rapl_gpu_power();
+		if (!engines->rapl.config)
+			return -1;
+
+		engines->rapl_fd = igt_perf_open(rapl_type_id(),
+						 engines->rapl.config);
+		if (engines->rapl_fd < 0)
+			return -1;
+
+		engines->rapl.present = true;
 	}
 
-	/* NB: We can't use a field width with utf8 so we manually
-	* guarantee a field with of 45 chars for any bar. */
-	printf("%*s", PERCENTAGE_BAR_END - cur_line_len, "");
-}
+	engines->imc_fd = -1;
+	if (imc_type_id()) {
+		unsigned int num = 0;
 
-struct ring {
-	const char *name;
-	uint32_t mmio;
-	int head, tail, size;
-	uint64_t full;
-	int idle;
-};
+		engines->imc_reads_scale = imc_data_reads_scale();
+		engines->imc_writes_scale = imc_data_writes_scale();
+
+		engines->imc_reads_unit = imc_data_reads_unit();
+		if (!engines->imc_reads_unit)
+			return -1;
+
+		engines->imc_writes_unit = imc_data_writes_unit();
+		if (!engines->imc_writes_unit)
+			return -1;
+
+		engines->imc_reads.config = imc_data_reads();
+		if (!engines->imc_reads.config)
+			return -1;
+
+		engines->imc_writes.config = imc_data_writes();
+		if (!engines->imc_writes.config)
+			return -1;
+
+		fd = _open_imc(num, &engines->imc_reads, engines->imc_fd);
+		if (fd < 0)
+			return -1;
+		fd = _open_imc(num, &engines->imc_writes, engines->imc_fd);
+		if (fd < 0)
+			return -1;
+
+		engines->imc_reads.present = true;
+		engines->imc_writes.present = true;
+	}
+
+	return 0;
+}
 
-static uint32_t ring_read(struct ring *ring, uint32_t reg)
+static uint64_t pmu_read_multi(int fd, unsigned int num, uint64_t *val)
 {
-	return INREG(ring->mmio + reg);
+	uint64_t buf[2 + num];
+	unsigned int i;
+	ssize_t len;
+
+	memset(buf, 0, sizeof(buf));
+
+	len = read(fd, buf, sizeof(buf));
+	assert(len == sizeof(buf));
+
+	for (i = 0; i < num; i++)
+		val[i] = buf[2 + i];
+
+	return buf[1];
 }
 
-static void ring_init(struct ring *ring)
+static double __pmu_calc(struct pmu_pair *p, double d, double t, double s)
 {
-	ring->size = (((ring_read(ring, RING_LEN) & RING_NR_PAGES) >> 12) + 1) * 4096;
+	double v;
+
+	v = p->cur - p->prev;
+	v /= d;
+	v /= t;
+	v *= s;
+
+	if (s == 100.0 && v > 100.0)
+		v = 100.0;
+
+	return v;
 }
 
-static void ring_reset(struct ring *ring)
+static void fill_str(char *buf, unsigned int bufsz, char c, unsigned int num)
 {
-	ring->idle = ring->full = 0;
+	unsigned int i;
+
+	for (i = 0; i < num && i < (bufsz - 1); i++)
+		*buf++ = c;
+
+	*buf = 0;
 }
 
-static void ring_sample(struct ring *ring)
+static void pmu_calc(struct pmu_counter *cnt,
+		     char *buf, unsigned int bufsz,
+		     unsigned int width, unsigned width_dec,
+		     double d, double t, double s)
 {
-	int full;
+	double val;
+	int len;
+
+	assert(bufsz >= (width + width_dec + 1));
+
+	if (!cnt->present) {
+		fill_str(buf, bufsz, '-', width + width_dec);
+		return;
+	}
 
-	if (!ring->size)
+	val = __pmu_calc(&cnt->val, d, t, s);
+
+	len = snprintf(buf, bufsz, "%*.*f", width + width_dec, width_dec, val);
+	if (len < 0 || len == bufsz) {
+		fill_str(buf, bufsz, 'X', width + width_dec);
 		return;
+	}
+}
+
+static uint64_t __pmu_read_single(int fd, uint64_t *ts)
+{
+	uint64_t data[2] = { };
+	ssize_t len;
 
-	ring->head = ring_read(ring, RING_HEAD) & HEAD_ADDR;
-	ring->tail = ring_read(ring, RING_TAIL) & TAIL_ADDR;
+	len = read(fd, data, sizeof(data));
+	assert(len == sizeof(data));
 
-	if (ring->tail == ring->head)
-		ring->idle++;
+	if (ts)
+		*ts = data[1];
 
-	full = ring->tail - ring->head;
-	if (full < 0)
-		full += ring->size;
-	ring->full += full;
+	return data[0];
 }
 
-static void ring_print_header(FILE *out, struct ring *ring)
+static uint64_t pmu_read_single(int fd)
 {
-    fprintf(out, "%.6s%%\tops\t",
-            ring->name
-          );
+	return __pmu_read_single(fd, NULL);
 }
 
-static void ring_print(struct ring *ring, unsigned long samples_per_sec)
+static void __update_sample(struct pmu_counter *counter, uint64_t val)
 {
-	int percent_busy, len;
+	counter->val.prev = counter->val.cur;
+	counter->val.cur = val;
+}
 
-	if (!ring->size)
-		return;
+static void update_sample(struct pmu_counter *counter, uint64_t *val)
+{
+	if (counter->present)
+		__update_sample(counter, val[counter->idx]);
+}
+
+static void pmu_sample(struct engines *engines)
+{
+	const int num_val = engines->num_counters;
+	uint64_t val[2 + num_val];
+	unsigned int i;
+
+	engines->ts.prev = engines->ts.cur;
+
+	if (engines->rapl_fd >= 0)
+		__update_sample(&engines->rapl,
+				pmu_read_single(engines->rapl_fd));
+
+	if (engines->imc_fd >= 0) {
+		pmu_read_multi(engines->imc_fd, 2, val);
+		update_sample(&engines->imc_reads, val);
+		update_sample(&engines->imc_writes, val);
+	}
 
-	percent_busy = 100 - 100 * ring->idle / samples_per_sec;
+	engines->ts.cur = pmu_read_multi(engines->fd, num_val, val);
 
-	len = printf("%25s busy: %3d%%: ", ring->name, percent_busy);
-	print_percentage_bar (percent_busy, len);
-	printf("%24s space: %d/%d\n",
-		   ring->name,
-		   (int)(ring->full / samples_per_sec),
-		   ring->size);
+	update_sample(&engines->freq_req, val);
+	update_sample(&engines->freq_act, val);
+	update_sample(&engines->irq, val);
+	update_sample(&engines->rc6, val);
+
+	for (i = 0; i < engines->num_engines; i++) {
+		struct engine *engine = engine_ptr(engines, i);
+
+		update_sample(&engine->busy, val);
+		update_sample(&engine->sema, val);
+		update_sample(&engine->wait, val);
+	}
 }
 
-static void ring_log(struct ring *ring, unsigned long samples_per_sec,
-		FILE *output)
+static const char *bars[] = { " ", "▏", "▎", "▍", "▌", "▋", "▊", "▉", "█" };
+
+static void
+print_percentage_bar(double percent, int max_len)
 {
-	if (ring->size)
-		fprintf(output, "%3d\t%d\t",
-			(int)(100 - 100 * ring->idle / samples_per_sec),
-			(int)(ring->full / samples_per_sec));
-	else
-		fprintf(output, "-1\t-1\t");
+	int bar_len = percent * (8 * (max_len - 2)) / 100.0;
+	int i;
+
+	putchar('|');
+
+	for (i = bar_len; i >= 8; i -= 8)
+		printf("%s", bars[8]);
+	if (i)
+		printf("%s", bars[i]);
+
+	for (i = 0; i < (max_len - 2 - (bar_len + 7) / 8); i++)
+		putchar(' ');
+
+	putchar('|');
 }
 
+#define DEFAULT_PERIOD_MS (1000)
+
 static void
 usage(const char *appname)
 {
 	printf("intel_gpu_top - Display a top-like summary of Intel GPU usage\n"
-			"\n"
-			"usage: %s [parameters]\n"
-			"\n"
-			"The following parameters apply:\n"
-			"[-s <samples>]       samples per seconds (default %d)\n"
-			"[-e <command>]       command to profile\n"
-			"[-o <file>]          output statistics to file. If file is '-',"
-			"                     run in batch mode and output statistics to stdio only \n"
-			"[-h]                 show this help screen\n"
-			"\n",
-			appname,
-			SAMPLES_PER_SEC
-		  );
-	return;
+		"\n"
+		"Usage: %s [parameters]\n"
+		"\n"
+		"\tThe following parameters are optional:\n\n"
+		"\t[-s <ms>]       Refresh period in milliseconds (default %ums).\n"
+		"\t[-h]            Show this help text.\n"
+		"\n",
+		appname, DEFAULT_PERIOD_MS);
 }
 
 int main(int argc, char **argv)
 {
-	uint32_t devid;
-	struct pci_device *pci_dev;
-	struct ring render_ring = {
-		.name = "render",
-		.mmio = 0x2030,
-	}, bsd_ring = {
-		.name = "bitstream",
-		.mmio = 0x4030,
-	}, bsd6_ring = {
-		.name = "bitstream",
-		.mmio = 0x12030,
-	}, blt_ring = {
-		.name = "blitter",
-		.mmio = 0x22030,
-	};
-	int i, ch;
-	int samples_per_sec = SAMPLES_PER_SEC;
-	FILE *output = NULL;
-	double elapsed_time=0;
-	int print_headers=1;
-	pid_t child_pid=-1;
-	int child_stat;
-	char *cmd=NULL;
-	int interactive=1;
-
-	/* Parse options? */
-	while ((ch = getopt(argc, argv, "s:o:e:h")) != -1) {
+	unsigned int period_us = DEFAULT_PERIOD_MS * 1000;
+	int con_w = -1, con_h = -1;
+	struct engines *engines;
+	unsigned int i;
+	int ret, ch;
+
+	/* Parse options */
+	while ((ch = getopt(argc, argv, "s:h")) != -1) {
 		switch (ch) {
-		case 'e': cmd = strdup(optarg);
-			break;
-		case 's': samples_per_sec = atoi(optarg);
-			if (samples_per_sec < 100) {
-				fprintf(stderr, "Error: samples per second must be >= 100\n");
-				exit(1);
-			}
-			break;
-		case 'o':
-			if (!strcmp(optarg, "-")) {
-				/* Running in non-interactive mode */
-				interactive = 0;
-				output = stdout;
-			}
-			else
-				output = fopen(optarg, "w");
-			if (!output)
-			{
-				perror("fopen");
-				exit(1);
-			}
+		case 's':
+			period_us = atoi(optarg) * 1000;
 			break;
 		case 'h':
 			usage(argv[0]);
 			exit(0);
-			break;
 		default:
-			fprintf(stderr, "Invalid flag %c!\n", (char)optopt);
+			fprintf(stderr, "Invalid option %c!\n", (char)optopt);
 			usage(argv[0]);
 			exit(1);
-			break;
 		}
 	}
 
-	pci_dev = intel_get_pci_device();
-	devid = pci_dev->device_id;
-	intel_mmio_use_pci_bar(pci_dev);
-	init_instdone_definitions(devid);
-
-	/* Do we have a command to run? */
-	if (cmd != NULL) {
-		if (output) {
-			fprintf(output, "# Profiling: %s\n", cmd);
-			fflush(output);
-		}
-		child_pid = fork();
-		if (child_pid < 0) {
-			perror("fork");
-			exit(1);
-		}
-		else if (child_pid == 0) {
-			int res;
-			res = system(cmd);
-			if (res < 0)
-				perror("running command");
-			if (output) {
-				fflush(output);
-				fprintf(output, "# %s exited with status %d\n", cmd, res);
-				fflush(output);
-			}
-			free(cmd);
-			exit(0);
-		} else {
-			free(cmd);
-		}
+	engines = discover_engines();
+	if (!engines) {
+		fprintf(stderr,
+			"Failed to detect engines! (%s)\n(Kernel 4.16 or newer is required for i915 PMU support.)\n",
+			strerror(errno));
+		return 1;
 	}
 
-	for (i = 0; i < num_instdone_bits; i++) {
-		top_bits[i].bit = &instdone_bits[i];
-		top_bits[i].count = 0;
-		top_bits_sorted[i] = &top_bits[i];
+	ret = pmu_init(engines);
+	if (ret) {
+		fprintf(stderr,
+			"Failed to initialize PMU! (%s)\n", strerror(errno));
+		return 1;
 	}
 
-	/* Grab access to the registers */
-	intel_register_access_init(pci_dev, 0, -1);
+	pmu_sample(engines);
 
-	ring_init(&render_ring);
-	if (IS_GEN4(devid) || IS_GEN5(devid))
-		ring_init(&bsd_ring);
-	if (IS_GEN6(devid) || IS_GEN7(devid)) {
-		ring_init(&bsd6_ring);
-		ring_init(&blt_ring);
-	}
+	for (;;) {
+		double t;
+#define BUFSZ 16
+		char freq[BUFSZ];
+		char fact[BUFSZ];
+		char irq[BUFSZ];
+		char rc6[BUFSZ];
+		char power[BUFSZ];
+		char reads[BUFSZ];
+		char writes[BUFSZ];
+		struct winsize ws;
+		int lines = 0;
 
-	/* Initialize GPU stats */
-	if (HAS_STATS_REGS(devid)) {
-		for (i = 0; i < STATS_COUNT; i++) {
-			uint32_t stats_high, stats_low, stats_high_2;
+		/* Update terminal size. */
+		if (ioctl(0, TIOCGWINSZ, &ws) != -1) {
+			con_w = ws.ws_col;
+			con_h = ws.ws_row;
+		}
 
-			do {
-				stats_high = INREG(stats_regs[i] + 4);
-				stats_low = INREG(stats_regs[i]);
-				stats_high_2 = INREG(stats_regs[i] + 4);
-			} while (stats_high != stats_high_2);
+		pmu_sample(engines);
+		t = (double)(engines->ts.cur - engines->ts.prev) / 1e9;
 
-			last_stats[i] = (uint64_t)stats_high << 32 |
-				stats_low;
-		}
-	}
+		printf("\033[H\033[J");
 
-	for (;;) {
-		int j;
-		unsigned long long t1, ti, tf, t2;
-		unsigned long long def_sleep = 1000000 / samples_per_sec;
-		unsigned long long last_samples_per_sec = samples_per_sec;
-		unsigned short int max_lines;
-		struct winsize ws;
-		char clear_screen[] = {0x1b, '[', 'H',
-				       0x1b, '[', 'J',
-				       0x0};
-		int percent;
-		int len;
-
-		t1 = gettime();
-
-		ring_reset(&render_ring);
-		ring_reset(&bsd_ring);
-		ring_reset(&bsd6_ring);
-		ring_reset(&blt_ring);
-
-		for (i = 0; i < samples_per_sec; i++) {
-			long long interval;
-			ti = gettime();
-			if (IS_965(devid)) {
-				instdone = INREG(INSTDONE_I965);
-				instdone1 = INREG(INSTDONE_1);
-			} else
-				instdone = INREG(INSTDONE);
-
-			for (j = 0; j < num_instdone_bits; j++)
-				update_idle_bit(&top_bits[j]);
-
-			ring_sample(&render_ring);
-			ring_sample(&bsd_ring);
-			ring_sample(&bsd6_ring);
-			ring_sample(&blt_ring);
-
-			tf = gettime();
-			if (tf - t1 >= 1000000) {
-				/* We are out of sync, bail out */
-				last_samples_per_sec = i+1;
-				break;
-			}
-			interval = def_sleep - (tf - ti);
-			if (interval > 0)
-				usleep(interval);
-		}
+		pmu_calc(&engines->freq_req, freq, BUFSZ, 4, 0, 1.0, t, 1);
+		pmu_calc(&engines->freq_act, fact, BUFSZ, 4, 0, 1.0, t, 1);
+		pmu_calc(&engines->irq, irq, BUFSZ, 8, 0, 1.0, t, 1);
+		pmu_calc(&engines->rc6, rc6, BUFSZ, 3, 0, 1e9, t, 100);
+		pmu_calc(&engines->rapl, power, BUFSZ, 4, 2, 1.0, t,
+			 engines->rapl_scale);
+		pmu_calc(&engines->imc_reads, reads, BUFSZ, 6, 0, 1.0, t,
+			 engines->imc_reads_scale);
+		pmu_calc(&engines->imc_writes, writes, BUFSZ, 6, 0, 1.0, t,
+			 engines->imc_writes_scale);
 
-		if (HAS_STATS_REGS(devid)) {
-			for (i = 0; i < STATS_COUNT; i++) {
-				uint32_t stats_high, stats_low, stats_high_2;
+		if (lines++ < con_h)
+			printf("intel-gpu-top - %s/%s MHz;  %s%% RC6; %s %s; %s irqs/s\n",
+			       fact, freq, rc6, power, engines->rapl_unit, irq);
 
-				do {
-					stats_high = INREG(stats_regs[i] + 4);
-					stats_low = INREG(stats_regs[i]);
-					stats_high_2 = INREG(stats_regs[i] + 4);
-				} while (stats_high != stats_high_2);
+		if (lines++ < con_h)
+			printf("\n");
 
-				stats[i] = (uint64_t)stats_high << 32 |
-					stats_low;
-			}
-		}
+		if (engines->imc_fd) {
+			if (lines++ < con_h)
+				printf("      IMC reads:   %s %s/s\n",
+				       reads, engines->imc_reads_unit);
+
+			if (lines++ < con_h)
+				printf("     IMC writes:   %s %s/s\n",
+				       writes, engines->imc_writes_unit);
 
-		qsort(top_bits_sorted, num_instdone_bits,
-		      sizeof(struct top_bit *), top_bits_sort);
-
-		/* Limit the number of lines printed to the terminal height so the
-		 * most important info (at the top) will stay on screen. */
-		max_lines = -1;
-		if (ioctl(0, TIOCGWINSZ, &ws) != -1)
-			max_lines = ws.ws_row - 6; /* exclude header lines */
-		if (max_lines >= num_instdone_bits)
-			max_lines = num_instdone_bits;
-
-		t2 = gettime();
-		elapsed_time += (t2 - t1) / 1000000.0;
-
-		if (interactive) {
-			printf("%s", clear_screen);
-			print_clock_info(pci_dev);
-
-			ring_print(&render_ring, last_samples_per_sec);
-			ring_print(&bsd_ring, last_samples_per_sec);
-			ring_print(&bsd6_ring, last_samples_per_sec);
-			ring_print(&blt_ring, last_samples_per_sec);
-
-			printf("\n%30s  %s\n", "task", "percent busy");
-			for (i = 0; i < max_lines; i++) {
-				if (top_bits_sorted[i]->count > 0) {
-					percent = (top_bits_sorted[i]->count * 100) /
-						last_samples_per_sec;
-					len = printf("%30s: %3d%%: ",
-							 top_bits_sorted[i]->bit->name,
-							 percent);
-					print_percentage_bar (percent, len);
-				} else {
-					printf("%*s", PERCENTAGE_BAR_END, "");
-				}
-
-				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-					printf("%13s: %llu (%lld/sec)",
-						   stats_reg_names[i],
-						   (long long)stats[i],
-						   (long long)(stats[i] - last_stats[i]));
-					last_stats[i] = stats[i];
-				} else {
-					if (!top_bits_sorted[i]->count)
-						break;
-				}
+			if (++lines < con_h)
 				printf("\n");
-			}
 		}
-		if (output) {
-			/* Print headers for columns at first run */
-			if (print_headers) {
-				fprintf(output, "# time\t");
-				ring_print_header(output, &render_ring);
-				ring_print_header(output, &bsd_ring);
-				ring_print_header(output, &bsd6_ring);
-				ring_print_header(output, &blt_ring);
-				for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
-					if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-						fprintf(output, "%.6s\t",
-							   stats_reg_names[i]
-							   );
-					}
-					if (!top_bits[i].count)
-						continue;
-				}
-				fprintf(output, "\n");
-				print_headers = 0;
-			}
 
-			/* Print statistics */
-			fprintf(output, "%.2f\t", elapsed_time);
-			ring_log(&render_ring, last_samples_per_sec, output);
-			ring_log(&bsd_ring, last_samples_per_sec, output);
-			ring_log(&bsd6_ring, last_samples_per_sec, output);
-			ring_log(&blt_ring, last_samples_per_sec, output);
-
-			for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
-				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
-					fprintf(output, "%"PRIu64"\t",
-						   stats[i] - last_stats[i]);
-					last_stats[i] = stats[i];
-				}
-					if (!top_bits[i].count)
-						continue;
-			}
-			fprintf(output, "\n");
-			fflush(output);
-		}
+		for (i = 0; i < engines->num_engines; i++) {
+			struct engine *engine = engine_ptr(engines, i);
 
-		for (i = 0; i < num_instdone_bits; i++) {
-			top_bits_sorted[i]->count = 0;
+			if (engine->num_counters && lines < con_h) {
+				const char *a = "          ENGINE      BUSY ";
+				const char *b = " MI_SEMA MI_WAIT";
 
-			if (i < STATS_COUNT)
-				last_stats[i] = stats[i];
+				printf("\033[7m%s%*s%s\033[0m\n",
+				       a,
+				       (int)(con_w - 1 - strlen(a) - strlen(b)),
+				       " ", b);
+				lines++;
+				break;
+			}
 		}
 
-		/* Check if child has gone */
-		if (child_pid > 0) {
-			int res;
-			if ((res = waitpid(child_pid, &child_stat, WNOHANG)) == -1) {
-				perror("waitpid");
-				exit(1);
-			}
-			if (res == 0)
+		for (i = 0; i < engines->num_engines && lines < con_h; i++) {
+			struct engine *engine = engine_ptr(engines, i);
+			unsigned int max_w = con_w - 1;
+			unsigned int len;
+			char sema[BUFSZ];
+			char wait[BUFSZ];
+			char busy[BUFSZ];
+			char buf[128];
+			double val;
+
+			if (!engine->num_counters)
 				continue;
-			if (WIFEXITED(child_stat))
-				break;
+
+			pmu_calc(&engine->sema, sema, BUFSZ, 3, 0, 1e9, t, 100);
+			pmu_calc(&engine->wait, wait, BUFSZ, 3, 0, 1e9, t, 100);
+			len = snprintf(buf, sizeof(buf), "    %s%%    %s%%",
+				       sema, wait);
+
+			pmu_calc(&engine->busy, busy, BUFSZ, 6, 2, 1e9, t,
+				 100);
+			len += printf("%16s %s%% ", engine->display_name, busy);
+
+			val = __pmu_calc(&engine->busy.val, 1e9, t, 100);
+			print_percentage_bar(val, max_w - len);
+
+			printf("%s\n", buf);
+
+			lines++;
 		}
-	}
 
-	fclose(output);
+		if (lines++ < con_h)
+			printf("\n");
+
+		usleep(period_us);
+	}
 
-	intel_register_access_fini();
 	return 0;
 }
diff --git a/tools/meson.build b/tools/meson.build
index bd2d313d5156..a918eeb0bef1 100644
--- a/tools/meson.build
+++ b/tools/meson.build
@@ -23,7 +23,6 @@ tools_progs = [
 	'intel_gpu_frequency',
 	'intel_firmware_decode',
 	'intel_gpu_time',
-	'intel_gpu_top',
 	'intel_gtt',
 	'intel_guc_logger',
 	'intel_infoframes',
@@ -117,6 +116,11 @@ shared_library('intel_aubdump', 'aubdump.c',
 	       name_prefix : '',
 	       install : true)
 
+executable('intel_gpu_top', 'intel_gpu_top.c',
+	   install : true,
+	   install_rpath : rpathdir,
+	   dependencies : tool_deps + [ lib_igt_perf ])
+
 conf_data = configuration_data()
 conf_data.set('prefix', prefix)
 conf_data.set('exec_prefix', '${prefix}')
-- 
2.14.1

_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [igt-dev] ✗ Fi.CI.BAT: failure for intel-gpu-top: Rewrite the tool to be safe to use (rev5)
  2018-03-28 18:29 ` [Intel-gfx] " Tvrtko Ursulin
                   ` (11 preceding siblings ...)
  (?)
@ 2018-04-04 15:53 ` Patchwork
  -1 siblings, 0 replies; 57+ messages in thread
From: Patchwork @ 2018-04-04 15:53 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev

== Series Details ==

Series: intel-gpu-top: Rewrite the tool to be safe to use (rev5)
URL   : https://patchwork.freedesktop.org/series/40826/
State : failure

== Summary ==

IGT patchset tested on top of latest successful build
e861c22db76c1c59547ccb36b668b9204201e81a tests/kms_getfb: Use fixtures and subtest groups

with latest DRM-Tip kernel build CI_DRM_4021
8a51883453a9 drm-tip: 2018y-04m-04d-14h-43m-05s UTC integration manifest

No testlist changes.

---- Possible new issues:

Test pm_rpm:
        Subgroup basic-pci-d3-state:
                pass       -> INCOMPLETE (fi-bxt-dsi)

---- Known issues:

Test gem_mmap_gtt:
        Subgroup basic-small-bo-tiledx:
                pass       -> FAIL       (fi-gdg-551) fdo#102575
Test kms_pipe_crc_basic:
        Subgroup suspend-read-crc-pipe-b:
                pass       -> INCOMPLETE (fi-snb-2520m) fdo#103713

fdo#102575 https://bugs.freedesktop.org/show_bug.cgi?id=102575
fdo#103713 https://bugs.freedesktop.org/show_bug.cgi?id=103713

fi-bdw-5557u     total:285  pass:264  dwarn:0   dfail:0   fail:0   skip:21  time:431s
fi-bdw-gvtdvm    total:285  pass:261  dwarn:0   dfail:0   fail:0   skip:24  time:441s
fi-blb-e6850     total:285  pass:220  dwarn:1   dfail:0   fail:0   skip:64  time:382s
fi-bsw-n3050     total:285  pass:239  dwarn:0   dfail:0   fail:0   skip:46  time:537s
fi-bwr-2160      total:285  pass:180  dwarn:0   dfail:0   fail:0   skip:105 time:299s
fi-bxt-dsi       total:248  pass:219  dwarn:0   dfail:0   fail:0   skip:28 
fi-bxt-j4205     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:516s
fi-byt-j1900     total:285  pass:250  dwarn:0   dfail:0   fail:0   skip:35  time:520s
fi-byt-n2820     total:285  pass:246  dwarn:0   dfail:0   fail:0   skip:39  time:514s
fi-cfl-8700k     total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:410s
fi-cfl-s3        total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:561s
fi-cfl-u         total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:515s
fi-cnl-y3        total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:586s
fi-elk-e7500     total:285  pass:225  dwarn:1   dfail:0   fail:0   skip:59  time:425s
fi-gdg-551       total:285  pass:176  dwarn:0   dfail:0   fail:1   skip:108 time:317s
fi-glk-1         total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:543s
fi-glk-j4005     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:484s
fi-hsw-4770      total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:406s
fi-ilk-650       total:285  pass:225  dwarn:0   dfail:0   fail:0   skip:60  time:421s
fi-ivb-3520m     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:478s
fi-ivb-3770      total:285  pass:252  dwarn:0   dfail:0   fail:0   skip:33  time:438s
fi-kbl-7500u     total:285  pass:260  dwarn:1   dfail:0   fail:0   skip:24  time:471s
fi-kbl-7567u     total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:463s
fi-kbl-r         total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:516s
fi-pnv-d510      total:285  pass:220  dwarn:1   dfail:0   fail:0   skip:64  time:643s
fi-skl-6260u     total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:445s
fi-skl-6600u     total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:539s
fi-skl-6700k2    total:285  pass:261  dwarn:0   dfail:0   fail:0   skip:24  time:510s
fi-skl-6770hq    total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:505s
fi-skl-guc       total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:436s
fi-skl-gvtdvm    total:285  pass:262  dwarn:0   dfail:0   fail:0   skip:23  time:446s
fi-snb-2520m     total:242  pass:208  dwarn:0   dfail:0   fail:0   skip:33 
fi-snb-2600      total:285  pass:245  dwarn:0   dfail:0   fail:0   skip:40  time:404s
Blacklisted hosts:
fi-cnl-psr       total:285  pass:256  dwarn:3   dfail:0   fail:0   skip:26  time:524s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1224/issues.html
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [igt-dev] ✗ Fi.CI.IGT: warning for intel-gpu-top: Rewrite the tool to be safe to use (rev4)
  2018-03-28 18:29 ` [Intel-gfx] " Tvrtko Ursulin
                   ` (12 preceding siblings ...)
  (?)
@ 2018-04-04 16:56 ` Patchwork
  -1 siblings, 0 replies; 57+ messages in thread
From: Patchwork @ 2018-04-04 16:56 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev

== Series Details ==

Series: intel-gpu-top: Rewrite the tool to be safe to use (rev4)
URL   : https://patchwork.freedesktop.org/series/40826/
State : warning

== Summary ==

---- Possible new issues:

Test gem_exec_parallel:
        Subgroup vebox-contexts:
                fail       -> PASS       (shard-apl)
Test gem_pwrite:
        Subgroup big-cpu-forwards:
                pass       -> SKIP       (shard-apl)

---- Known issues:

Test kms_cursor_legacy:
        Subgroup flip-vs-cursor-legacy:
                pass       -> FAIL       (shard-hsw) fdo#102670
Test kms_flip:
        Subgroup flip-vs-expired-vblank-interruptible:
                pass       -> FAIL       (shard-hsw) fdo#102887
        Subgroup flip-vs-wf_vblank-interruptible:
                pass       -> FAIL       (shard-hsw) fdo#100368

fdo#102670 https://bugs.freedesktop.org/show_bug.cgi?id=102670
fdo#102887 https://bugs.freedesktop.org/show_bug.cgi?id=102887
fdo#100368 https://bugs.freedesktop.org/show_bug.cgi?id=100368

shard-apl        total:3498 pass:1833 dwarn:1   dfail:0   fail:7   skip:1656 time:12991s
shard-hsw        total:3498 pass:1782 dwarn:1   dfail:0   fail:4   skip:1710 time:11532s
shard-snb        total:3498 pass:1377 dwarn:1   dfail:0   fail:2   skip:2118 time:7097s
Blacklisted hosts:
shard-kbl        total:3443 pass:1924 dwarn:1   dfail:0   fail:10  skip:1507 time:9127s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1222/shards.html
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [igt-dev] [PATCH i-g-t v6] intel-gpu-top: Rewrite the tool to be safe to use
  2018-04-04 15:26     ` [igt-dev] " Tvrtko Ursulin
@ 2018-04-09 12:26       ` Tvrtko Ursulin
  -1 siblings, 0 replies; 57+ messages in thread
From: Tvrtko Ursulin @ 2018-04-09 12:26 UTC (permalink / raw)
  To: Tvrtko Ursulin, igt-dev
  Cc: Daniel Vetter, Eero Tamminen, Ben Widawsky, Intel-gfx


[Adding some people to Cc for more ack/nack type feedback.]

The executive question is: ack or nack on replacing intel_gpu_top with a new 
implementation which uses only perf PMU for counter gathering.

A short history on how this came to be:

There was a recent external patch contribution from Rinat Ibragimov to 
support more platforms in the existing intel_gpu_top. But as the tool 
is not safe to use, Chris Wilson suggested maybe just replacing it.

As it happens I had a good start to do this quickly and cheaply, in the 
form of one prototype I did recently, which only needed ripping some 
bits out, and polishing the rest.

Eero and Rinat kindly did a lot of platform coverage testing and the 
rewrite seems ready for next steps.

I need to stress that, as the commit message notes, the new tool has a 
slightly different scope in that it doesn't expose GPU functional level data, but 
only overall stats like power, frequencies, RC6, interrupts, IMC memory 
bandwidth and per command streamer busyness, mi_semaphore and mi_event 
waits. My thinking was that for more functional level profiling gpu-top 
(OA) should be used.

Also the "run a command" and CSV output features are not supported 
since both can be done directly via perf stat.

Regards,

Tvrtko

On 04/04/2018 16:26, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> intel-gpu-top is a dangerous tool which can hang machines due to unsafe mmio
> register access. This patch rewrites it to use only PMU.
> 
> Only overall command streamer busyness and GPU global data such as power
> and frequencies are included in this new version.
> 
> For access to more GPU functional unit level data, an OA metric based tool
> like gpu-top should be used instead.
> 
> v2:
>   * Sort engines by class and instance.
>   * Do not wait for one sampling period to display something on screen.
>   * Move code out of the asserts. (Rinat Ibragimov)
>   * Continuously adapt to terminal size. (Rinat Ibragimov)
> 
> v3:
>   * Change layout and precision of some field. (Chris Wilson)
>   Eero Tamminen:
>   * Use more user friendly engine names.
>   * Don't error out if a counter is missing.
>   * Add IMC read/write bandwidth.
>   * Report minimum required kernel version.
> 
> v4:
>   * Really support 4.16 by skipping of missing engines.
>   * Simpler and less hacky float printing.
>   * Preserve copyright header. (Antonio Argenziano)
>   * Simplify engines_ptr macro. (Rinat Ibragimov)
> 
> v5:
>   * Get RAPL unit from sysfs.
>   * Consolidate sysfs paths with a macro.
>   * Tidy error handling by carrying over and reporting errno.
>   * Check against console height on all prints.
>   * More readable minimum kernel version message. (Eero Tamminen)
>   * Column banner for per engine stats. (Eero Tamminen)
> 
> v6:
>   * Man page update. (Eero Tamminen)
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
> Cc: Petri Latvala <petri.latvala@intel.com>
> Cc: Eero Tamminen <eero.t.tamminen@intel.com>
> Cc: Rinat Ibragimov <ibragimovrinat@mail.ru>
> Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> # v1
> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> # v0.5
> ---
>   lib/igt_perf.c        |    6 +
>   lib/igt_perf.h        |    1 +
>   man/intel_gpu_top.rst |   41 +-
>   tools/Makefile.am     |    2 +
>   tools/intel_gpu_top.c | 1250 +++++++++++++++++++++++++++----------------------
>   tools/meson.build     |    6 +-
>   6 files changed, 719 insertions(+), 587 deletions(-)
> 
> diff --git a/lib/igt_perf.c b/lib/igt_perf.c
> index 99d82ea51c9b..e3dec2cc29c7 100644
> --- a/lib/igt_perf.c
> +++ b/lib/igt_perf.c
> @@ -69,3 +69,9 @@ int igt_perf_open(uint64_t type, uint64_t config)
>   	return _perf_open(type, config, -1,
>   			  PERF_FORMAT_TOTAL_TIME_ENABLED);
>   }
> +
> +int igt_perf_open_group(uint64_t type, uint64_t config, int group)
> +{
> +	return _perf_open(type, config, group,
> +			  PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_GROUP);
> +}
> diff --git a/lib/igt_perf.h b/lib/igt_perf.h
> index 614ea5d23fa6..e00718f4769a 100644
> --- a/lib/igt_perf.h
> +++ b/lib/igt_perf.h
> @@ -55,5 +55,6 @@ uint64_t i915_type_id(void);
>   int perf_i915_open(uint64_t config);
>   int perf_i915_open_group(uint64_t config, int group);
>   int igt_perf_open(uint64_t type, uint64_t config);
> +int igt_perf_open_group(uint64_t type, uint64_t config, int group);
>   
>   #endif /* I915_PERF_H */
> diff --git a/man/intel_gpu_top.rst b/man/intel_gpu_top.rst
> index a5f7175bb1a0..19c712307d28 100644
> --- a/man/intel_gpu_top.rst
> +++ b/man/intel_gpu_top.rst
> @@ -7,9 +7,9 @@ Display a top-like summary of Intel GPU usage
>   ---------------------------------------------
>   .. include:: defs.rst
>   :Author: IGT Developers <igt-dev@lists.freedesktop.org>
> -:Date: 2016-03-01
> +:Date: 2018-04-04
>   :Version: |PACKAGE_STRING|
> -:Copyright: 2009,2011,2012,2016 Intel Corporation
> +:Copyright: 2009,2011,2012,2016,2018 Intel Corporation
>   :Manual section: |MANUAL_SECTION|
>   :Manual group: |MANUAL_GROUP|
>   
> @@ -21,42 +21,25 @@ SYNOPSIS
>   DESCRIPTION
>   ===========
>   
> -**intel_gpu_top** is a tool to display usage information of an Intel GPU. It
> -requires root privilege to map the graphics device.
> +**intel_gpu_top** is a tool to display usage information on Intel GPUs.
> +
> +The tool gathers data using perf performance counters (PMU) exposed by i915 and other platform drivers like RAPL (power) and Uncore IMC (memory bandwidth).
>   
>   OPTIONS
>   =======
>   
> --s SAMPLES
> -    Number of samples to acquire per second.
> -
> --o FILE
> -    Collect usage statistics to FILE. If file is "-", run non-interactively
> -    and output statistics to stdout.
> -
> --e COMMAND
> -    Execute COMMAND to profile, and leave when it is finished. Note that the
> -    entire command with all parameters should be included as one parameter.
> +-s <ms>
> +    Refresh period in milliseconds.
>   
>   -h
> -    Show usage notes.
> +    Show help text.
>   
> -EXAMPLES
> -========
> -
> -intel_gpu_top -o "cairo-trace-gvim.log" -s 100 -e "cairo-perf-trace /tmp/gvim"
> -    Run cairo-perf-trace with /tmp/gvim trace, non-interactively, saving the
> -    statistics into cairo-trace-gvim.log file, and collecting 100 samples per
> -    second.
> -
> -Note that idle units are not displayed, so an entirely idle GPU will only
> -display the ring status and header.
> +LIMITATIONS
> +===========
>   
> -BUGS
> -====
> +* Not all metrics are supported on all platforms. Where a metric is unsupported its value will be replaced by a dashed line.
>   
> -Some GPUs report some units as busy when they aren't, such that even when idle
> -and not hung, it will show up as 100% busy.
> +* Non-root access to perf counters is controlled by the *perf_event_paranoid* sysctl.
>   
>   REPORTING BUGS
>   ==============
> diff --git a/tools/Makefile.am b/tools/Makefile.am
> index 09b6dbcc3ece..a0b016ddd7ff 100644
> --- a/tools/Makefile.am
> +++ b/tools/Makefile.am
> @@ -28,6 +28,8 @@ intel_aubdump_la_LDFLAGS = -module -avoid-version -no-undefined
>   intel_aubdump_la_SOURCES = aubdump.c
>   intel_aubdump_la_LIBADD = $(top_builddir)/lib/libintel_tools.la -ldl
>   
> +intel_gpu_top_LDADD = $(top_builddir)/lib/libigt_perf.la
> +
>   bin_SCRIPTS = intel_aubdump
>   CLEANFILES = $(bin_SCRIPTS)
>   
> diff --git a/tools/intel_gpu_top.c b/tools/intel_gpu_top.c
> index 098e6ce3ff86..b923c3cfbe97 100644
> --- a/tools/intel_gpu_top.c
> +++ b/tools/intel_gpu_top.c
> @@ -1,6 +1,5 @@
>   /*
> - * Copyright © 2007 Intel Corporation
> - * Copyright © 2011 Intel Corporation
> + * Copyright © 2007-2018 Intel Corporation
>    *
>    * Permission is hereby granted, free of charge, to any person obtaining a
>    * copy of this software and associated documentation files (the "Software"),
> @@ -24,695 +23,832 @@
>    * Authors:
>    *    Eric Anholt <eric@anholt.net>
>    *    Eugeni Dodonov <eugeni.dodonov@intel.com>
> - *
>    */
>   
> -#include "config.h"
> -
> -#include <inttypes.h>
> -#include <unistd.h>
> -#include <stdlib.h>
>   #include <stdio.h>
> -#include <err.h>
> -#include <sys/ioctl.h>
> -#include <sys/time.h>
> -#include <sys/wait.h>
> +#include <sys/types.h>
> +#include <dirent.h>
> +#include <stdint.h>
> +#include <assert.h>
>   #include <string.h>
> -#ifdef HAVE_TERMIOS_H
> -#include <termios.h>
> -#endif
> -#include "intel_io.h"
> -#include "instdone.h"
> -#include "intel_reg.h"
> -#include "intel_chipset.h"
> -#include "drmtest.h"
> -
> -#define  FORCEWAKE	    0xA18C
> -#define  FORCEWAKE_ACK	    0x130090
> -
> -#define SAMPLES_PER_SEC             10000
> -#define SAMPLES_TO_PERCENT_RATIO    (SAMPLES_PER_SEC / 100)
> -
> -#define MAX_NUM_TOP_BITS            100
> -
> -#define HAS_STATS_REGS(devid)		IS_965(devid)
> -
> -struct top_bit {
> -	struct instdone_bit *bit;
> -	int count;
> -} top_bits[MAX_NUM_TOP_BITS];
> -struct top_bit *top_bits_sorted[MAX_NUM_TOP_BITS];
> -
> -static uint32_t instdone, instdone1;
> -
> -static const char *bars[] = {
> -	" ",
> -	"▏",
> -	"▎",
> -	"▍",
> -	"▌",
> -	"▋",
> -	"▊",
> -	"▉",
> -	"█"
> -};
> +#include <ctype.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <sys/stat.h>
> +#include <fcntl.h>
> +#include <inttypes.h>
> +#include <sys/ioctl.h>
> +#include <errno.h>
> +#include <math.h>
> +#include <locale.h>
> +
> +#include "igt_perf.h"
>   
> -enum stats_counts {
> -	IA_VERTICES,
> -	IA_PRIMITIVES,
> -	VS_INVOCATION,
> -	GS_INVOCATION,
> -	GS_PRIMITIVES,
> -	CL_INVOCATION,
> -	CL_PRIMITIVES,
> -	PS_INVOCATION,
> -	PS_DEPTH,
> -	STATS_COUNT
> +struct pmu_pair {
> +	uint64_t cur;
> +	uint64_t prev;
>   };
>   
> -const uint32_t stats_regs[STATS_COUNT] = {
> -	IA_VERTICES_COUNT_QW,
> -	IA_PRIMITIVES_COUNT_QW,
> -	VS_INVOCATION_COUNT_QW,
> -	GS_INVOCATION_COUNT_QW,
> -	GS_PRIMITIVES_COUNT_QW,
> -	CL_INVOCATION_COUNT_QW,
> -	CL_PRIMITIVES_COUNT_QW,
> -	PS_INVOCATION_COUNT_QW,
> -	PS_DEPTH_COUNT_QW,
> +struct pmu_counter {
> +	bool present;
> +	uint64_t config;
> +	unsigned int idx;
> +	struct pmu_pair val;
>   };
>   
> -const char *stats_reg_names[STATS_COUNT] = {
> -	"vert fetch",
> -	"prim fetch",
> -	"VS invocations",
> -	"GS invocations",
> -	"GS prims",
> -	"CL invocations",
> -	"CL prims",
> -	"PS invocations",
> -	"PS depth pass",
> +struct engine {
> +	const char *name;
> +	const char *display_name;
> +
> +	unsigned int class;
> +	unsigned int instance;
> +
> +	unsigned int num_counters;
> +
> +	struct pmu_counter busy;
> +	struct pmu_counter wait;
> +	struct pmu_counter sema;
>   };
>   
> -uint64_t stats[STATS_COUNT];
> -uint64_t last_stats[STATS_COUNT];
> +struct engines {
> +	unsigned int num_engines;
> +	unsigned int num_counters;
> +	DIR *root;
> +	int fd;
> +	struct pmu_pair ts;
> +
> +	int rapl_fd;
> +	double rapl_scale;
> +	const char *rapl_unit;
> +
> +	int imc_fd;
> +	double imc_reads_scale;
> +	const char *imc_reads_unit;
> +	double imc_writes_scale;
> +	const char *imc_writes_unit;
> +
> +	struct pmu_counter freq_req;
> +	struct pmu_counter freq_act;
> +	struct pmu_counter irq;
> +	struct pmu_counter rc6;
> +	struct pmu_counter rapl;
> +	struct pmu_counter imc_reads;
> +	struct pmu_counter imc_writes;
> +
> +	struct engine engine;
> +};
>   
> -static unsigned long
> -gettime(void)
> +static uint64_t
> +get_pmu_config(int dirfd, const char *name, const char *counter)
>   {
> -    struct timeval t;
> -    gettimeofday(&t, NULL);
> -    return (t.tv_usec + (t.tv_sec * 1000000));
> -}
> +	char buf[128], *p;
> +	int fd, ret;
>   
> -static int
> -top_bits_sort(const void *a, const void *b)
> -{
> -	struct top_bit * const *bit_a = a;
> -	struct top_bit * const *bit_b = b;
> -	int a_count = (*bit_a)->count;
> -	int b_count = (*bit_b)->count;
> +	ret = snprintf(buf, sizeof(buf), "%s-%s", name, counter);
> +	if (ret < 0 || ret == sizeof(buf))
> +		return -1;
>   
> -	if (a_count < b_count)
> -		return 1;
> -	else if (a_count == b_count)
> -		return 0;
> -	else
> +	fd = openat(dirfd, buf, O_RDONLY);
> +	if (fd < 0)
>   		return -1;
> -}
>   
> -static void
> -update_idle_bit(struct top_bit *top_bit)
> -{
> -	uint32_t reg_val;
> +	ret = read(fd, buf, sizeof(buf));
> +	close(fd);
> +	if (ret <= 0)
> +		return -1;
>   
> -	if (top_bit->bit->reg == INSTDONE_1)
> -		reg_val = instdone1;
> -	else
> -		reg_val = instdone;
> +	p = index(buf, '0');
> +	if (!p)
> +		return -1;
>   
> -	if ((reg_val & top_bit->bit->bit) == 0)
> -		top_bit->count++;
> +	return strtoul(p, NULL, 0);
>   }
>   
> -static void
> -print_clock(const char *name, int clock) {
> -	if (clock == -1)
> -		printf("%s clock: unknown", name);
> +#define engine_ptr(engines, n) (&engines->engine + (n))
> +
> +static const char *class_display_name(unsigned int class)
> +{
> +	switch (class) {
> +	case I915_ENGINE_CLASS_RENDER:
> +		return "Render/3D";
> +	case I915_ENGINE_CLASS_COPY:
> +		return "Blitter";
> +	case I915_ENGINE_CLASS_VIDEO:
> +		return "Video";
> +	case I915_ENGINE_CLASS_VIDEO_ENHANCE:
> +		return "VideoEnhance";
> +	default:
> +		return "[unknown]";
> +	}
> +}
> +
> +static int engine_cmp(const void *__a, const void *__b)
> +{
> +	const struct engine *a = (struct engine *)__a;
> +	const struct engine *b = (struct engine *)__b;
> +
> +	if (a->class != b->class)
> +		return a->class - b->class;
>   	else
> -		printf("%s clock: %d Mhz", name, clock);
> +		return a->instance - b->instance;
>   }
>   
> -static int
> -print_clock_info(struct pci_device *pci_dev)
> +static struct engines *discover_engines(void)
>   {
> -	uint32_t devid = pci_dev->device_id;
> -	uint16_t gcfgc;
> +	const char *sysfs_root = "/sys/devices/i915/events";
> +	struct engines *engines;
> +	struct dirent *dent;
> +	int ret = 0;
> +	DIR *d;
>   
> -	if (IS_GM45(devid)) {
> -		int core_clock = -1;
> +	engines = malloc(sizeof(struct engines));
> +	if (!engines)
> +		return NULL;
>   
> -		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +	memset(engines, 0, sizeof(*engines));
>   
> -		switch (gcfgc & 0xf) {
> -		case 8:
> -			core_clock = 266;
> -			break;
> -		case 9:
> -			core_clock = 320;
> -			break;
> -		case 11:
> -			core_clock = 400;
> -			break;
> -		case 13:
> -			core_clock = 533;
> -			break;
> -		}
> -		print_clock("core", core_clock);
> -	} else if (IS_965(devid) && IS_MOBILE(devid)) {
> -		int render_clock = -1, sampler_clock = -1;
> +	engines->num_engines = 0;
>   
> -		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +	d = opendir(sysfs_root);
> +	if (!d)
> +		return NULL;
>   
> -		switch (gcfgc & 0xf) {
> -		case 2:
> -			render_clock = 250; sampler_clock = 267;
> -			break;
> -		case 3:
> -			render_clock = 320; sampler_clock = 333;
> -			break;
> -		case 4:
> -			render_clock = 400; sampler_clock = 444;
> -			break;
> -		case 5:
> -			render_clock = 500; sampler_clock = 533;
> +	while ((dent = readdir(d)) != NULL) {
> +		const char *endswith = "-busy";
> +		const unsigned int endlen = strlen(endswith);
> +		struct engine *engine =
> +				engine_ptr(engines, engines->num_engines);
> +		char buf[256];
> +
> +		if (dent->d_type != DT_REG)
> +			continue;
> +
> +		if (strlen(dent->d_name) >= sizeof(buf)) {
> +			ret = ENAMETOOLONG;
>   			break;
>   		}
>   
> -		print_clock("render", render_clock);
> -		printf("  ");
> -		print_clock("sampler", sampler_clock);
> -	} else if (IS_945(devid) && IS_MOBILE(devid)) {
> -		int render_clock = -1, display_clock = -1;
> +		strcpy(buf, dent->d_name);
>   
> -		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +		/* xxxN-busy */
> +		if (strlen(buf) < (endlen + 4))
> +			continue;
> +		if (strcmp(&buf[strlen(buf) - endlen], endswith))
> +			continue;
>   
> -		switch (gcfgc & 0x7) {
> -		case 0:
> -			render_clock = 166;
> -			break;
> -		case 1:
> -			render_clock = 200;
> -			break;
> -		case 3:
> -			render_clock = 250;
> -			break;
> -		case 5:
> -			render_clock = 400;
> +		memset(engine, 0, sizeof(*engine));
> +
> +		buf[strlen(buf) - endlen] = 0;
> +		engine->name = strdup(buf);
> +		if (!engine->name) {
> +			ret = errno;
>   			break;
>   		}
>   
> -		switch (gcfgc & 0x70) {
> -		case 0:
> -			display_clock = 200;
> -			break;
> -		case 4:
> -			display_clock = 320;
> +		engine->busy.config = get_pmu_config(dirfd(d), engine->name,
> +						     "busy");
> +		if (engine->busy.config == -1) {
> +			ret = ENOENT;
>   			break;
>   		}
> -		if (gcfgc & (1 << 7))
> -		    display_clock = 133;
>   
> -		print_clock("render", render_clock);
> -		printf("  ");
> -		print_clock("display", display_clock);
> -	} else if (IS_915(devid) && IS_MOBILE(devid)) {
> -		int render_clock = -1, display_clock = -1;
> +		engine->class = (engine->busy.config &
> +				 (__I915_PMU_OTHER(0) - 1)) >>
> +				I915_PMU_CLASS_SHIFT;
>   
> -		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +		engine->instance = (engine->busy.config >>
> +				    I915_PMU_SAMPLE_BITS) &
> +				    ((1 << I915_PMU_SAMPLE_INSTANCE_BITS) - 1);
>   
> -		switch (gcfgc & 0x7) {
> -		case 0:
> -			render_clock = 160;
> -			break;
> -		case 1:
> -			render_clock = 190;
> -			break;
> -		case 4:
> -			render_clock = 333;
> +		ret = snprintf(buf, sizeof(buf), "%s/%u",
> +			       class_display_name(engine->class),
> +			       engine->instance);
> +		if (ret < 0 || ret == sizeof(buf)) {
> +			ret = ENOBUFS;
>   			break;
>   		}
> -		if (gcfgc & (1 << 13))
> -		    render_clock = 133;
> +		ret = 0;
>   
> -		switch (gcfgc & 0x70) {
> -		case 0:
> -			display_clock = 190;
> +		engine->display_name = strdup(buf);
> +		if (!engine->display_name) {
> +			ret = errno;
>   			break;
> -		case 4:
> -			display_clock = 333;
> +		}
> +
> +		engines->num_engines++;
> +		engines = realloc(engines, sizeof(struct engines) +
> +				  engines->num_engines * sizeof(struct engine));
> +		if (!engines) {
> +			ret = errno;
>   			break;
>   		}
> -		if (gcfgc & (1 << 7))
> -		    display_clock = 133;
> +	}
> +
> +	if (ret) {
> +		free(engines);
> +		errno = ret;
>   
> -		print_clock("render", render_clock);
> -		printf("  ");
> -		print_clock("display", display_clock);
> +		return NULL;
>   	}
>   
> +	qsort(engine_ptr(engines, 0), engines->num_engines,
> +	      sizeof(struct engine), engine_cmp);
> +
> +	engines->root = d;
>   
> -	printf("\n");
> -	return -1;
> +	return engines;
>   }
>   
> -#define STATS_LEN (20)
> -#define PERCENTAGE_BAR_END	(79 - STATS_LEN)
> +static int
> +filename_to_buf(const char *filename, char *buf, unsigned int bufsize)
> +{
> +	int fd, err;
> +	ssize_t ret;
>   
> -static void
> -print_percentage_bar(float percent, int cur_line_len)
> +	fd = open(filename, O_RDONLY);
> +	if (fd < 0)
> +		return -1;
> +
> +	ret = read(fd, buf, bufsize - 1);
> +	err = errno;
> +	close(fd);
> +	if (ret < 1) {
> +		errno = ret < 0 ? err : ENOMSG;
> +
> +		return -1;
> +	}
> +
> +	if (ret > 1 && buf[ret - 1] == '\n')
> +		buf[ret - 1] = '\0';
> +	else
> +		buf[ret] = '\0';
> +
> +	return 0;
> +}
> +
> +static uint64_t filename_to_u64(const char *filename, int base)
>   {
> -	int bar_avail_len = (PERCENTAGE_BAR_END - cur_line_len - 1) * 8;
> -	int bar_len = bar_avail_len * (percent + .5) / 100.0;
> -	int i;
> +	char buf[64], *b;
>   
> -	for (i = bar_len; i >= 8; i -= 8) {
> -		printf("%s", bars[8]);
> -		cur_line_len++;
> +	if (filename_to_buf(filename, buf, sizeof(buf)))
> +		return 0;
> +
> +	/*
> +	 * Handle both single integer and key=value formats by skipping
> +	 * leading non-digits.
> +	 */
> +	b = buf;
> +	while (*b && !isdigit(*b))
> +		b++;
> +
> +	return strtoull(b, NULL, base);
> +}
> +
> +static double filename_to_double(const char *filename)
> +{
> +	char *oldlocale;
> +	char buf[80];
> +	double v;
> +
> +	if (filename_to_buf(filename, buf, sizeof(buf)))
> +		return 0;
> +
> +	oldlocale = setlocale(LC_ALL, "C");
> +	v = strtod(buf, NULL);
> +	setlocale(LC_ALL, oldlocale);
> +
> +	return v;
> +}
> +
> +#define RAPL_ROOT "/sys/devices/power/"
> +#define RAPL_EVENT "/sys/devices/power/events/"
> +
> +static uint64_t rapl_type_id(void)
> +{
> +	return filename_to_u64(RAPL_ROOT "type", 10);
> +}
> +
> +static uint64_t rapl_gpu_power(void)
> +{
> +	return filename_to_u64(RAPL_EVENT "energy-gpu", 0);
> +}
> +
> +static double rapl_gpu_power_scale(void)
> +{
> +	return filename_to_double(RAPL_EVENT "energy-gpu.scale");
> +}
> +
> +static const char *rapl_gpu_power_unit(void)
> +{
> +	char buf[32];
> +
> +	if (filename_to_buf(RAPL_EVENT "energy-gpu.unit",
> +			    buf, sizeof(buf)) == 0)
> +		if (!strcmp(buf, "Joules"))
> +			return strdup("Watts");
> +		else
> +			return strdup(buf);
> +	else
> +		return NULL;
> +}
> +
> +#define IMC_ROOT "/sys/devices/uncore_imc/"
> +#define IMC_EVENT "/sys/devices/uncore_imc/events/"
> +
> +static uint64_t imc_type_id(void)
> +{
> +	return filename_to_u64(IMC_ROOT "type", 10);
> +}
> +
> +static uint64_t imc_data_reads(void)
> +{
> +	return filename_to_u64(IMC_EVENT "data_reads", 0);
> +}
> +
> +static double imc_data_reads_scale(void)
> +{
> +	return filename_to_double(IMC_EVENT "data_reads.scale");
> +}
> +
> +static const char *imc_data_reads_unit(void)
> +{
> +	char buf[32];
> +
> +	if (filename_to_buf(IMC_EVENT "data_reads.unit", buf, sizeof(buf)) == 0)
> +		return strdup(buf);
> +	else
> +		return NULL;
> +}
> +
> +static uint64_t imc_data_writes(void)
> +{
> +	return filename_to_u64(IMC_EVENT "data_writes", 0);
> +}
> +
> +static double imc_data_writes_scale(void)
> +{
> +	return filename_to_double(IMC_EVENT "data_writes.scale");
> +}
> +
> +static const char *imc_data_writes_unit(void)
> +{
> +	char buf[32];
> +
> +	if (filename_to_buf(IMC_EVENT "data_writes.unit",
> +			    buf, sizeof(buf)) == 0)
> +		return strdup(buf);
> +	else
> +		return NULL;
> +}
> +
> +#define _open_pmu(cnt, pmu, fd) \
> +({ \
> +	int fd__; \
> +\
> +	fd__ = perf_i915_open_group((pmu)->config, (fd)); \
> +	if (fd__ >= 0) { \
> +		if ((fd) == -1) \
> +			(fd) = fd__; \
> +		(pmu)->present = true; \
> +		(pmu)->idx = (cnt)++; \
> +	} \
> +\
> +	fd__; \
> +})
> +
> +#define _open_imc(cnt, pmu, fd) \
> +({ \
> +	int fd__; \
> +\
> +	fd__ = igt_perf_open_group(imc_type_id(), (pmu)->config, (fd)); \
> +	if (fd__ >= 0) { \
> +		if ((fd) == -1) \
> +			(fd) = fd__; \
> +		(pmu)->present = true; \
> +		(pmu)->idx = (cnt)++; \
> +	} \
> +\
> +	fd__; \
> +})
> +
> +static int pmu_init(struct engines *engines)
> +{
> +	unsigned int i;
> +	int fd;
> +
> +	engines->fd = -1;
> +	engines->num_counters = 0;
> +
> +	engines->irq.config = I915_PMU_INTERRUPTS;
> +	fd = _open_pmu(engines->num_counters, &engines->irq, engines->fd);
> +	if (fd < 0)
> +		return -1;
> +
> +	engines->freq_req.config = I915_PMU_REQUESTED_FREQUENCY;
> +	_open_pmu(engines->num_counters, &engines->freq_req, engines->fd);
> +
> +	engines->freq_act.config = I915_PMU_ACTUAL_FREQUENCY;
> +	_open_pmu(engines->num_counters, &engines->freq_act, engines->fd);
> +
> +	engines->rc6.config = I915_PMU_RC6_RESIDENCY;
> +	_open_pmu(engines->num_counters, &engines->rc6, engines->fd);
> +
> +	for (i = 0; i < engines->num_engines; i++) {
> +		struct engine *engine = engine_ptr(engines, i);
> +		struct {
> +			struct pmu_counter *pmu;
> +			const char *counter;
> +		} *cnt, counters[] = {
> +			{ .pmu = &engine->busy, .counter = "busy" },
> +			{ .pmu = &engine->wait, .counter = "wait" },
> +			{ .pmu = &engine->sema, .counter = "sema" },
> +			{ .pmu = NULL, .counter = NULL },
> +		};
> +
> +		for (cnt = counters; cnt->pmu; cnt++) {
> +			if (!cnt->pmu->config)
> +				cnt->pmu->config =
> +					get_pmu_config(dirfd(engines->root),
> +						       engine->name,
> +						       cnt->counter);
> +			fd = _open_pmu(engines->num_counters, cnt->pmu,
> +				       engines->fd);
> +			if (fd >= 0)
> +				engine->num_counters++;
> +		}
>   	}
> -	if (i) {
> -		printf("%s", bars[i]);
> -		cur_line_len++;
> +
> +	engines->rapl_fd = -1;
> +	if (rapl_type_id()) {
> +		engines->rapl_scale = rapl_gpu_power_scale();
> +		engines->rapl_unit = rapl_gpu_power_unit();
> +		if (!engines->rapl_unit)
> +			return -1;
> +
> +		engines->rapl.config = rapl_gpu_power();
> +		if (!engines->rapl.config)
> +			return -1;
> +
> +		engines->rapl_fd = igt_perf_open(rapl_type_id(),
> +						 engines->rapl.config);
> +		if (engines->rapl_fd < 0)
> +			return -1;
> +
> +		engines->rapl.present = true;
>   	}
>   
> -	/* NB: We can't use a field width with utf8 so we manually
> -	* guarantee a field with of 45 chars for any bar. */
> -	printf("%*s", PERCENTAGE_BAR_END - cur_line_len, "");
> -}
> +	engines->imc_fd = -1;
> +	if (imc_type_id()) {
> +		unsigned int num = 0;
>   
> -struct ring {
> -	const char *name;
> -	uint32_t mmio;
> -	int head, tail, size;
> -	uint64_t full;
> -	int idle;
> -};
> +		engines->imc_reads_scale = imc_data_reads_scale();
> +		engines->imc_writes_scale = imc_data_writes_scale();
> +
> +		engines->imc_reads_unit = imc_data_reads_unit();
> +		if (!engines->imc_reads_unit)
> +			return -1;
> +
> +		engines->imc_writes_unit = imc_data_writes_unit();
> +		if (!engines->imc_writes_unit)
> +			return -1;
> +
> +		engines->imc_reads.config = imc_data_reads();
> +		if (!engines->imc_reads.config)
> +			return -1;
> +
> +		engines->imc_writes.config = imc_data_writes();
> +		if (!engines->imc_writes.config)
> +			return -1;
> +
> +		fd = _open_imc(num, &engines->imc_reads, engines->imc_fd);
> +		if (fd < 0)
> +			return -1;
> +		fd = _open_imc(num, &engines->imc_writes, engines->imc_fd);
> +		if (fd < 0)
> +			return -1;
> +
> +		engines->imc_reads.present = true;
> +		engines->imc_writes.present = true;
> +	}
> +
> +	return 0;
> +}
>   
> -static uint32_t ring_read(struct ring *ring, uint32_t reg)
> +static uint64_t pmu_read_multi(int fd, unsigned int num, uint64_t *val)
>   {
> -	return INREG(ring->mmio + reg);
> +	uint64_t buf[2 + num];
> +	unsigned int i;
> +	ssize_t len;
> +
> +	memset(buf, 0, sizeof(buf));
> +
> +	len = read(fd, buf, sizeof(buf));
> +	assert(len == sizeof(buf));
> +
> +	for (i = 0; i < num; i++)
> +		val[i] = buf[2 + i];
> +
> +	return buf[1];
>   }
>   
> -static void ring_init(struct ring *ring)
> +static double __pmu_calc(struct pmu_pair *p, double d, double t, double s)
>   {
> -	ring->size = (((ring_read(ring, RING_LEN) & RING_NR_PAGES) >> 12) + 1) * 4096;
> +	double v;
> +
> +	v = p->cur - p->prev;
> +	v /= d;
> +	v /= t;
> +	v *= s;
> +
> +	if (s == 100.0 && v > 100.0)
> +		v = 100.0;
> +
> +	return v;
>   }
>   
> -static void ring_reset(struct ring *ring)
> +static void fill_str(char *buf, unsigned int bufsz, char c, unsigned int num)
>   {
> -	ring->idle = ring->full = 0;
> +	unsigned int i;
> +
> +	for (i = 0; i < num && i < (bufsz - 1); i++)
> +		*buf++ = c;
> +
> +	*buf = 0;
>   }
>   
> -static void ring_sample(struct ring *ring)
> +static void pmu_calc(struct pmu_counter *cnt,
> +		     char *buf, unsigned int bufsz,
> +		     unsigned int width, unsigned width_dec,
> +		     double d, double t, double s)
>   {
> -	int full;
> +	double val;
> +	int len;
> +
> +	assert(bufsz >= (width + width_dec + 1));
> +
> +	if (!cnt->present) {
> +		fill_str(buf, bufsz, '-', width + width_dec);
> +		return;
> +	}
>   
> -	if (!ring->size)
> +	val = __pmu_calc(&cnt->val, d, t, s);
> +
> +	len = snprintf(buf, bufsz, "%*.*f", width + width_dec, width_dec, val);
> +	if (len < 0 || len == bufsz) {
> +		fill_str(buf, bufsz, 'X', width + width_dec);
>   		return;
> +	}
> +}
> +
> +static uint64_t __pmu_read_single(int fd, uint64_t *ts)
> +{
> +	uint64_t data[2] = { };
> +	ssize_t len;
>   
> -	ring->head = ring_read(ring, RING_HEAD) & HEAD_ADDR;
> -	ring->tail = ring_read(ring, RING_TAIL) & TAIL_ADDR;
> +	len = read(fd, data, sizeof(data));
> +	assert(len == sizeof(data));
>   
> -	if (ring->tail == ring->head)
> -		ring->idle++;
> +	if (ts)
> +		*ts = data[1];
>   
> -	full = ring->tail - ring->head;
> -	if (full < 0)
> -		full += ring->size;
> -	ring->full += full;
> +	return data[0];
>   }
>   
> -static void ring_print_header(FILE *out, struct ring *ring)
> +static uint64_t pmu_read_single(int fd)
>   {
> -    fprintf(out, "%.6s%%\tops\t",
> -            ring->name
> -          );
> +	return __pmu_read_single(fd, NULL);
>   }
>   
> -static void ring_print(struct ring *ring, unsigned long samples_per_sec)
> +static void __update_sample(struct pmu_counter *counter, uint64_t val)
>   {
> -	int percent_busy, len;
> +	counter->val.prev = counter->val.cur;
> +	counter->val.cur = val;
> +}
>   
> -	if (!ring->size)
> -		return;
> +static void update_sample(struct pmu_counter *counter, uint64_t *val)
> +{
> +	if (counter->present)
> +		__update_sample(counter, val[counter->idx]);
> +}
> +
> +static void pmu_sample(struct engines *engines)
> +{
> +	const int num_val = engines->num_counters;
> +	uint64_t val[2 + num_val];
> +	unsigned int i;
> +
> +	engines->ts.prev = engines->ts.cur;
> +
> +	if (engines->rapl_fd >= 0)
> +		__update_sample(&engines->rapl,
> +				pmu_read_single(engines->rapl_fd));
> +
> +	if (engines->imc_fd >= 0) {
> +		pmu_read_multi(engines->imc_fd, 2, val);
> +		update_sample(&engines->imc_reads, val);
> +		update_sample(&engines->imc_writes, val);
> +	}
>   
> -	percent_busy = 100 - 100 * ring->idle / samples_per_sec;
> +	engines->ts.cur = pmu_read_multi(engines->fd, num_val, val);
>   
> -	len = printf("%25s busy: %3d%%: ", ring->name, percent_busy);
> -	print_percentage_bar (percent_busy, len);
> -	printf("%24s space: %d/%d\n",
> -		   ring->name,
> -		   (int)(ring->full / samples_per_sec),
> -		   ring->size);
> +	update_sample(&engines->freq_req, val);
> +	update_sample(&engines->freq_act, val);
> +	update_sample(&engines->irq, val);
> +	update_sample(&engines->rc6, val);
> +
> +	for (i = 0; i < engines->num_engines; i++) {
> +		struct engine *engine = engine_ptr(engines, i);
> +
> +		update_sample(&engine->busy, val);
> +		update_sample(&engine->sema, val);
> +		update_sample(&engine->wait, val);
> +	}
>   }
>   
> -static void ring_log(struct ring *ring, unsigned long samples_per_sec,
> -		FILE *output)
> +static const char *bars[] = { " ", "▏", "▎", "▍", "▌", "▋", "▊", "▉", "█" };
> +
> +static void
> +print_percentage_bar(double percent, int max_len)
>   {
> -	if (ring->size)
> -		fprintf(output, "%3d\t%d\t",
> -			(int)(100 - 100 * ring->idle / samples_per_sec),
> -			(int)(ring->full / samples_per_sec));
> -	else
> -		fprintf(output, "-1\t-1\t");
> +	int bar_len = percent * (8 * (max_len - 2)) / 100.0;
> +	int i;
> +
> +	putchar('|');
> +
> +	for (i = bar_len; i >= 8; i -= 8)
> +		printf("%s", bars[8]);
> +	if (i)
> +		printf("%s", bars[i]);
> +
> +	for (i = 0; i < (max_len - 2 - (bar_len + 7) / 8); i++)
> +		putchar(' ');
> +
> +	putchar('|');
>   }
>   
> +#define DEFAULT_PERIOD_MS (1000)
> +
>   static void
>   usage(const char *appname)
>   {
>   	printf("intel_gpu_top - Display a top-like summary of Intel GPU usage\n"
> -			"\n"
> -			"usage: %s [parameters]\n"
> -			"\n"
> -			"The following parameters apply:\n"
> -			"[-s <samples>]       samples per seconds (default %d)\n"
> -			"[-e <command>]       command to profile\n"
> -			"[-o <file>]          output statistics to file. If file is '-',"
> -			"                     run in batch mode and output statistics to stdio only \n"
> -			"[-h]                 show this help screen\n"
> -			"\n",
> -			appname,
> -			SAMPLES_PER_SEC
> -		  );
> -	return;
> +		"\n"
> +		"Usage: %s [parameters]\n"
> +		"\n"
> +		"\tThe following parameters are optional:\n\n"
> +		"\t[-s <ms>]       Refresh period in milliseconds (default %ums).\n"
> +		"\t[-h]            Show this help text.\n"
> +		"\n",
> +		appname, DEFAULT_PERIOD_MS);
>   }
>   
>   int main(int argc, char **argv)
>   {
> -	uint32_t devid;
> -	struct pci_device *pci_dev;
> -	struct ring render_ring = {
> -		.name = "render",
> -		.mmio = 0x2030,
> -	}, bsd_ring = {
> -		.name = "bitstream",
> -		.mmio = 0x4030,
> -	}, bsd6_ring = {
> -		.name = "bitstream",
> -		.mmio = 0x12030,
> -	}, blt_ring = {
> -		.name = "blitter",
> -		.mmio = 0x22030,
> -	};
> -	int i, ch;
> -	int samples_per_sec = SAMPLES_PER_SEC;
> -	FILE *output = NULL;
> -	double elapsed_time=0;
> -	int print_headers=1;
> -	pid_t child_pid=-1;
> -	int child_stat;
> -	char *cmd=NULL;
> -	int interactive=1;
> -
> -	/* Parse options? */
> -	while ((ch = getopt(argc, argv, "s:o:e:h")) != -1) {
> +	unsigned int period_us = DEFAULT_PERIOD_MS * 1000;
> +	int con_w = -1, con_h = -1;
> +	struct engines *engines;
> +	unsigned int i;
> +	int ret, ch;
> +
> +	/* Parse options */
> +	while ((ch = getopt(argc, argv, "s:h")) != -1) {
>   		switch (ch) {
> -		case 'e': cmd = strdup(optarg);
> -			break;
> -		case 's': samples_per_sec = atoi(optarg);
> -			if (samples_per_sec < 100) {
> -				fprintf(stderr, "Error: samples per second must be >= 100\n");
> -				exit(1);
> -			}
> -			break;
> -		case 'o':
> -			if (!strcmp(optarg, "-")) {
> -				/* Running in non-interactive mode */
> -				interactive = 0;
> -				output = stdout;
> -			}
> -			else
> -				output = fopen(optarg, "w");
> -			if (!output)
> -			{
> -				perror("fopen");
> -				exit(1);
> -			}
> +		case 's':
> +			period_us = atoi(optarg) * 1000;
>   			break;
>   		case 'h':
>   			usage(argv[0]);
>   			exit(0);
> -			break;
>   		default:
> -			fprintf(stderr, "Invalid flag %c!\n", (char)optopt);
> +			fprintf(stderr, "Invalid option %c!\n", (char)optopt);
>   			usage(argv[0]);
>   			exit(1);
> -			break;
>   		}
>   	}
>   
> -	pci_dev = intel_get_pci_device();
> -	devid = pci_dev->device_id;
> -	intel_mmio_use_pci_bar(pci_dev);
> -	init_instdone_definitions(devid);
> -
> -	/* Do we have a command to run? */
> -	if (cmd != NULL) {
> -		if (output) {
> -			fprintf(output, "# Profiling: %s\n", cmd);
> -			fflush(output);
> -		}
> -		child_pid = fork();
> -		if (child_pid < 0) {
> -			perror("fork");
> -			exit(1);
> -		}
> -		else if (child_pid == 0) {
> -			int res;
> -			res = system(cmd);
> -			if (res < 0)
> -				perror("running command");
> -			if (output) {
> -				fflush(output);
> -				fprintf(output, "# %s exited with status %d\n", cmd, res);
> -				fflush(output);
> -			}
> -			free(cmd);
> -			exit(0);
> -		} else {
> -			free(cmd);
> -		}
> +	engines = discover_engines();
> +	if (!engines) {
> +		fprintf(stderr,
> +			"Failed to detect engines! (%s)\n(Kernel 4.16 or newer is required for i915 PMU support.)\n",
> +			strerror(errno));
> +		return 1;
>   	}
>   
> -	for (i = 0; i < num_instdone_bits; i++) {
> -		top_bits[i].bit = &instdone_bits[i];
> -		top_bits[i].count = 0;
> -		top_bits_sorted[i] = &top_bits[i];
> +	ret = pmu_init(engines);
> +	if (ret) {
> +		fprintf(stderr,
> +			"Failed to initialize PMU! (%s)\n", strerror(errno));
> +		return 1;
>   	}
>   
> -	/* Grab access to the registers */
> -	intel_register_access_init(pci_dev, 0, -1);
> +	pmu_sample(engines);
>   
> -	ring_init(&render_ring);
> -	if (IS_GEN4(devid) || IS_GEN5(devid))
> -		ring_init(&bsd_ring);
> -	if (IS_GEN6(devid) || IS_GEN7(devid)) {
> -		ring_init(&bsd6_ring);
> -		ring_init(&blt_ring);
> -	}
> +	for (;;) {
> +		double t;
> +#define BUFSZ 16
> +		char freq[BUFSZ];
> +		char fact[BUFSZ];
> +		char irq[BUFSZ];
> +		char rc6[BUFSZ];
> +		char power[BUFSZ];
> +		char reads[BUFSZ];
> +		char writes[BUFSZ];
> +		struct winsize ws;
> +		int lines = 0;
>   
> -	/* Initialize GPU stats */
> -	if (HAS_STATS_REGS(devid)) {
> -		for (i = 0; i < STATS_COUNT; i++) {
> -			uint32_t stats_high, stats_low, stats_high_2;
> +		/* Update terminal size. */
> +		if (ioctl(0, TIOCGWINSZ, &ws) != -1) {
> +			con_w = ws.ws_col;
> +			con_h = ws.ws_row;
> +		}
>   
> -			do {
> -				stats_high = INREG(stats_regs[i] + 4);
> -				stats_low = INREG(stats_regs[i]);
> -				stats_high_2 = INREG(stats_regs[i] + 4);
> -			} while (stats_high != stats_high_2);
> +		pmu_sample(engines);
> +		t = (double)(engines->ts.cur - engines->ts.prev) / 1e9;
>   
> -			last_stats[i] = (uint64_t)stats_high << 32 |
> -				stats_low;
> -		}
> -	}
> +		printf("\033[H\033[J");
>   
> -	for (;;) {
> -		int j;
> -		unsigned long long t1, ti, tf, t2;
> -		unsigned long long def_sleep = 1000000 / samples_per_sec;
> -		unsigned long long last_samples_per_sec = samples_per_sec;
> -		unsigned short int max_lines;
> -		struct winsize ws;
> -		char clear_screen[] = {0x1b, '[', 'H',
> -				       0x1b, '[', 'J',
> -				       0x0};
> -		int percent;
> -		int len;
> -
> -		t1 = gettime();
> -
> -		ring_reset(&render_ring);
> -		ring_reset(&bsd_ring);
> -		ring_reset(&bsd6_ring);
> -		ring_reset(&blt_ring);
> -
> -		for (i = 0; i < samples_per_sec; i++) {
> -			long long interval;
> -			ti = gettime();
> -			if (IS_965(devid)) {
> -				instdone = INREG(INSTDONE_I965);
> -				instdone1 = INREG(INSTDONE_1);
> -			} else
> -				instdone = INREG(INSTDONE);
> -
> -			for (j = 0; j < num_instdone_bits; j++)
> -				update_idle_bit(&top_bits[j]);
> -
> -			ring_sample(&render_ring);
> -			ring_sample(&bsd_ring);
> -			ring_sample(&bsd6_ring);
> -			ring_sample(&blt_ring);
> -
> -			tf = gettime();
> -			if (tf - t1 >= 1000000) {
> -				/* We are out of sync, bail out */
> -				last_samples_per_sec = i+1;
> -				break;
> -			}
> -			interval = def_sleep - (tf - ti);
> -			if (interval > 0)
> -				usleep(interval);
> -		}
> +		pmu_calc(&engines->freq_req, freq, BUFSZ, 4, 0, 1.0, t, 1);
> +		pmu_calc(&engines->freq_act, fact, BUFSZ, 4, 0, 1.0, t, 1);
> +		pmu_calc(&engines->irq, irq, BUFSZ, 8, 0, 1.0, t, 1);
> +		pmu_calc(&engines->rc6, rc6, BUFSZ, 3, 0, 1e9, t, 100);
> +		pmu_calc(&engines->rapl, power, BUFSZ, 4, 2, 1.0, t,
> +			 engines->rapl_scale);
> +		pmu_calc(&engines->imc_reads, reads, BUFSZ, 6, 0, 1.0, t,
> +			 engines->imc_reads_scale);
> +		pmu_calc(&engines->imc_writes, writes, BUFSZ, 6, 0, 1.0, t,
> +			 engines->imc_writes_scale);
>   
> -		if (HAS_STATS_REGS(devid)) {
> -			for (i = 0; i < STATS_COUNT; i++) {
> -				uint32_t stats_high, stats_low, stats_high_2;
> +		if (lines++ < con_h)
> +			printf("intel-gpu-top - %s/%s MHz;  %s%% RC6; %s %s; %s irqs/s\n",
> +			       fact, freq, rc6, power, engines->rapl_unit, irq);
>   
> -				do {
> -					stats_high = INREG(stats_regs[i] + 4);
> -					stats_low = INREG(stats_regs[i]);
> -					stats_high_2 = INREG(stats_regs[i] + 4);
> -				} while (stats_high != stats_high_2);
> +		if (lines++ < con_h)
> +			printf("\n");
>   
> -				stats[i] = (uint64_t)stats_high << 32 |
> -					stats_low;
> -			}
> -		}
> +		if (engines->imc_fd) {
> +			if (lines++ < con_h)
> +				printf("      IMC reads:   %s %s/s\n",
> +				       reads, engines->imc_reads_unit);
> +
> +			if (lines++ < con_h)
> +				printf("     IMC writes:   %s %s/s\n",
> +				       writes, engines->imc_writes_unit);
>   
> -		qsort(top_bits_sorted, num_instdone_bits,
> -		      sizeof(struct top_bit *), top_bits_sort);
> -
> -		/* Limit the number of lines printed to the terminal height so the
> -		 * most important info (at the top) will stay on screen. */
> -		max_lines = -1;
> -		if (ioctl(0, TIOCGWINSZ, &ws) != -1)
> -			max_lines = ws.ws_row - 6; /* exclude header lines */
> -		if (max_lines >= num_instdone_bits)
> -			max_lines = num_instdone_bits;
> -
> -		t2 = gettime();
> -		elapsed_time += (t2 - t1) / 1000000.0;
> -
> -		if (interactive) {
> -			printf("%s", clear_screen);
> -			print_clock_info(pci_dev);
> -
> -			ring_print(&render_ring, last_samples_per_sec);
> -			ring_print(&bsd_ring, last_samples_per_sec);
> -			ring_print(&bsd6_ring, last_samples_per_sec);
> -			ring_print(&blt_ring, last_samples_per_sec);
> -
> -			printf("\n%30s  %s\n", "task", "percent busy");
> -			for (i = 0; i < max_lines; i++) {
> -				if (top_bits_sorted[i]->count > 0) {
> -					percent = (top_bits_sorted[i]->count * 100) /
> -						last_samples_per_sec;
> -					len = printf("%30s: %3d%%: ",
> -							 top_bits_sorted[i]->bit->name,
> -							 percent);
> -					print_percentage_bar (percent, len);
> -				} else {
> -					printf("%*s", PERCENTAGE_BAR_END, "");
> -				}
> -
> -				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
> -					printf("%13s: %llu (%lld/sec)",
> -						   stats_reg_names[i],
> -						   (long long)stats[i],
> -						   (long long)(stats[i] - last_stats[i]));
> -					last_stats[i] = stats[i];
> -				} else {
> -					if (!top_bits_sorted[i]->count)
> -						break;
> -				}
> +			if (++lines < con_h)
>   				printf("\n");
> -			}
>   		}
> -		if (output) {
> -			/* Print headers for columns at first run */
> -			if (print_headers) {
> -				fprintf(output, "# time\t");
> -				ring_print_header(output, &render_ring);
> -				ring_print_header(output, &bsd_ring);
> -				ring_print_header(output, &bsd6_ring);
> -				ring_print_header(output, &blt_ring);
> -				for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
> -					if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
> -						fprintf(output, "%.6s\t",
> -							   stats_reg_names[i]
> -							   );
> -					}
> -					if (!top_bits[i].count)
> -						continue;
> -				}
> -				fprintf(output, "\n");
> -				print_headers = 0;
> -			}
>   
> -			/* Print statistics */
> -			fprintf(output, "%.2f\t", elapsed_time);
> -			ring_log(&render_ring, last_samples_per_sec, output);
> -			ring_log(&bsd_ring, last_samples_per_sec, output);
> -			ring_log(&bsd6_ring, last_samples_per_sec, output);
> -			ring_log(&blt_ring, last_samples_per_sec, output);
> -
> -			for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
> -				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
> -					fprintf(output, "%"PRIu64"\t",
> -						   stats[i] - last_stats[i]);
> -					last_stats[i] = stats[i];
> -				}
> -					if (!top_bits[i].count)
> -						continue;
> -			}
> -			fprintf(output, "\n");
> -			fflush(output);
> -		}
> +		for (i = 0; i < engines->num_engines; i++) {
> +			struct engine *engine = engine_ptr(engines, i);
>   
> -		for (i = 0; i < num_instdone_bits; i++) {
> -			top_bits_sorted[i]->count = 0;
> +			if (engine->num_counters && lines < con_h) {
> +				const char *a = "          ENGINE      BUSY ";
> +				const char *b = " MI_SEMA MI_WAIT";
>   
> -			if (i < STATS_COUNT)
> -				last_stats[i] = stats[i];
> +				printf("\033[7m%s%*s%s\033[0m\n",
> +				       a,
> +				       (int)(con_w - 1 - strlen(a) - strlen(b)),
> +				       " ", b);
> +				lines++;
> +				break;
> +			}
>   		}
>   
> -		/* Check if child has gone */
> -		if (child_pid > 0) {
> -			int res;
> -			if ((res = waitpid(child_pid, &child_stat, WNOHANG)) == -1) {
> -				perror("waitpid");
> -				exit(1);
> -			}
> -			if (res == 0)
> +		for (i = 0; i < engines->num_engines && lines < con_h; i++) {
> +			struct engine *engine = engine_ptr(engines, i);
> +			unsigned int max_w = con_w - 1;
> +			unsigned int len;
> +			char sema[BUFSZ];
> +			char wait[BUFSZ];
> +			char busy[BUFSZ];
> +			char buf[128];
> +			double val;
> +
> +			if (!engine->num_counters)
>   				continue;
> -			if (WIFEXITED(child_stat))
> -				break;
> +
> +			pmu_calc(&engine->sema, sema, BUFSZ, 3, 0, 1e9, t, 100);
> +			pmu_calc(&engine->wait, wait, BUFSZ, 3, 0, 1e9, t, 100);
> +			len = snprintf(buf, sizeof(buf), "    %s%%    %s%%",
> +				       sema, wait);
> +
> +			pmu_calc(&engine->busy, busy, BUFSZ, 6, 2, 1e9, t,
> +				 100);
> +			len += printf("%16s %s%% ", engine->display_name, busy);
> +
> +			val = __pmu_calc(&engine->busy.val, 1e9, t, 100);
> +			print_percentage_bar(val, max_w - len);
> +
> +			printf("%s\n", buf);
> +
> +			lines++;
>   		}
> -	}
>   
> -	fclose(output);
> +		if (lines++ < con_h)
> +			printf("\n");
> +
> +		usleep(period_us);
> +	}
>   
> -	intel_register_access_fini();
>   	return 0;
>   }
> diff --git a/tools/meson.build b/tools/meson.build
> index bd2d313d5156..a918eeb0bef1 100644
> --- a/tools/meson.build
> +++ b/tools/meson.build
> @@ -23,7 +23,6 @@ tools_progs = [
>   	'intel_gpu_frequency',
>   	'intel_firmware_decode',
>   	'intel_gpu_time',
> -	'intel_gpu_top',
>   	'intel_gtt',
>   	'intel_guc_logger',
>   	'intel_infoframes',
> @@ -117,6 +116,11 @@ shared_library('intel_aubdump', 'aubdump.c',
>   	       name_prefix : '',
>   	       install : true)
>   
> +executable('intel_gpu_top', 'intel_gpu_top.c',
> +	   install : true,
> +	   install_rpath : rpathdir,
> +	   dependencies : tool_deps + [ lib_igt_perf ])
> +
>   conf_data = configuration_data()
>   conf_data.set('prefix', prefix)
>   conf_data.set('exec_prefix', '${prefix}')
> 
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [igt-dev] [PATCH i-g-t v6] intel-gpu-top: Rewrite the tool to be safe to use
@ 2018-04-09 12:26       ` Tvrtko Ursulin
  0 siblings, 0 replies; 57+ messages in thread
From: Tvrtko Ursulin @ 2018-04-09 12:26 UTC (permalink / raw)
  To: Tvrtko Ursulin, igt-dev
  Cc: Tvrtko Ursulin, Eero Tamminen, Ben Widawsky, Intel-gfx


[Adding some people to Cc for more ack/nack type feedback.]

The executive question is ack or nack on replacing intel_gpu_top with a new 
implementation which uses only perf PMU for counter gathering.

A short history on how this came to be:

There was a recent external patch contribution from Rinat Ibragimov to 
support more platforms from the existing intel_gpu_top. But as the tool 
is not safe to use, Chris Wilson suggested maybe just replacing it.

As it happens I had a good start to do this quickly and cheaply, in the 
form of one prototype I did recently, which only needed ripping some 
bits out, and polishing the rest.

Eero and Rinat kindly did a lot of platform coverage testing and the 
rewrite seems ready for next steps.

I need to stress that, as the commit message notes, the new tool has a slightly 
different scope, in that it doesn't expose GPU functional level data, but 
only overall stats like power, frequencies, RC6, interrupts, IMC memory 
bandwidth and per command streamer busyness, mi_semaphore and mi_event 
waits. My thinking was that for more functional level profiling gpu-top 
(OA) should be used.

Also the "run a command" and CSV output features are not supported 
since both can be done directly via perf stat.

Regards,

Tvrtko

On 04/04/2018 16:26, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> intel-gpu-top is a dangerous tool which can hang machines due unsafe mmio
> register access. This patch rewrites it to use only PMU.
> 
> Only overall command streamer busyness and GPU global data such as power
> and frequencies are included in this new version.
> 
> For access to more GPU functional unit level data, an OA metric based tool
> like gpu-top should be used instead.
> 
> v2:
>   * Sort engines by class and instance.
>   * Do not wait for one sampling period to display something on screen.
>   * Move code out of the asserts. (Rinat Ibragimov)
>   * Continuously adapt to terminal size. (Rinat Ibragimov)
> 
> v3:
>   * Change layout and precision of some field. (Chris Wilson)
>   Eero Tamminen:
>   * Use more user friendly engine names.
>   * Don't error out if a counter is missing.
>   * Add IMC read/write bandwidth.
>   * Report minimum required kernel version.
> 
> v4:
>   * Really support 4.16 by skipping of missing engines.
>   * Simpler and less hacky float printing.
>   * Preserve copyright header. (Antonio Argenziano)
>   * Simplify engines_ptr macro. (Rinat Ibragimov)
> 
> v5:
>   * Get RAPL unit from sysfs.
>   * Consolidate sysfs paths with a macro.
>   * Tidy error handling by carrying over and reporting errno.
>   * Check against console height on all prints.
>   * More readable minimum kernel version message. (Eero Tamminen)
>   * Column banner for per engine stats. (Eero Tamminen)
> 
> v6:
>   * Man page update. (Eero Tamminen)
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
> Cc: Petri Latvala <petri.latvala@intel.com>
> Cc: Eero Tamminen <eero.t.tamminen@intel.com>
> Cc: Rinat Ibragimov <ibragimovrinat@mail.ru>
> Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> # v1
> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> # v0.5
> ---
>   lib/igt_perf.c        |    6 +
>   lib/igt_perf.h        |    1 +
>   man/intel_gpu_top.rst |   41 +-
>   tools/Makefile.am     |    2 +
>   tools/intel_gpu_top.c | 1250 +++++++++++++++++++++++++++----------------------
>   tools/meson.build     |    6 +-
>   6 files changed, 719 insertions(+), 587 deletions(-)
> 
> diff --git a/lib/igt_perf.c b/lib/igt_perf.c
> index 99d82ea51c9b..e3dec2cc29c7 100644
> --- a/lib/igt_perf.c
> +++ b/lib/igt_perf.c
> @@ -69,3 +69,9 @@ int igt_perf_open(uint64_t type, uint64_t config)
>   	return _perf_open(type, config, -1,
>   			  PERF_FORMAT_TOTAL_TIME_ENABLED);
>   }
> +
> +int igt_perf_open_group(uint64_t type, uint64_t config, int group)
> +{
> +	return _perf_open(type, config, group,
> +			  PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_GROUP);
> +}
> diff --git a/lib/igt_perf.h b/lib/igt_perf.h
> index 614ea5d23fa6..e00718f4769a 100644
> --- a/lib/igt_perf.h
> +++ b/lib/igt_perf.h
> @@ -55,5 +55,6 @@ uint64_t i915_type_id(void);
>   int perf_i915_open(uint64_t config);
>   int perf_i915_open_group(uint64_t config, int group);
>   int igt_perf_open(uint64_t type, uint64_t config);
> +int igt_perf_open_group(uint64_t type, uint64_t config, int group);
>   
>   #endif /* I915_PERF_H */
> diff --git a/man/intel_gpu_top.rst b/man/intel_gpu_top.rst
> index a5f7175bb1a0..19c712307d28 100644
> --- a/man/intel_gpu_top.rst
> +++ b/man/intel_gpu_top.rst
> @@ -7,9 +7,9 @@ Display a top-like summary of Intel GPU usage
>   ---------------------------------------------
>   .. include:: defs.rst
>   :Author: IGT Developers <igt-dev@lists.freedesktop.org>
> -:Date: 2016-03-01
> +:Date: 2018-04-04
>   :Version: |PACKAGE_STRING|
> -:Copyright: 2009,2011,2012,2016 Intel Corporation
> +:Copyright: 2009,2011,2012,2016,2018 Intel Corporation
>   :Manual section: |MANUAL_SECTION|
>   :Manual group: |MANUAL_GROUP|
>   
> @@ -21,42 +21,25 @@ SYNOPSIS
>   DESCRIPTION
>   ===========
>   
> -**intel_gpu_top** is a tool to display usage information of an Intel GPU. It
> -requires root privilege to map the graphics device.
> +**intel_gpu_top** is a tool to display usage information on Intel GPU's.
> +
> +The tool gathers data using perf performance counters (PMU) exposed by i915 and other platform drivers like RAPL (power) and Uncore IMC (memory bandwidth).
>   
>   OPTIONS
>   =======
>   
> --s SAMPLES
> -    Number of samples to acquire per second.
> -
> --o FILE
> -    Collect usage statistics to FILE. If file is "-", run non-interactively
> -    and output statistics to stdout.
> -
> --e COMMAND
> -    Execute COMMAND to profile, and leave when it is finished. Note that the
> -    entire command with all parameters should be included as one parameter.
> +-s <ms>
> +    Refresh period in milliseconds.
>   
>   -h
> -    Show usage notes.
> +    Show help text.
>   
> -EXAMPLES
> -========
> -
> -intel_gpu_top -o "cairo-trace-gvim.log" -s 100 -e "cairo-perf-trace /tmp/gvim"
> -    Run cairo-perf-trace with /tmp/gvim trace, non-interactively, saving the
> -    statistics into cairo-trace-gvim.log file, and collecting 100 samples per
> -    second.
> -
> -Note that idle units are not displayed, so an entirely idle GPU will only
> -display the ring status and header.
> +LIMITATIONS
> +===========
>   
> -BUGS
> -====
> +* Not all metrics are supported on all platforms. Where a metric is unsupported it's value will be replaced by a dashed line.
>   
> -Some GPUs report some units as busy when they aren't, such that even when idle
> -and not hung, it will show up as 100% busy.
> +* Non-root access to perf counters is controlled by the *perf_event_paranoid* sysctl.
>   
>   REPORTING BUGS
>   ==============
> diff --git a/tools/Makefile.am b/tools/Makefile.am
> index 09b6dbcc3ece..a0b016ddd7ff 100644
> --- a/tools/Makefile.am
> +++ b/tools/Makefile.am
> @@ -28,6 +28,8 @@ intel_aubdump_la_LDFLAGS = -module -avoid-version -no-undefined
>   intel_aubdump_la_SOURCES = aubdump.c
>   intel_aubdump_la_LIBADD = $(top_builddir)/lib/libintel_tools.la -ldl
>   
> +intel_gpu_top_LDADD = $(top_builddir)/lib/libigt_perf.la
> +
>   bin_SCRIPTS = intel_aubdump
>   CLEANFILES = $(bin_SCRIPTS)
>   
> diff --git a/tools/intel_gpu_top.c b/tools/intel_gpu_top.c
> index 098e6ce3ff86..b923c3cfbe97 100644
> --- a/tools/intel_gpu_top.c
> +++ b/tools/intel_gpu_top.c
> @@ -1,6 +1,5 @@
>   /*
> - * Copyright © 2007 Intel Corporation
> - * Copyright © 2011 Intel Corporation
> + * Copyright © 2007-2018 Intel Corporation
>    *
>    * Permission is hereby granted, free of charge, to any person obtaining a
>    * copy of this software and associated documentation files (the "Software"),
> @@ -24,695 +23,832 @@
>    * Authors:
>    *    Eric Anholt <eric@anholt.net>
>    *    Eugeni Dodonov <eugeni.dodonov@intel.com>
> - *
>    */
>   
> -#include "config.h"
> -
> -#include <inttypes.h>
> -#include <unistd.h>
> -#include <stdlib.h>
>   #include <stdio.h>
> -#include <err.h>
> -#include <sys/ioctl.h>
> -#include <sys/time.h>
> -#include <sys/wait.h>
> +#include <sys/types.h>
> +#include <dirent.h>
> +#include <stdint.h>
> +#include <assert.h>
>   #include <string.h>
> -#ifdef HAVE_TERMIOS_H
> -#include <termios.h>
> -#endif
> -#include "intel_io.h"
> -#include "instdone.h"
> -#include "intel_reg.h"
> -#include "intel_chipset.h"
> -#include "drmtest.h"
> -
> -#define  FORCEWAKE	    0xA18C
> -#define  FORCEWAKE_ACK	    0x130090
> -
> -#define SAMPLES_PER_SEC             10000
> -#define SAMPLES_TO_PERCENT_RATIO    (SAMPLES_PER_SEC / 100)
> -
> -#define MAX_NUM_TOP_BITS            100
> -
> -#define HAS_STATS_REGS(devid)		IS_965(devid)
> -
> -struct top_bit {
> -	struct instdone_bit *bit;
> -	int count;
> -} top_bits[MAX_NUM_TOP_BITS];
> -struct top_bit *top_bits_sorted[MAX_NUM_TOP_BITS];
> -
> -static uint32_t instdone, instdone1;
> -
> -static const char *bars[] = {
> -	" ",
> -	"▏",
> -	"▎",
> -	"▍",
> -	"▌",
> -	"▋",
> -	"▊",
> -	"▉",
> -	"█"
> -};
> +#include <ctype.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <sys/stat.h>
> +#include <fcntl.h>
> +#include <inttypes.h>
> +#include <sys/ioctl.h>
> +#include <errno.h>
> +#include <math.h>
> +#include <locale.h>
> +
> +#include "igt_perf.h"
>   
> -enum stats_counts {
> -	IA_VERTICES,
> -	IA_PRIMITIVES,
> -	VS_INVOCATION,
> -	GS_INVOCATION,
> -	GS_PRIMITIVES,
> -	CL_INVOCATION,
> -	CL_PRIMITIVES,
> -	PS_INVOCATION,
> -	PS_DEPTH,
> -	STATS_COUNT
> +struct pmu_pair {
> +	uint64_t cur;
> +	uint64_t prev;
>   };
>   
> -const uint32_t stats_regs[STATS_COUNT] = {
> -	IA_VERTICES_COUNT_QW,
> -	IA_PRIMITIVES_COUNT_QW,
> -	VS_INVOCATION_COUNT_QW,
> -	GS_INVOCATION_COUNT_QW,
> -	GS_PRIMITIVES_COUNT_QW,
> -	CL_INVOCATION_COUNT_QW,
> -	CL_PRIMITIVES_COUNT_QW,
> -	PS_INVOCATION_COUNT_QW,
> -	PS_DEPTH_COUNT_QW,
> +struct pmu_counter {
> +	bool present;
> +	uint64_t config;
> +	unsigned int idx;
> +	struct pmu_pair val;
>   };
>   
> -const char *stats_reg_names[STATS_COUNT] = {
> -	"vert fetch",
> -	"prim fetch",
> -	"VS invocations",
> -	"GS invocations",
> -	"GS prims",
> -	"CL invocations",
> -	"CL prims",
> -	"PS invocations",
> -	"PS depth pass",
> +struct engine {
> +	const char *name;
> +	const char *display_name;
> +
> +	unsigned int class;
> +	unsigned int instance;
> +
> +	unsigned int num_counters;
> +
> +	struct pmu_counter busy;
> +	struct pmu_counter wait;
> +	struct pmu_counter sema;
>   };
>   
> -uint64_t stats[STATS_COUNT];
> -uint64_t last_stats[STATS_COUNT];
> +struct engines {
> +	unsigned int num_engines;
> +	unsigned int num_counters;
> +	DIR *root;
> +	int fd;
> +	struct pmu_pair ts;
> +
> +	int rapl_fd;
> +	double rapl_scale;
> +	const char *rapl_unit;
> +
> +	int imc_fd;
> +	double imc_reads_scale;
> +	const char *imc_reads_unit;
> +	double imc_writes_scale;
> +	const char *imc_writes_unit;
> +
> +	struct pmu_counter freq_req;
> +	struct pmu_counter freq_act;
> +	struct pmu_counter irq;
> +	struct pmu_counter rc6;
> +	struct pmu_counter rapl;
> +	struct pmu_counter imc_reads;
> +	struct pmu_counter imc_writes;
> +
> +	struct engine engine;
> +};
>   
> -static unsigned long
> -gettime(void)
> +static uint64_t
> +get_pmu_config(int dirfd, const char *name, const char *counter)
>   {
> -    struct timeval t;
> -    gettimeofday(&t, NULL);
> -    return (t.tv_usec + (t.tv_sec * 1000000));
> -}
> +	char buf[128], *p;
> +	int fd, ret;
>   
> -static int
> -top_bits_sort(const void *a, const void *b)
> -{
> -	struct top_bit * const *bit_a = a;
> -	struct top_bit * const *bit_b = b;
> -	int a_count = (*bit_a)->count;
> -	int b_count = (*bit_b)->count;
> +	ret = snprintf(buf, sizeof(buf), "%s-%s", name, counter);
> +	if (ret < 0 || ret == sizeof(buf))
> +		return -1;
>   
> -	if (a_count < b_count)
> -		return 1;
> -	else if (a_count == b_count)
> -		return 0;
> -	else
> +	fd = openat(dirfd, buf, O_RDONLY);
> +	if (fd < 0)
>   		return -1;
> -}
>   
> -static void
> -update_idle_bit(struct top_bit *top_bit)
> -{
> -	uint32_t reg_val;
> +	ret = read(fd, buf, sizeof(buf));
> +	close(fd);
> +	if (ret <= 0)
> +		return -1;
>   
> -	if (top_bit->bit->reg == INSTDONE_1)
> -		reg_val = instdone1;
> -	else
> -		reg_val = instdone;
> +	p = index(buf, '0');
> +	if (!p)
> +		return -1;
>   
> -	if ((reg_val & top_bit->bit->bit) == 0)
> -		top_bit->count++;
> +	return strtoul(p, NULL, 0);
>   }
>   
> -static void
> -print_clock(const char *name, int clock) {
> -	if (clock == -1)
> -		printf("%s clock: unknown", name);
> +#define engine_ptr(engines, n) (&engines->engine + (n))
> +
> +static const char *class_display_name(unsigned int class)
> +{
> +	switch (class) {
> +	case I915_ENGINE_CLASS_RENDER:
> +		return "Render/3D";
> +	case I915_ENGINE_CLASS_COPY:
> +		return "Blitter";
> +	case I915_ENGINE_CLASS_VIDEO:
> +		return "Video";
> +	case I915_ENGINE_CLASS_VIDEO_ENHANCE:
> +		return "VideoEnhance";
> +	default:
> +		return "[unknown]";
> +	}
> +}
> +
> +static int engine_cmp(const void *__a, const void *__b)
> +{
> +	const struct engine *a = (struct engine *)__a;
> +	const struct engine *b = (struct engine *)__b;
> +
> +	if (a->class != b->class)
> +		return a->class - b->class;
>   	else
> -		printf("%s clock: %d Mhz", name, clock);
> +		return a->instance - b->instance;
>   }
>   
> -static int
> -print_clock_info(struct pci_device *pci_dev)
> +static struct engines *discover_engines(void)
>   {
> -	uint32_t devid = pci_dev->device_id;
> -	uint16_t gcfgc;
> +	const char *sysfs_root = "/sys/devices/i915/events";
> +	struct engines *engines;
> +	struct dirent *dent;
> +	int ret = 0;
> +	DIR *d;
>   
> -	if (IS_GM45(devid)) {
> -		int core_clock = -1;
> +	engines = malloc(sizeof(struct engines));
> +	if (!engines)
> +		return NULL;
>   
> -		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +	memset(engines, 0, sizeof(*engines));
>   
> -		switch (gcfgc & 0xf) {
> -		case 8:
> -			core_clock = 266;
> -			break;
> -		case 9:
> -			core_clock = 320;
> -			break;
> -		case 11:
> -			core_clock = 400;
> -			break;
> -		case 13:
> -			core_clock = 533;
> -			break;
> -		}
> -		print_clock("core", core_clock);
> -	} else if (IS_965(devid) && IS_MOBILE(devid)) {
> -		int render_clock = -1, sampler_clock = -1;
> +	engines->num_engines = 0;
>   
> -		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +	d = opendir(sysfs_root);
> +	if (!d)
> +		return NULL;
>   
> -		switch (gcfgc & 0xf) {
> -		case 2:
> -			render_clock = 250; sampler_clock = 267;
> -			break;
> -		case 3:
> -			render_clock = 320; sampler_clock = 333;
> -			break;
> -		case 4:
> -			render_clock = 400; sampler_clock = 444;
> -			break;
> -		case 5:
> -			render_clock = 500; sampler_clock = 533;
> +	while ((dent = readdir(d)) != NULL) {
> +		const char *endswith = "-busy";
> +		const unsigned int endlen = strlen(endswith);
> +		struct engine *engine =
> +				engine_ptr(engines, engines->num_engines);
> +		char buf[256];
> +
> +		if (dent->d_type != DT_REG)
> +			continue;
> +
> +		if (strlen(dent->d_name) >= sizeof(buf)) {
> +			ret = ENAMETOOLONG;
>   			break;
>   		}
>   
> -		print_clock("render", render_clock);
> -		printf("  ");
> -		print_clock("sampler", sampler_clock);
> -	} else if (IS_945(devid) && IS_MOBILE(devid)) {
> -		int render_clock = -1, display_clock = -1;
> +		strcpy(buf, dent->d_name);
>   
> -		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +		/* xxxN-busy */
> +		if (strlen(buf) < (endlen + 4))
> +			continue;
> +		if (strcmp(&buf[strlen(buf) - endlen], endswith))
> +			continue;
>   
> -		switch (gcfgc & 0x7) {
> -		case 0:
> -			render_clock = 166;
> -			break;
> -		case 1:
> -			render_clock = 200;
> -			break;
> -		case 3:
> -			render_clock = 250;
> -			break;
> -		case 5:
> -			render_clock = 400;
> +		memset(engine, 0, sizeof(*engine));
> +
> +		buf[strlen(buf) - endlen] = 0;
> +		engine->name = strdup(buf);
> +		if (!engine->name) {
> +			ret = errno;
>   			break;
>   		}
>   
> -		switch (gcfgc & 0x70) {
> -		case 0:
> -			display_clock = 200;
> -			break;
> -		case 4:
> -			display_clock = 320;
> +		engine->busy.config = get_pmu_config(dirfd(d), engine->name,
> +						     "busy");
> +		if (engine->busy.config == -1) {
> +			ret = ENOENT;
>   			break;
>   		}
> -		if (gcfgc & (1 << 7))
> -		    display_clock = 133;
>   
> -		print_clock("render", render_clock);
> -		printf("  ");
> -		print_clock("display", display_clock);
> -	} else if (IS_915(devid) && IS_MOBILE(devid)) {
> -		int render_clock = -1, display_clock = -1;
> +		engine->class = (engine->busy.config &
> +				 (__I915_PMU_OTHER(0) - 1)) >>
> +				I915_PMU_CLASS_SHIFT;
>   
> -		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +		engine->instance = (engine->busy.config >>
> +				    I915_PMU_SAMPLE_BITS) &
> +				    ((1 << I915_PMU_SAMPLE_INSTANCE_BITS) - 1);
>   
> -		switch (gcfgc & 0x7) {
> -		case 0:
> -			render_clock = 160;
> -			break;
> -		case 1:
> -			render_clock = 190;
> -			break;
> -		case 4:
> -			render_clock = 333;
> +		ret = snprintf(buf, sizeof(buf), "%s/%u",
> +			       class_display_name(engine->class),
> +			       engine->instance);
> +		if (ret < 0 || ret == sizeof(buf)) {
> +			ret = ENOBUFS;
>   			break;
>   		}
> -		if (gcfgc & (1 << 13))
> -		    render_clock = 133;
> +		ret = 0;
>   
> -		switch (gcfgc & 0x70) {
> -		case 0:
> -			display_clock = 190;
> +		engine->display_name = strdup(buf);
> +		if (!engine->display_name) {
> +			ret = errno;
>   			break;
> -		case 4:
> -			display_clock = 333;
> +		}
> +
> +		engines->num_engines++;
> +		engines = realloc(engines, sizeof(struct engines) +
> +				  engines->num_engines * sizeof(struct engine));
> +		if (!engines) {
> +			ret = errno;
>   			break;
>   		}
> -		if (gcfgc & (1 << 7))
> -		    display_clock = 133;
> +	}
> +
> +	if (ret) {
> +		free(engines);
> +		errno = ret;
>   
> -		print_clock("render", render_clock);
> -		printf("  ");
> -		print_clock("display", display_clock);
> +		return NULL;
>   	}
>   
> +	qsort(engine_ptr(engines, 0), engines->num_engines,
> +	      sizeof(struct engine), engine_cmp);
> +
> +	engines->root = d;
>   
> -	printf("\n");
> -	return -1;
> +	return engines;
>   }
>   
> -#define STATS_LEN (20)
> -#define PERCENTAGE_BAR_END	(79 - STATS_LEN)
> +static int
> +filename_to_buf(const char *filename, char *buf, unsigned int bufsize)
> +{
> +	int fd, err;
> +	ssize_t ret;
>   
> -static void
> -print_percentage_bar(float percent, int cur_line_len)
> +	fd = open(filename, O_RDONLY);
> +	if (fd < 0)
> +		return -1;
> +
> +	ret = read(fd, buf, bufsize - 1);
> +	err = errno;
> +	close(fd);
> +	if (ret < 1) {
> +		errno = ret < 0 ? err : ENOMSG;
> +
> +		return -1;
> +	}
> +
> +	if (ret > 1 && buf[ret - 1] == '\n')
> +		buf[ret - 1] = '\0';
> +	else
> +		buf[ret] = '\0';
> +
> +	return 0;
> +}
> +
> +static uint64_t filename_to_u64(const char *filename, int base)
>   {
> -	int bar_avail_len = (PERCENTAGE_BAR_END - cur_line_len - 1) * 8;
> -	int bar_len = bar_avail_len * (percent + .5) / 100.0;
> -	int i;
> +	char buf[64], *b;
>   
> -	for (i = bar_len; i >= 8; i -= 8) {
> -		printf("%s", bars[8]);
> -		cur_line_len++;
> +	if (filename_to_buf(filename, buf, sizeof(buf)))
> +		return 0;
> +
> +	/*
> +	 * Handle both single integer and key=value formats by skipping
> +	 * leading non-digits.
> +	 */
> +	b = buf;
> +	while (*b && !isdigit(*b))
> +		b++;
> +
> +	return strtoull(b, NULL, base);
> +}
> +
> +static double filename_to_double(const char *filename)
> +{
> +	char *oldlocale;
> +	char buf[80];
> +	double v;
> +
> +	if (filename_to_buf(filename, buf, sizeof(buf)))
> +		return 0;
> +
> +	oldlocale = setlocale(LC_ALL, "C");
> +	v = strtod(buf, NULL);
> +	setlocale(LC_ALL, oldlocale);
> +
> +	return v;
> +}
> +
> +#define RAPL_ROOT "/sys/devices/power/"
> +#define RAPL_EVENT "/sys/devices/power/events/"
> +
> +static uint64_t rapl_type_id(void)
> +{
> +	return filename_to_u64(RAPL_ROOT "type", 10);
> +}
> +
> +static uint64_t rapl_gpu_power(void)
> +{
> +	return filename_to_u64(RAPL_EVENT "energy-gpu", 0);
> +}
> +
> +static double rapl_gpu_power_scale(void)
> +{
> +	return filename_to_double(RAPL_EVENT "energy-gpu.scale");
> +}
> +
> +static const char *rapl_gpu_power_unit(void)
> +{
> +	char buf[32];
> +
> +	if (filename_to_buf(RAPL_EVENT "energy-gpu.unit",
> +			    buf, sizeof(buf)) == 0)
> +		if (!strcmp(buf, "Joules"))
> +			return strdup("Watts");
> +		else
> +			return strdup(buf);
> +	else
> +		return NULL;
> +}
> +
> +#define IMC_ROOT "/sys/devices/uncore_imc/"
> +#define IMC_EVENT "/sys/devices/uncore_imc/events/"
> +
> +static uint64_t imc_type_id(void)
> +{
> +	return filename_to_u64(IMC_ROOT "type", 10);
> +}
> +
> +static uint64_t imc_data_reads(void)
> +{
> +	return filename_to_u64(IMC_EVENT "data_reads", 0);
> +}
> +
> +static double imc_data_reads_scale(void)
> +{
> +	return filename_to_double(IMC_EVENT "data_reads.scale");
> +}
> +
> +static const char *imc_data_reads_unit(void)
> +{
> +	char buf[32];
> +
> +	if (filename_to_buf(IMC_EVENT "data_reads.unit", buf, sizeof(buf)) == 0)
> +		return strdup(buf);
> +	else
> +		return NULL;
> +}
> +
> +static uint64_t imc_data_writes(void)
> +{
> +	return filename_to_u64(IMC_EVENT "data_writes", 0);
> +}
> +
> +static double imc_data_writes_scale(void)
> +{
> +	return filename_to_double(IMC_EVENT "data_writes.scale");
> +}
> +
> +static const char *imc_data_writes_unit(void)
> +{
> +	char buf[32];
> +
> +	if (filename_to_buf(IMC_EVENT "data_writes.unit",
> +			    buf, sizeof(buf)) == 0)
> +		return strdup(buf);
> +	else
> +		return NULL;
> +}
> +
> +#define _open_pmu(cnt, pmu, fd) \
> +({ \
> +	int fd__; \
> +\
> +	fd__ = perf_i915_open_group((pmu)->config, (fd)); \
> +	if (fd__ >= 0) { \
> +		if ((fd) == -1) \
> +			(fd) = fd__; \
> +		(pmu)->present = true; \
> +		(pmu)->idx = (cnt)++; \
> +	} \
> +\
> +	fd__; \
> +})
> +
> +#define _open_imc(cnt, pmu, fd) \
> +({ \
> +	int fd__; \
> +\
> +	fd__ = igt_perf_open_group(imc_type_id(), (pmu)->config, (fd)); \
> +	if (fd__ >= 0) { \
> +		if ((fd) == -1) \
> +			(fd) = fd__; \
> +		(pmu)->present = true; \
> +		(pmu)->idx = (cnt)++; \
> +	} \
> +\
> +	fd__; \
> +})
> +
> +static int pmu_init(struct engines *engines)
> +{
> +	unsigned int i;
> +	int fd;
> +
> +	engines->fd = -1;
> +	engines->num_counters = 0;
> +
> +	engines->irq.config = I915_PMU_INTERRUPTS;
> +	fd = _open_pmu(engines->num_counters, &engines->irq, engines->fd);
> +	if (fd < 0)
> +		return -1;
> +
> +	engines->freq_req.config = I915_PMU_REQUESTED_FREQUENCY;
> +	_open_pmu(engines->num_counters, &engines->freq_req, engines->fd);
> +
> +	engines->freq_act.config = I915_PMU_ACTUAL_FREQUENCY;
> +	_open_pmu(engines->num_counters, &engines->freq_act, engines->fd);
> +
> +	engines->rc6.config = I915_PMU_RC6_RESIDENCY;
> +	_open_pmu(engines->num_counters, &engines->rc6, engines->fd);
> +
> +	for (i = 0; i < engines->num_engines; i++) {
> +		struct engine *engine = engine_ptr(engines, i);
> +		struct {
> +			struct pmu_counter *pmu;
> +			const char *counter;
> +		} *cnt, counters[] = {
> +			{ .pmu = &engine->busy, .counter = "busy" },
> +			{ .pmu = &engine->wait, .counter = "wait" },
> +			{ .pmu = &engine->sema, .counter = "sema" },
> +			{ .pmu = NULL, .counter = NULL },
> +		};
> +
> +		for (cnt = counters; cnt->pmu; cnt++) {
> +			if (!cnt->pmu->config)
> +				cnt->pmu->config =
> +					get_pmu_config(dirfd(engines->root),
> +						       engine->name,
> +						       cnt->counter);
> +			fd = _open_pmu(engines->num_counters, cnt->pmu,
> +				       engines->fd);
> +			if (fd >= 0)
> +				engine->num_counters++;
> +		}
>   	}
> -	if (i) {
> -		printf("%s", bars[i]);
> -		cur_line_len++;
> +
> +	engines->rapl_fd = -1;
> +	if (rapl_type_id()) {
> +		engines->rapl_scale = rapl_gpu_power_scale();
> +		engines->rapl_unit = rapl_gpu_power_unit();
> +		if (!engines->rapl_unit)
> +			return -1;
> +
> +		engines->rapl.config = rapl_gpu_power();
> +		if (!engines->rapl.config)
> +			return -1;
> +
> +		engines->rapl_fd = igt_perf_open(rapl_type_id(),
> +						 engines->rapl.config);
> +		if (engines->rapl_fd < 0)
> +			return -1;
> +
> +		engines->rapl.present = true;
>   	}
>   
> -	/* NB: We can't use a field width with utf8 so we manually
> -	* guarantee a field with of 45 chars for any bar. */
> -	printf("%*s", PERCENTAGE_BAR_END - cur_line_len, "");
> -}
> +	engines->imc_fd = -1;
> +	if (imc_type_id()) {
> +		unsigned int num = 0;
>   
> -struct ring {
> -	const char *name;
> -	uint32_t mmio;
> -	int head, tail, size;
> -	uint64_t full;
> -	int idle;
> -};
> +		engines->imc_reads_scale = imc_data_reads_scale();
> +		engines->imc_writes_scale = imc_data_writes_scale();
> +
> +		engines->imc_reads_unit = imc_data_reads_unit();
> +		if (!engines->imc_reads_unit)
> +			return -1;
> +
> +		engines->imc_writes_unit = imc_data_writes_unit();
> +		if (!engines->imc_writes_unit)
> +			return -1;
> +
> +		engines->imc_reads.config = imc_data_reads();
> +		if (!engines->imc_reads.config)
> +			return -1;
> +
> +		engines->imc_writes.config = imc_data_writes();
> +		if (!engines->imc_writes.config)
> +			return -1;
> +
> +		fd = _open_imc(num, &engines->imc_reads, engines->imc_fd);
> +		if (fd < 0)
> +			return -1;
> +		fd = _open_imc(num, &engines->imc_writes, engines->imc_fd);
> +		if (fd < 0)
> +			return -1;
> +
> +		engines->imc_reads.present = true;
> +		engines->imc_writes.present = true;
> +	}
> +
> +	return 0;
> +}
>   
> -static uint32_t ring_read(struct ring *ring, uint32_t reg)
> +static uint64_t pmu_read_multi(int fd, unsigned int num, uint64_t *val)
>   {
> -	return INREG(ring->mmio + reg);
> +	uint64_t buf[2 + num];
> +	unsigned int i;
> +	ssize_t len;
> +
> +	memset(buf, 0, sizeof(buf));
> +
> +	len = read(fd, buf, sizeof(buf));
> +	assert(len == sizeof(buf));
> +
> +	for (i = 0; i < num; i++)
> +		val[i] = buf[2 + i];
> +
> +	return buf[1];
>   }
>   
> -static void ring_init(struct ring *ring)
> +static double __pmu_calc(struct pmu_pair *p, double d, double t, double s)
>   {
> -	ring->size = (((ring_read(ring, RING_LEN) & RING_NR_PAGES) >> 12) + 1) * 4096;
> +	double v;
> +
> +	v = p->cur - p->prev;
> +	v /= d;
> +	v /= t;
> +	v *= s;
> +
> +	if (s == 100.0 && v > 100.0)
> +		v = 100.0;
> +
> +	return v;
>   }
>   
> -static void ring_reset(struct ring *ring)
> +static void fill_str(char *buf, unsigned int bufsz, char c, unsigned int num)
>   {
> -	ring->idle = ring->full = 0;
> +	unsigned int i;
> +
> +	for (i = 0; i < num && i < (bufsz - 1); i++)
> +		*buf++ = c;
> +
> +	*buf = 0;
>   }
>   
> -static void ring_sample(struct ring *ring)
> +static void pmu_calc(struct pmu_counter *cnt,
> +		     char *buf, unsigned int bufsz,
> +		     unsigned int width, unsigned width_dec,
> +		     double d, double t, double s)
>   {
> -	int full;
> +	double val;
> +	int len;
> +
> +	assert(bufsz >= (width + width_dec + 1));
> +
> +	if (!cnt->present) {
> +		fill_str(buf, bufsz, '-', width + width_dec);
> +		return;
> +	}
>   
> -	if (!ring->size)
> +	val = __pmu_calc(&cnt->val, d, t, s);
> +
> +	len = snprintf(buf, bufsz, "%*.*f", width + width_dec, width_dec, val);
> +	if (len < 0 || len == bufsz) {
> +		fill_str(buf, bufsz, 'X', width + width_dec);
>   		return;
> +	}
> +}
> +
> +static uint64_t __pmu_read_single(int fd, uint64_t *ts)
> +{
> +	uint64_t data[2] = { };
> +	ssize_t len;
>   
> -	ring->head = ring_read(ring, RING_HEAD) & HEAD_ADDR;
> -	ring->tail = ring_read(ring, RING_TAIL) & TAIL_ADDR;
> +	len = read(fd, data, sizeof(data));
> +	assert(len == sizeof(data));
>   
> -	if (ring->tail == ring->head)
> -		ring->idle++;
> +	if (ts)
> +		*ts = data[1];
>   
> -	full = ring->tail - ring->head;
> -	if (full < 0)
> -		full += ring->size;
> -	ring->full += full;
> +	return data[0];
>   }
>   
> -static void ring_print_header(FILE *out, struct ring *ring)
> +static uint64_t pmu_read_single(int fd)
>   {
> -    fprintf(out, "%.6s%%\tops\t",
> -            ring->name
> -          );
> +	return __pmu_read_single(fd, NULL);
>   }
>   
> -static void ring_print(struct ring *ring, unsigned long samples_per_sec)
> +static void __update_sample(struct pmu_counter *counter, uint64_t val)
>   {
> -	int percent_busy, len;
> +	counter->val.prev = counter->val.cur;
> +	counter->val.cur = val;
> +}
>   
> -	if (!ring->size)
> -		return;
> +static void update_sample(struct pmu_counter *counter, uint64_t *val)
> +{
> +	if (counter->present)
> +		__update_sample(counter, val[counter->idx]);
> +}
> +
> +static void pmu_sample(struct engines *engines)
> +{
> +	const int num_val = engines->num_counters;
> +	uint64_t val[2 + num_val];
> +	unsigned int i;
> +
> +	engines->ts.prev = engines->ts.cur;
> +
> +	if (engines->rapl_fd >= 0)
> +		__update_sample(&engines->rapl,
> +				pmu_read_single(engines->rapl_fd));
> +
> +	if (engines->imc_fd >= 0) {
> +		pmu_read_multi(engines->imc_fd, 2, val);
> +		update_sample(&engines->imc_reads, val);
> +		update_sample(&engines->imc_writes, val);
> +	}
>   
> -	percent_busy = 100 - 100 * ring->idle / samples_per_sec;
> +	engines->ts.cur = pmu_read_multi(engines->fd, num_val, val);
>   
> -	len = printf("%25s busy: %3d%%: ", ring->name, percent_busy);
> -	print_percentage_bar (percent_busy, len);
> -	printf("%24s space: %d/%d\n",
> -		   ring->name,
> -		   (int)(ring->full / samples_per_sec),
> -		   ring->size);
> +	update_sample(&engines->freq_req, val);
> +	update_sample(&engines->freq_act, val);
> +	update_sample(&engines->irq, val);
> +	update_sample(&engines->rc6, val);
> +
> +	for (i = 0; i < engines->num_engines; i++) {
> +		struct engine *engine = engine_ptr(engines, i);
> +
> +		update_sample(&engine->busy, val);
> +		update_sample(&engine->sema, val);
> +		update_sample(&engine->wait, val);
> +	}
>   }
>   
> -static void ring_log(struct ring *ring, unsigned long samples_per_sec,
> -		FILE *output)
> +static const char *bars[] = { " ", "▏", "▎", "▍", "▌", "▋", "▊", "▉", "█" };
> +
> +static void
> +print_percentage_bar(double percent, int max_len)
>   {
> -	if (ring->size)
> -		fprintf(output, "%3d\t%d\t",
> -			(int)(100 - 100 * ring->idle / samples_per_sec),
> -			(int)(ring->full / samples_per_sec));
> -	else
> -		fprintf(output, "-1\t-1\t");
> +	int bar_len = percent * (8 * (max_len - 2)) / 100.0;
> +	int i;
> +
> +	putchar('|');
> +
> +	for (i = bar_len; i >= 8; i -= 8)
> +		printf("%s", bars[8]);
> +	if (i)
> +		printf("%s", bars[i]);
> +
> +	for (i = 0; i < (max_len - 2 - (bar_len + 7) / 8); i++)
> +		putchar(' ');
> +
> +	putchar('|');
>   }
>   
> +#define DEFAULT_PERIOD_MS (1000)
> +
>   static void
>   usage(const char *appname)
>   {
>   	printf("intel_gpu_top - Display a top-like summary of Intel GPU usage\n"
> -			"\n"
> -			"usage: %s [parameters]\n"
> -			"\n"
> -			"The following parameters apply:\n"
> -			"[-s <samples>]       samples per seconds (default %d)\n"
> -			"[-e <command>]       command to profile\n"
> -			"[-o <file>]          output statistics to file. If file is '-',"
> -			"                     run in batch mode and output statistics to stdio only \n"
> -			"[-h]                 show this help screen\n"
> -			"\n",
> -			appname,
> -			SAMPLES_PER_SEC
> -		  );
> -	return;
> +		"\n"
> +		"Usage: %s [parameters]\n"
> +		"\n"
> +		"\tThe following parameters are optional:\n\n"
> +		"\t[-s <ms>]       Refresh period in milliseconds (default %ums).\n"
> +		"\t[-h]            Show this help text.\n"
> +		"\n",
> +		appname, DEFAULT_PERIOD_MS);
>   }
>   
>   int main(int argc, char **argv)
>   {
> -	uint32_t devid;
> -	struct pci_device *pci_dev;
> -	struct ring render_ring = {
> -		.name = "render",
> -		.mmio = 0x2030,
> -	}, bsd_ring = {
> -		.name = "bitstream",
> -		.mmio = 0x4030,
> -	}, bsd6_ring = {
> -		.name = "bitstream",
> -		.mmio = 0x12030,
> -	}, blt_ring = {
> -		.name = "blitter",
> -		.mmio = 0x22030,
> -	};
> -	int i, ch;
> -	int samples_per_sec = SAMPLES_PER_SEC;
> -	FILE *output = NULL;
> -	double elapsed_time=0;
> -	int print_headers=1;
> -	pid_t child_pid=-1;
> -	int child_stat;
> -	char *cmd=NULL;
> -	int interactive=1;
> -
> -	/* Parse options? */
> -	while ((ch = getopt(argc, argv, "s:o:e:h")) != -1) {
> +	unsigned int period_us = DEFAULT_PERIOD_MS * 1000;
> +	int con_w = -1, con_h = -1;
> +	struct engines *engines;
> +	unsigned int i;
> +	int ret, ch;
> +
> +	/* Parse options */
> +	while ((ch = getopt(argc, argv, "s:h")) != -1) {
>   		switch (ch) {
> -		case 'e': cmd = strdup(optarg);
> -			break;
> -		case 's': samples_per_sec = atoi(optarg);
> -			if (samples_per_sec < 100) {
> -				fprintf(stderr, "Error: samples per second must be >= 100\n");
> -				exit(1);
> -			}
> -			break;
> -		case 'o':
> -			if (!strcmp(optarg, "-")) {
> -				/* Running in non-interactive mode */
> -				interactive = 0;
> -				output = stdout;
> -			}
> -			else
> -				output = fopen(optarg, "w");
> -			if (!output)
> -			{
> -				perror("fopen");
> -				exit(1);
> -			}
> +		case 's':
> +			period_us = atoi(optarg) * 1000;
>   			break;
>   		case 'h':
>   			usage(argv[0]);
>   			exit(0);
> -			break;
>   		default:
> -			fprintf(stderr, "Invalid flag %c!\n", (char)optopt);
> +			fprintf(stderr, "Invalid option %c!\n", (char)optopt);
>   			usage(argv[0]);
>   			exit(1);
> -			break;
>   		}
>   	}
>   
> -	pci_dev = intel_get_pci_device();
> -	devid = pci_dev->device_id;
> -	intel_mmio_use_pci_bar(pci_dev);
> -	init_instdone_definitions(devid);
> -
> -	/* Do we have a command to run? */
> -	if (cmd != NULL) {
> -		if (output) {
> -			fprintf(output, "# Profiling: %s\n", cmd);
> -			fflush(output);
> -		}
> -		child_pid = fork();
> -		if (child_pid < 0) {
> -			perror("fork");
> -			exit(1);
> -		}
> -		else if (child_pid == 0) {
> -			int res;
> -			res = system(cmd);
> -			if (res < 0)
> -				perror("running command");
> -			if (output) {
> -				fflush(output);
> -				fprintf(output, "# %s exited with status %d\n", cmd, res);
> -				fflush(output);
> -			}
> -			free(cmd);
> -			exit(0);
> -		} else {
> -			free(cmd);
> -		}
> +	engines = discover_engines();
> +	if (!engines) {
> +		fprintf(stderr,
> +			"Failed to detect engines! (%s)\n(Kernel 4.16 or newer is required for i915 PMU support.)\n",
> +			strerror(errno));
> +		return 1;
>   	}
>   
> -	for (i = 0; i < num_instdone_bits; i++) {
> -		top_bits[i].bit = &instdone_bits[i];
> -		top_bits[i].count = 0;
> -		top_bits_sorted[i] = &top_bits[i];
> +	ret = pmu_init(engines);
> +	if (ret) {
> +		fprintf(stderr,
> +			"Failed to initialize PMU! (%s)\n", strerror(errno));
> +		return 1;
>   	}
>   
> -	/* Grab access to the registers */
> -	intel_register_access_init(pci_dev, 0, -1);
> +	pmu_sample(engines);
>   
> -	ring_init(&render_ring);
> -	if (IS_GEN4(devid) || IS_GEN5(devid))
> -		ring_init(&bsd_ring);
> -	if (IS_GEN6(devid) || IS_GEN7(devid)) {
> -		ring_init(&bsd6_ring);
> -		ring_init(&blt_ring);
> -	}
> +	for (;;) {
> +		double t;
> +#define BUFSZ 16
> +		char freq[BUFSZ];
> +		char fact[BUFSZ];
> +		char irq[BUFSZ];
> +		char rc6[BUFSZ];
> +		char power[BUFSZ];
> +		char reads[BUFSZ];
> +		char writes[BUFSZ];
> +		struct winsize ws;
> +		int lines = 0;
>   
> -	/* Initialize GPU stats */
> -	if (HAS_STATS_REGS(devid)) {
> -		for (i = 0; i < STATS_COUNT; i++) {
> -			uint32_t stats_high, stats_low, stats_high_2;
> +		/* Update terminal size. */
> +		if (ioctl(0, TIOCGWINSZ, &ws) != -1) {
> +			con_w = ws.ws_col;
> +			con_h = ws.ws_row;
> +		}
>   
> -			do {
> -				stats_high = INREG(stats_regs[i] + 4);
> -				stats_low = INREG(stats_regs[i]);
> -				stats_high_2 = INREG(stats_regs[i] + 4);
> -			} while (stats_high != stats_high_2);
> +		pmu_sample(engines);
> +		t = (double)(engines->ts.cur - engines->ts.prev) / 1e9;
>   
> -			last_stats[i] = (uint64_t)stats_high << 32 |
> -				stats_low;
> -		}
> -	}
> +		printf("\033[H\033[J");
>   
> -	for (;;) {
> -		int j;
> -		unsigned long long t1, ti, tf, t2;
> -		unsigned long long def_sleep = 1000000 / samples_per_sec;
> -		unsigned long long last_samples_per_sec = samples_per_sec;
> -		unsigned short int max_lines;
> -		struct winsize ws;
> -		char clear_screen[] = {0x1b, '[', 'H',
> -				       0x1b, '[', 'J',
> -				       0x0};
> -		int percent;
> -		int len;
> -
> -		t1 = gettime();
> -
> -		ring_reset(&render_ring);
> -		ring_reset(&bsd_ring);
> -		ring_reset(&bsd6_ring);
> -		ring_reset(&blt_ring);
> -
> -		for (i = 0; i < samples_per_sec; i++) {
> -			long long interval;
> -			ti = gettime();
> -			if (IS_965(devid)) {
> -				instdone = INREG(INSTDONE_I965);
> -				instdone1 = INREG(INSTDONE_1);
> -			} else
> -				instdone = INREG(INSTDONE);
> -
> -			for (j = 0; j < num_instdone_bits; j++)
> -				update_idle_bit(&top_bits[j]);
> -
> -			ring_sample(&render_ring);
> -			ring_sample(&bsd_ring);
> -			ring_sample(&bsd6_ring);
> -			ring_sample(&blt_ring);
> -
> -			tf = gettime();
> -			if (tf - t1 >= 1000000) {
> -				/* We are out of sync, bail out */
> -				last_samples_per_sec = i+1;
> -				break;
> -			}
> -			interval = def_sleep - (tf - ti);
> -			if (interval > 0)
> -				usleep(interval);
> -		}
> +		pmu_calc(&engines->freq_req, freq, BUFSZ, 4, 0, 1.0, t, 1);
> +		pmu_calc(&engines->freq_act, fact, BUFSZ, 4, 0, 1.0, t, 1);
> +		pmu_calc(&engines->irq, irq, BUFSZ, 8, 0, 1.0, t, 1);
> +		pmu_calc(&engines->rc6, rc6, BUFSZ, 3, 0, 1e9, t, 100);
> +		pmu_calc(&engines->rapl, power, BUFSZ, 4, 2, 1.0, t,
> +			 engines->rapl_scale);
> +		pmu_calc(&engines->imc_reads, reads, BUFSZ, 6, 0, 1.0, t,
> +			 engines->imc_reads_scale);
> +		pmu_calc(&engines->imc_writes, writes, BUFSZ, 6, 0, 1.0, t,
> +			 engines->imc_writes_scale);
>   
> -		if (HAS_STATS_REGS(devid)) {
> -			for (i = 0; i < STATS_COUNT; i++) {
> -				uint32_t stats_high, stats_low, stats_high_2;
> +		if (lines++ < con_h)
> +			printf("intel-gpu-top - %s/%s MHz;  %s%% RC6; %s %s; %s irqs/s\n",
> +			       fact, freq, rc6, power, engines->rapl_unit, irq);
>   
> -				do {
> -					stats_high = INREG(stats_regs[i] + 4);
> -					stats_low = INREG(stats_regs[i]);
> -					stats_high_2 = INREG(stats_regs[i] + 4);
> -				} while (stats_high != stats_high_2);
> +		if (lines++ < con_h)
> +			printf("\n");
>   
> -				stats[i] = (uint64_t)stats_high << 32 |
> -					stats_low;
> -			}
> -		}
> +		if (engines->imc_fd) {
> +			if (lines++ < con_h)
> +				printf("      IMC reads:   %s %s/s\n",
> +				       reads, engines->imc_reads_unit);
> +
> +			if (lines++ < con_h)
> +				printf("     IMC writes:   %s %s/s\n",
> +				       writes, engines->imc_writes_unit);
>   
> -		qsort(top_bits_sorted, num_instdone_bits,
> -		      sizeof(struct top_bit *), top_bits_sort);
> -
> -		/* Limit the number of lines printed to the terminal height so the
> -		 * most important info (at the top) will stay on screen. */
> -		max_lines = -1;
> -		if (ioctl(0, TIOCGWINSZ, &ws) != -1)
> -			max_lines = ws.ws_row - 6; /* exclude header lines */
> -		if (max_lines >= num_instdone_bits)
> -			max_lines = num_instdone_bits;
> -
> -		t2 = gettime();
> -		elapsed_time += (t2 - t1) / 1000000.0;
> -
> -		if (interactive) {
> -			printf("%s", clear_screen);
> -			print_clock_info(pci_dev);
> -
> -			ring_print(&render_ring, last_samples_per_sec);
> -			ring_print(&bsd_ring, last_samples_per_sec);
> -			ring_print(&bsd6_ring, last_samples_per_sec);
> -			ring_print(&blt_ring, last_samples_per_sec);
> -
> -			printf("\n%30s  %s\n", "task", "percent busy");
> -			for (i = 0; i < max_lines; i++) {
> -				if (top_bits_sorted[i]->count > 0) {
> -					percent = (top_bits_sorted[i]->count * 100) /
> -						last_samples_per_sec;
> -					len = printf("%30s: %3d%%: ",
> -							 top_bits_sorted[i]->bit->name,
> -							 percent);
> -					print_percentage_bar (percent, len);
> -				} else {
> -					printf("%*s", PERCENTAGE_BAR_END, "");
> -				}
> -
> -				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
> -					printf("%13s: %llu (%lld/sec)",
> -						   stats_reg_names[i],
> -						   (long long)stats[i],
> -						   (long long)(stats[i] - last_stats[i]));
> -					last_stats[i] = stats[i];
> -				} else {
> -					if (!top_bits_sorted[i]->count)
> -						break;
> -				}
> +			if (++lines < con_h)
>   				printf("\n");
> -			}
>   		}
> -		if (output) {
> -			/* Print headers for columns at first run */
> -			if (print_headers) {
> -				fprintf(output, "# time\t");
> -				ring_print_header(output, &render_ring);
> -				ring_print_header(output, &bsd_ring);
> -				ring_print_header(output, &bsd6_ring);
> -				ring_print_header(output, &blt_ring);
> -				for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
> -					if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
> -						fprintf(output, "%.6s\t",
> -							   stats_reg_names[i]
> -							   );
> -					}
> -					if (!top_bits[i].count)
> -						continue;
> -				}
> -				fprintf(output, "\n");
> -				print_headers = 0;
> -			}
>   
> -			/* Print statistics */
> -			fprintf(output, "%.2f\t", elapsed_time);
> -			ring_log(&render_ring, last_samples_per_sec, output);
> -			ring_log(&bsd_ring, last_samples_per_sec, output);
> -			ring_log(&bsd6_ring, last_samples_per_sec, output);
> -			ring_log(&blt_ring, last_samples_per_sec, output);
> -
> -			for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
> -				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
> -					fprintf(output, "%"PRIu64"\t",
> -						   stats[i] - last_stats[i]);
> -					last_stats[i] = stats[i];
> -				}
> -					if (!top_bits[i].count)
> -						continue;
> -			}
> -			fprintf(output, "\n");
> -			fflush(output);
> -		}
> +		for (i = 0; i < engines->num_engines; i++) {
> +			struct engine *engine = engine_ptr(engines, i);
>   
> -		for (i = 0; i < num_instdone_bits; i++) {
> -			top_bits_sorted[i]->count = 0;
> +			if (engine->num_counters && lines < con_h) {
> +				const char *a = "          ENGINE      BUSY ";
> +				const char *b = " MI_SEMA MI_WAIT";
>   
> -			if (i < STATS_COUNT)
> -				last_stats[i] = stats[i];
> +				printf("\033[7m%s%*s%s\033[0m\n",
> +				       a,
> +				       (int)(con_w - 1 - strlen(a) - strlen(b)),
> +				       " ", b);
> +				lines++;
> +				break;
> +			}
>   		}
>   
> -		/* Check if child has gone */
> -		if (child_pid > 0) {
> -			int res;
> -			if ((res = waitpid(child_pid, &child_stat, WNOHANG)) == -1) {
> -				perror("waitpid");
> -				exit(1);
> -			}
> -			if (res == 0)
> +		for (i = 0; i < engines->num_engines && lines < con_h; i++) {
> +			struct engine *engine = engine_ptr(engines, i);
> +			unsigned int max_w = con_w - 1;
> +			unsigned int len;
> +			char sema[BUFSZ];
> +			char wait[BUFSZ];
> +			char busy[BUFSZ];
> +			char buf[128];
> +			double val;
> +
> +			if (!engine->num_counters)
>   				continue;
> -			if (WIFEXITED(child_stat))
> -				break;
> +
> +			pmu_calc(&engine->sema, sema, BUFSZ, 3, 0, 1e9, t, 100);
> +			pmu_calc(&engine->wait, wait, BUFSZ, 3, 0, 1e9, t, 100);
> +			len = snprintf(buf, sizeof(buf), "    %s%%    %s%%",
> +				       sema, wait);
> +
> +			pmu_calc(&engine->busy, busy, BUFSZ, 6, 2, 1e9, t,
> +				 100);
> +			len += printf("%16s %s%% ", engine->display_name, busy);
> +
> +			val = __pmu_calc(&engine->busy.val, 1e9, t, 100);
> +			print_percentage_bar(val, max_w - len);
> +
> +			printf("%s\n", buf);
> +
> +			lines++;
>   		}
> -	}
>   
> -	fclose(output);
> +		if (lines++ < con_h)
> +			printf("\n");
> +
> +		usleep(period_us);
> +	}
>   
> -	intel_register_access_fini();
>   	return 0;
>   }
> diff --git a/tools/meson.build b/tools/meson.build
> index bd2d313d5156..a918eeb0bef1 100644
> --- a/tools/meson.build
> +++ b/tools/meson.build
> @@ -23,7 +23,6 @@ tools_progs = [
>   	'intel_gpu_frequency',
>   	'intel_firmware_decode',
>   	'intel_gpu_time',
> -	'intel_gpu_top',
>   	'intel_gtt',
>   	'intel_guc_logger',
>   	'intel_infoframes',
> @@ -117,6 +116,11 @@ shared_library('intel_aubdump', 'aubdump.c',
>   	       name_prefix : '',
>   	       install : true)
>   
> +executable('intel_gpu_top', 'intel_gpu_top.c',
> +	   install : true,
> +	   install_rpath : rpathdir,
> +	   dependencies : tool_deps + [ lib_igt_perf ])
> +
>   conf_data = configuration_data()
>   conf_data.set('prefix', prefix)
>   conf_data.set('exec_prefix', '${prefix}')
> 
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [igt-dev] [PATCH i-g-t v6] intel-gpu-top: Rewrite the tool to be safe to use
  2018-04-09 12:26       ` Tvrtko Ursulin
  (?)
@ 2018-04-23 12:18       ` Rinat Ibragimov
  -1 siblings, 0 replies; 57+ messages in thread
From: Rinat Ibragimov @ 2018-04-23 12:18 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: Eero Tamminen, igt-dev, Ben Widawsky, Intel-gfx

Ping?


>Понедельник,  9 апреля 2018, 15:26 +03:00 от Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>:
>
>
>[Adding some people to Cc for more ack/nack type feedback.]
>
>Executive question is ack or nack on replacing intel_gpu_top with a new 
>implementation which uses only perf PMU for counter gathering.
>
>A short history on how this came to be:
>
>There was a recent external patch contribution from Rinat Ibragimov to 
>support more platforms from the existing intel_gpu_top. But as the tool 
>is not safe to use Chris Wilson suggested to maybe just replace it.
>
>As it happens I had a good start to do this quickly and cheaply, in the 
>form of one prototype I did recently, which only needed ripping some 
>bits out, and polishing the rest.
>
>Eero and Rinat kindly did a lot of platform coverage testing and the 
>rewrite seems ready for next steps.
>
>I need to stress that, as the commit message notes, the new tool has a slightly 
>different scope in that it doesn't expose GPU functional level data, but 
>only overall stats like power, frequencies, RC6, interrupts, IMC memory 
>bandwidth and per command streamer busyness, mi_semaphore and mi_event 
>waits. My thinking was that for more functional level profiling gpu-top 
>(OA) should be used.
>
>Also the "run a command" and CSV output features are not supported 
>since both can be done directly via perf stat.
>
>Regards,
>
>Tvrtko
>
>On 04/04/2018 16:26, Tvrtko Ursulin wrote:
>> From: Tvrtko Ursulin < tvrtko.ursulin@intel.com >
>> 
>> intel-gpu-top is a dangerous tool which can hang machines due to unsafe mmio
>> register access. This patch rewrites it to use only PMU.
>> 
>> Only overall command streamer busyness and GPU global data such as power
>> and frequencies are included in this new version.
>> 
>> For access to more GPU functional unit level data, an OA metric based tool
>> like gpu-top should be used instead.
>> 
>> v2:
>>   * Sort engines by class and instance.
>>   * Do not wait for one sampling period to display something on screen.
>>   * Move code out of the asserts. (Rinat Ibragimov)
>>   * Continuously adapt to terminal size. (Rinat Ibragimov)
>> 
>> v3:
>>   * Change layout and precision of some field. (Chris Wilson)
>>   Eero Tamminen:
>>   * Use more user friendly engine names.
>>   * Don't error out if a counter is missing.
>>   * Add IMC read/write bandwidth.
>>   * Report minimum required kernel version.
>> 
>> v4:
>>   * Really support 4.16 by skipping of missing engines.
>>   * Simpler and less hacky float printing.
>>   * Preserve copyright header. (Antonio Argenziano)
>>   * Simplify engines_ptr macro. (Rinat Ibragimov)
>> 
>> v5:
>>   * Get RAPL unit from sysfs.
>>   * Consolidate sysfs paths with a macro.
>>   * Tidy error handling by carrying over and reporting errno.
>>   * Check against console height on all prints.
>>   * More readable minimum kernel version message. (Eero Tamminen)
>>   * Column banner for per engine stats. (Eero Tamminen)
>> 
>> v6:
>>   * Man page update. (Eero Tamminen)
>> 
>> Signed-off-by: Tvrtko Ursulin < tvrtko.ursulin@intel.com >
>> Cc: Chris Wilson < chris@chris-wilson.co.uk >
>> Cc: Lionel Landwerlin < lionel.g.landwerlin@intel.com >
>> Cc: Petri Latvala < petri.latvala@intel.com >
>> Cc: Eero Tamminen < eero.t.tamminen@intel.com >
>> Cc: Rinat Ibragimov < ibragimovrinat@mail.ru >
>> Reviewed-by: Lionel Landwerlin < lionel.g.landwerlin@intel.com > # v1
>> Reviewed-by: Chris Wilson < chris@chris-wilson.co.uk > # v0.5
>> ---
>>   lib/igt_perf.c        |    6 +
>>   lib/igt_perf.h        |    1 +
>>   man/intel_gpu_top.rst |   41 +-
>>   tools/Makefile.am     |    2 +
>>   tools/intel_gpu_top.c | 1250 +++++++++++++++++++++++++++----------------------
>>   tools/meson.build     |    6 +-
>>   6 files changed, 719 insertions(+), 587 deletions(-)
>> 
>> diff --git a/lib/igt_perf.c b/lib/igt_perf.c
>> index 99d82ea51c9b..e3dec2cc29c7 100644
>> --- a/lib/igt_perf.c
>> +++ b/lib/igt_perf.c
>> @@ -69,3 +69,9 @@ int igt_perf_open(uint64_t type, uint64_t config)
>>   return _perf_open(type, config, -1,
>>     PERF_FORMAT_TOTAL_TIME_ENABLED);
>>   }
>> +
>> +int igt_perf_open_group(uint64_t type, uint64_t config, int group)
>> +{
>> +return _perf_open(type, config, group,
>> +  PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_GROUP);
>> +}
>> diff --git a/lib/igt_perf.h b/lib/igt_perf.h
>> index 614ea5d23fa6..e00718f4769a 100644
>> --- a/lib/igt_perf.h
>> +++ b/lib/igt_perf.h
>> @@ -55,5 +55,6 @@ uint64_t i915_type_id(void);
>>   int perf_i915_open(uint64_t config);
>>   int perf_i915_open_group(uint64_t config, int group);
>>   int igt_perf_open(uint64_t type, uint64_t config);
>> +int igt_perf_open_group(uint64_t type, uint64_t config, int group);
>> 
>>   #endif /* I915_PERF_H */
>> diff --git a/man/intel_gpu_top.rst b/man/intel_gpu_top.rst
>> index a5f7175bb1a0..19c712307d28 100644
>> --- a/man/intel_gpu_top.rst
>> +++ b/man/intel_gpu_top.rst
>> @@ -7,9 +7,9 @@ Display a top-like summary of Intel GPU usage
>>   ---------------------------------------------
>>   .. include:: defs.rst
>>   :Author: IGT Developers < igt-dev@lists.freedesktop.org >
>> -:Date: 2016-03-01
>> +:Date: 2018-04-04
>>   :Version: |PACKAGE_STRING|
>> -:Copyright: 2009,2011,2012,2016 Intel Corporation
>> +:Copyright: 2009,2011,2012,2016,2018 Intel Corporation
>>   :Manual section: |MANUAL_SECTION|
>>   :Manual group: |MANUAL_GROUP|
>> 
>> @@ -21,42 +21,25 @@ SYNOPSIS
>>   DESCRIPTION
>>   ===========
>> 
>> -**intel_gpu_top** is a tool to display usage information of an Intel GPU. It
>> -requires root privilege to map the graphics device.
>> +**intel_gpu_top** is a tool to display usage information on Intel GPUs.
>> +
>> +The tool gathers data using perf performance counters (PMU) exposed by i915 and other platform drivers like RAPL (power) and Uncore IMC (memory bandwidth).
>> 
>>   OPTIONS
>>   =======
>> 
>> --s SAMPLES
>> -    Number of samples to acquire per second.
>> -
>> --o FILE
>> -    Collect usage statistics to FILE. If file is "-", run non-interactively
>> -    and output statistics to stdout.
>> -
>> --e COMMAND
>> -    Execute COMMAND to profile, and leave when it is finished. Note that the
>> -    entire command with all parameters should be included as one parameter.
>> +-s <ms>
>> +    Refresh period in milliseconds.
>> 
>>   -h
>> -    Show usage notes.
>> +    Show help text.
>> 
>> -EXAMPLES
>> -========
>> -
>> -intel_gpu_top -o "cairo-trace-gvim.log" -s 100 -e "cairo-perf-trace /tmp/gvim"
>> -    Run cairo-perf-trace with /tmp/gvim trace, non-interactively, saving the
>> -    statistics into cairo-trace-gvim.log file, and collecting 100 samples per
>> -    second.
>> -
>> -Note that idle units are not displayed, so an entirely idle GPU will only
>> -display the ring status and header.
>> +LIMITATIONS
>> +===========
>> 
>> -BUGS
>> -====
>> +* Not all metrics are supported on all platforms. Where a metric is unsupported its value will be replaced by a dashed line.
>> 
>> -Some GPUs report some units as busy when they aren't, such that even when idle
>> -and not hung, it will show up as 100% busy.
>> +* Non-root access to perf counters is controlled by the *perf_event_paranoid* sysctl.
>> 
>>   REPORTING BUGS
>>   ==============
>> diff --git a/tools/Makefile.am b/tools/Makefile.am
>> index 09b6dbcc3ece..a0b016ddd7ff 100644
>> --- a/tools/Makefile.am
>> +++ b/tools/Makefile.am
>> @@ -28,6 +28,8 @@ intel_aubdump_la_LDFLAGS = -module -avoid-version -no-undefined
>>   intel_aubdump_la_SOURCES = aubdump.c
>>   intel_aubdump_la_LIBADD = $(top_builddir)/lib/libintel_tools.la -ldl
>> 
>> +intel_gpu_top_LDADD = $(top_builddir)/lib/libigt_perf.la
>> +
>>   bin_SCRIPTS = intel_aubdump
>>   CLEANFILES = $(bin_SCRIPTS)
>> 
>> diff --git a/tools/intel_gpu_top.c b/tools/intel_gpu_top.c
>> index 098e6ce3ff86..b923c3cfbe97 100644
>> --- a/tools/intel_gpu_top.c
>> +++ b/tools/intel_gpu_top.c
>> @@ -1,6 +1,5 @@
>>   /*
>> - * Copyright © 2007 Intel Corporation
>> - * Copyright © 2011 Intel Corporation
>> + * Copyright © 2007-2018 Intel Corporation
>>    *
>>    * Permission is hereby granted, free of charge, to any person obtaining a
>>    * copy of this software and associated documentation files (the "Software"),
>> @@ -24,695 +23,832 @@
>>    * Authors:
>>    *    Eric Anholt < eric@anholt.net >
>>    *    Eugeni Dodonov < eugeni.dodonov@intel.com >
>> - *
>>    */
>> 
>> -#include "config.h"
>> -
>> -#include <inttypes.h>
>> -#include <unistd.h>
>> -#include <stdlib.h>
>>   #include <stdio.h>
>> -#include <err.h>
>> -#include <sys/ioctl.h>
>> -#include <sys/time.h>
>> -#include <sys/wait.h>
>> +#include <sys/types.h>
>> +#include <dirent.h>
>> +#include <stdint.h>
>> +#include <assert.h>
>>   #include <string.h>
>> -#ifdef HAVE_TERMIOS_H
>> -#include <termios.h>
>> -#endif
>> -#include "intel_io.h"
>> -#include "instdone.h"
>> -#include "intel_reg.h"
>> -#include "intel_chipset.h"
>> -#include "drmtest.h"
>> -
>> -#define  FORCEWAKE    0xA18C
>> -#define  FORCEWAKE_ACK    0x130090
>> -
>> -#define SAMPLES_PER_SEC             10000
>> -#define SAMPLES_TO_PERCENT_RATIO    (SAMPLES_PER_SEC / 100)
>> -
>> -#define MAX_NUM_TOP_BITS            100
>> -
>> -#define HAS_STATS_REGS(devid)IS_965(devid)
>> -
>> -struct top_bit {
>> -struct instdone_bit *bit;
>> -int count;
>> -} top_bits[MAX_NUM_TOP_BITS];
>> -struct top_bit *top_bits_sorted[MAX_NUM_TOP_BITS];
>> -
>> -static uint32_t instdone, instdone1;
>> -
>> -static const char *bars[] = {
>> -" ",
>> -"▏",
>> -"▎",
>> -"▍",
>> -"▌",
>> -"▋",
>> -"▊",
>> -"▉",
>> -"█"
>> -};
>> +#include <ctype.h>
>> +#include <stdlib.h>
>> +#include <unistd.h>
>> +#include <sys/stat.h>
>> +#include <fcntl.h>
>> +#include <inttypes.h>
>> +#include <sys/ioctl.h>
>> +#include <errno.h>
>> +#include <math.h>
>> +#include <locale.h>
>> +
>> +#include "igt_perf.h"
>> 
>> -enum stats_counts {
>> -IA_VERTICES,
>> -IA_PRIMITIVES,
>> -VS_INVOCATION,
>> -GS_INVOCATION,
>> -GS_PRIMITIVES,
>> -CL_INVOCATION,
>> -CL_PRIMITIVES,
>> -PS_INVOCATION,
>> -PS_DEPTH,
>> -STATS_COUNT
>> +struct pmu_pair {
>> +uint64_t cur;
>> +uint64_t prev;
>>   };
>> 
>> -const uint32_t stats_regs[STATS_COUNT] = {
>> -IA_VERTICES_COUNT_QW,
>> -IA_PRIMITIVES_COUNT_QW,
>> -VS_INVOCATION_COUNT_QW,
>> -GS_INVOCATION_COUNT_QW,
>> -GS_PRIMITIVES_COUNT_QW,
>> -CL_INVOCATION_COUNT_QW,
>> -CL_PRIMITIVES_COUNT_QW,
>> -PS_INVOCATION_COUNT_QW,
>> -PS_DEPTH_COUNT_QW,
>> +struct pmu_counter {
>> +bool present;
>> +uint64_t config;
>> +unsigned int idx;
>> +struct pmu_pair val;
>>   };
>> 
>> -const char *stats_reg_names[STATS_COUNT] = {
>> -"vert fetch",
>> -"prim fetch",
>> -"VS invocations",
>> -"GS invocations",
>> -"GS prims",
>> -"CL invocations",
>> -"CL prims",
>> -"PS invocations",
>> -"PS depth pass",
>> +struct engine {
>> +const char *name;
>> +const char *display_name;
>> +
>> +unsigned int class;
>> +unsigned int instance;
>> +
>> +unsigned int num_counters;
>> +
>> +struct pmu_counter busy;
>> +struct pmu_counter wait;
>> +struct pmu_counter sema;
>>   };
>> 
>> -uint64_t stats[STATS_COUNT];
>> -uint64_t last_stats[STATS_COUNT];
>> +struct engines {
>> +unsigned int num_engines;
>> +unsigned int num_counters;
>> +DIR *root;
>> +int fd;
>> +struct pmu_pair ts;
>> +
>> +int rapl_fd;
>> +double rapl_scale;
>> +const char *rapl_unit;
>> +
>> +int imc_fd;
>> +double imc_reads_scale;
>> +const char *imc_reads_unit;
>> +double imc_writes_scale;
>> +const char *imc_writes_unit;
>> +
>> +struct pmu_counter freq_req;
>> +struct pmu_counter freq_act;
>> +struct pmu_counter irq;
>> +struct pmu_counter rc6;
>> +struct pmu_counter rapl;
>> +struct pmu_counter imc_reads;
>> +struct pmu_counter imc_writes;
>> +
>> +struct engine engine;
>> +};
>> 
>> -static unsigned long
>> -gettime(void)
>> +static uint64_t
>> +get_pmu_config(int dirfd, const char *name, const char *counter)
>>   {
>> -    struct timeval t;
>> -    gettimeofday(&t, NULL);
>> -    return (t.tv_usec + (t.tv_sec * 1000000));
>> -}
>> +char buf[128], *p;
>> +int fd, ret;
>> 
>> -static int
>> -top_bits_sort(const void *a, const void *b)
>> -{
>> -struct top_bit * const *bit_a = a;
>> -struct top_bit * const *bit_b = b;
>> -int a_count = (*bit_a)->count;
>> -int b_count = (*bit_b)->count;
>> +ret = snprintf(buf, sizeof(buf), "%s-%s", name, counter);
>> +if (ret < 0 || ret == sizeof(buf))
>> +return -1;
>> 
>> -if (a_count < b_count)
>> -return 1;
>> -else if (a_count == b_count)
>> -return 0;
>> -else
>> +fd = openat(dirfd, buf, O_RDONLY);
>> +if (fd < 0)
>>   return -1;
>> -}
>> 
>> -static void
>> -update_idle_bit(struct top_bit *top_bit)
>> -{
>> -uint32_t reg_val;
>> +ret = read(fd, buf, sizeof(buf));
>> +close(fd);
>> +if (ret <= 0)
>> +return -1;
>> 
>> -if (top_bit->bit->reg == INSTDONE_1)
>> -reg_val = instdone1;
>> -else
>> -reg_val = instdone;
>> +p = index(buf, '0');
>> +if (!p)
>> +return -1;
>> 
>> -if ((reg_val & top_bit->bit->bit) == 0)
>> -top_bit->count++;
>> +return strtoul(p, NULL, 0);
>>   }
>> 
>> -static void
>> -print_clock(const char *name, int clock) {
>> -if (clock == -1)
>> -printf("%s clock: unknown", name);
>> +#define engine_ptr(engines, n) (&engines->engine + (n))
>> +
>> +static const char *class_display_name(unsigned int class)
>> +{
>> +switch (class) {
>> +case I915_ENGINE_CLASS_RENDER:
>> +return "Render/3D";
>> +case I915_ENGINE_CLASS_COPY:
>> +return "Blitter";
>> +case I915_ENGINE_CLASS_VIDEO:
>> +return "Video";
>> +case I915_ENGINE_CLASS_VIDEO_ENHANCE:
>> +return "VideoEnhance";
>> +default:
>> +return "[unknown]";
>> +}
>> +}
>> +
>> +static int engine_cmp(const void *__a, const void *__b)
>> +{
>> +const struct engine *a = (struct engine *)__a;
>> +const struct engine *b = (struct engine *)__b;
>> +
>> +if (a->class != b->class)
>> +return a->class - b->class;
>>   else
>> -printf("%s clock: %d Mhz", name, clock);
>> +return a->instance - b->instance;
>>   }
>> 
>> -static int
>> -print_clock_info(struct pci_device *pci_dev)
>> +static struct engines *discover_engines(void)
>>   {
>> -uint32_t devid = pci_dev->device_id;
>> -uint16_t gcfgc;
>> +const char *sysfs_root = "/sys/devices/i915/events";
>> +struct engines *engines;
>> +struct dirent *dent;
>> +int ret = 0;
>> +DIR *d;
>> 
>> -if (IS_GM45(devid)) {
>> -int core_clock = -1;
>> +engines = malloc(sizeof(struct engines));
>> +if (!engines)
>> +return NULL;
>> 
>> -pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
>> +memset(engines, 0, sizeof(*engines));
>> 
>> -switch (gcfgc & 0xf) {
>> -case 8:
>> -core_clock = 266;
>> -break;
>> -case 9:
>> -core_clock = 320;
>> -break;
>> -case 11:
>> -core_clock = 400;
>> -break;
>> -case 13:
>> -core_clock = 533;
>> -break;
>> -}
>> -print_clock("core", core_clock);
>> -} else if (IS_965(devid) && IS_MOBILE(devid)) {
>> -int render_clock = -1, sampler_clock = -1;
>> +engines->num_engines = 0;
>> 
>> -pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
>> +d = opendir(sysfs_root);
>> +if (!d)
>> +return NULL;
>> 
>> -switch (gcfgc & 0xf) {
>> -case 2:
>> -render_clock = 250; sampler_clock = 267;
>> -break;
>> -case 3:
>> -render_clock = 320; sampler_clock = 333;
>> -break;
>> -case 4:
>> -render_clock = 400; sampler_clock = 444;
>> -break;
>> -case 5:
>> -render_clock = 500; sampler_clock = 533;
>> +while ((dent = readdir(d)) != NULL) {
>> +const char *endswith = "-busy";
>> +const unsigned int endlen = strlen(endswith);
>> +struct engine *engine =
>> +engine_ptr(engines, engines->num_engines);
>> +char buf[256];
>> +
>> +if (dent->d_type != DT_REG)
>> +continue;
>> +
>> +if (strlen(dent->d_name) >= sizeof(buf)) {
>> +ret = ENAMETOOLONG;
>>   break;
>>   }
>> 
>> -print_clock("render", render_clock);
>> -printf("  ");
>> -print_clock("sampler", sampler_clock);
>> -} else if (IS_945(devid) && IS_MOBILE(devid)) {
>> -int render_clock = -1, display_clock = -1;
>> +strcpy(buf, dent->d_name);
>> 
>> -pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
>> +/* xxxN-busy */
>> +if (strlen(buf) < (endlen + 4))
>> +continue;
>> +if (strcmp(&buf[strlen(buf) - endlen], endswith))
>> +continue;
>> 
>> -switch (gcfgc & 0x7) {
>> -case 0:
>> -render_clock = 166;
>> -break;
>> -case 1:
>> -render_clock = 200;
>> -break;
>> -case 3:
>> -render_clock = 250;
>> -break;
>> -case 5:
>> -render_clock = 400;
>> +memset(engine, 0, sizeof(*engine));
>> +
>> +buf[strlen(buf) - endlen] = 0;
>> +engine->name = strdup(buf);
>> +if (!engine->name) {
>> +ret = errno;
>>   break;
>>   }
>> 
>> -switch (gcfgc & 0x70) {
>> -case 0:
>> -display_clock = 200;
>> -break;
>> -case 4:
>> -display_clock = 320;
>> +engine->busy.config = get_pmu_config(dirfd(d), engine->name,
>> +     "busy");
>> +if (engine->busy.config == -1) {
>> +ret = ENOENT;
>>   break;
>>   }
>> -if (gcfgc & (1 << 7))
>> -    display_clock = 133;
>> 
>> -print_clock("render", render_clock);
>> -printf("  ");
>> -print_clock("display", display_clock);
>> -} else if (IS_915(devid) && IS_MOBILE(devid)) {
>> -int render_clock = -1, display_clock = -1;
>> +engine->class = (engine->busy.config &
>> + (__I915_PMU_OTHER(0) - 1)) >>
>> +I915_PMU_CLASS_SHIFT;
>> 
>> -pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
>> +engine->instance = (engine->busy.config >>
>> +    I915_PMU_SAMPLE_BITS) &
>> +    ((1 << I915_PMU_SAMPLE_INSTANCE_BITS) - 1);
>> 
>> -switch (gcfgc & 0x7) {
>> -case 0:
>> -render_clock = 160;
>> -break;
>> -case 1:
>> -render_clock = 190;
>> -break;
>> -case 4:
>> -render_clock = 333;
>> +ret = snprintf(buf, sizeof(buf), "%s/%u",
>> +       class_display_name(engine->class),
>> +       engine->instance);
>> +if (ret < 0 || ret == sizeof(buf)) {
>> +ret = ENOBUFS;
>>   break;
>>   }
>> -if (gcfgc & (1 << 13))
>> -    render_clock = 133;
>> +ret = 0;
>> 
>> -switch (gcfgc & 0x70) {
>> -case 0:
>> -display_clock = 190;
>> +engine->display_name = strdup(buf);
>> +if (!engine->display_name) {
>> +ret = errno;
>>   break;
>> -case 4:
>> -display_clock = 333;
>> +}
>> +
>> +engines->num_engines++;
>> +engines = realloc(engines, sizeof(struct engines) +
>> +  engines->num_engines * sizeof(struct engine));
>> +if (!engines) {
>> +ret = errno;
>>   break;
>>   }
>> -if (gcfgc & (1 << 7))
>> -    display_clock = 133;
>> +}
>> +
>> +if (ret) {
>> +free(engines);
>> +errno = ret;
>> 
>> -print_clock("render", render_clock);
>> -printf("  ");
>> -print_clock("display", display_clock);
>> +return NULL;
>>   }
>> 
>> +qsort(engine_ptr(engines, 0), engines->num_engines,
>> +      sizeof(struct engine), engine_cmp);
>> +
>> +engines->root = d;
>> 
>> -printf("\n");
>> -return -1;
>> +return engines;
>>   }
>> 
>> -#define STATS_LEN (20)
>> -#define PERCENTAGE_BAR_END(79 - STATS_LEN)
>> +static int
>> +filename_to_buf(const char *filename, char *buf, unsigned int bufsize)
>> +{
>> +int fd, err;
>> +ssize_t ret;
>> 
>> -static void
>> -print_percentage_bar(float percent, int cur_line_len)
>> +fd = open(filename, O_RDONLY);
>> +if (fd < 0)
>> +return -1;
>> +
>> +ret = read(fd, buf, bufsize - 1);
>> +err = errno;
>> +close(fd);
>> +if (ret < 1) {
>> +errno = ret < 0 ? err : ENOMSG;
>> +
>> +return -1;
>> +}
>> +
>> +if (ret > 1 && buf[ret - 1] == '\n')
>> +buf[ret - 1] = '\0';
>> +else
>> +buf[ret] = '\0';
>> +
>> +return 0;
>> +}
>> +
>> +static uint64_t filename_to_u64(const char *filename, int base)
>>   {
>> -int bar_avail_len = (PERCENTAGE_BAR_END - cur_line_len - 1) * 8;
>> -int bar_len = bar_avail_len * (percent + .5) / 100.0;
>> -int i;
>> +char buf[64], *b;
>> 
>> -for (i = bar_len; i >= 8; i -= 8) {
>> -printf("%s", bars[8]);
>> -cur_line_len++;
>> +if (filename_to_buf(filename, buf, sizeof(buf)))
>> +return 0;
>> +
>> +/*
>> + * Handle both single integer and key=value formats by skipping
>> + * leading non-digits.
>> + */
>> +b = buf;
>> +while (*b && !isdigit(*b))
>> +b++;
>> +
>> +return strtoull(b, NULL, base);
>> +}
>> +
>> +static double filename_to_double(const char *filename)
>> +{
>> +char *oldlocale;
>> +char buf[80];
>> +double v;
>> +
>> +if (filename_to_buf(filename, buf, sizeof(buf)))
>> +return 0;
>> +
>> +oldlocale = setlocale(LC_ALL, "C");
>> +v = strtod(buf, NULL);
>> +setlocale(LC_ALL, oldlocale);
>> +
>> +return v;
>> +}
>> +
>> +#define RAPL_ROOT "/sys/devices/power/"
>> +#define RAPL_EVENT "/sys/devices/power/events/"
>> +
>> +static uint64_t rapl_type_id(void)
>> +{
>> +return filename_to_u64(RAPL_ROOT "type", 10);
>> +}
>> +
>> +static uint64_t rapl_gpu_power(void)
>> +{
>> +return filename_to_u64(RAPL_EVENT "energy-gpu", 0);
>> +}
>> +
>> +static double rapl_gpu_power_scale(void)
>> +{
>> +return filename_to_double(RAPL_EVENT "energy-gpu.scale");
>> +}
>> +
>> +static const char *rapl_gpu_power_unit(void)
>> +{
>> +char buf[32];
>> +
>> +if (filename_to_buf(RAPL_EVENT "energy-gpu.unit",
>> +    buf, sizeof(buf)) == 0)
>> +if (!strcmp(buf, "Joules"))
>> +return strdup("Watts");
>> +else
>> +return strdup(buf);
>> +else
>> +return NULL;
>> +}
>> +
>> +#define IMC_ROOT "/sys/devices/uncore_imc/"
>> +#define IMC_EVENT "/sys/devices/uncore_imc/events/"
>> +
>> +static uint64_t imc_type_id(void)
>> +{
>> +return filename_to_u64(IMC_ROOT "type", 10);
>> +}
>> +
>> +static uint64_t imc_data_reads(void)
>> +{
>> +return filename_to_u64(IMC_EVENT "data_reads", 0);
>> +}
>> +
>> +static double imc_data_reads_scale(void)
>> +{
>> +return filename_to_double(IMC_EVENT "data_reads.scale");
>> +}
>> +
>> +static const char *imc_data_reads_unit(void)
>> +{
>> +char buf[32];
>> +
>> +if (filename_to_buf(IMC_EVENT "data_reads.unit", buf, sizeof(buf)) == 0)
>> +return strdup(buf);
>> +else
>> +return NULL;
>> +}
>> +
>> +static uint64_t imc_data_writes(void)
>> +{
>> +return filename_to_u64(IMC_EVENT "data_writes", 0);
>> +}
>> +
>> +static double imc_data_writes_scale(void)
>> +{
>> +return filename_to_double(IMC_EVENT "data_writes.scale");
>> +}
>> +
>> +static const char *imc_data_writes_unit(void)
>> +{
>> +char buf[32];
>> +
>> +if (filename_to_buf(IMC_EVENT "data_writes.unit",
>> +    buf, sizeof(buf)) == 0)
>> +return strdup(buf);
>> +else
>> +return NULL;
>> +}
>> +
>> +#define _open_pmu(cnt, pmu, fd) \
>> +({ \
>> +int fd__; \
>> +\
>> +fd__ = perf_i915_open_group((pmu)->config, (fd)); \
>> +if (fd__ >= 0) { \
>> +if ((fd) == -1) \
>> +(fd) = fd__; \
>> +(pmu)->present = true; \
>> +(pmu)->idx = (cnt)++; \
>> +} \
>> +\
>> +fd__; \
>> +})
>> +
>> +#define _open_imc(cnt, pmu, fd) \
>> +({ \
>> +int fd__; \
>> +\
>> +fd__ = igt_perf_open_group(imc_type_id(), (pmu)->config, (fd)); \
>> +if (fd__ >= 0) { \
>> +if ((fd) == -1) \
>> +(fd) = fd__; \
>> +(pmu)->present = true; \
>> +(pmu)->idx = (cnt)++; \
>> +} \
>> +\
>> +fd__; \
>> +})
>> +
>> +static int pmu_init(struct engines *engines)
>> +{
>> +unsigned int i;
>> +int fd;
>> +
>> +engines->fd = -1;
>> +engines->num_counters = 0;
>> +
>> +engines->irq.config = I915_PMU_INTERRUPTS;
>> +fd = _open_pmu(engines->num_counters, &engines->irq, engines->fd);
>> +if (fd < 0)
>> +return -1;
>> +
>> +engines->freq_req.config = I915_PMU_REQUESTED_FREQUENCY;
>> +_open_pmu(engines->num_counters, &engines->freq_req, engines->fd);
>> +
>> +engines->freq_act.config = I915_PMU_ACTUAL_FREQUENCY;
>> +_open_pmu(engines->num_counters, &engines->freq_act, engines->fd);
>> +
>> +engines->rc6.config = I915_PMU_RC6_RESIDENCY;
>> +_open_pmu(engines->num_counters, &engines->rc6, engines->fd);
>> +
>> +for (i = 0; i < engines->num_engines; i++) {
>> +struct engine *engine = engine_ptr(engines, i);
>> +struct {
>> +struct pmu_counter *pmu;
>> +const char *counter;
>> +} *cnt, counters[] = {
>> +{ .pmu = &engine->busy, .counter = "busy" },
>> +{ .pmu = &engine->wait, .counter = "wait" },
>> +{ .pmu = &engine->sema, .counter = "sema" },
>> +{ .pmu = NULL, .counter = NULL },
>> +};
>> +
>> +for (cnt = counters; cnt->pmu; cnt++) {
>> +if (!cnt->pmu->config)
>> +cnt->pmu->config =
>> +get_pmu_config(dirfd(engines->root),
>> +       engine->name,
>> +       cnt->counter);
>> +fd = _open_pmu(engines->num_counters, cnt->pmu,
>> +       engines->fd);
>> +if (fd >= 0)
>> +engine->num_counters++;
>> +}
>>   }
>> -if (i) {
>> -printf("%s", bars[i]);
>> -cur_line_len++;
>> +
>> +engines->rapl_fd = -1;
>> +if (rapl_type_id()) {
>> +engines->rapl_scale = rapl_gpu_power_scale();
>> +engines->rapl_unit = rapl_gpu_power_unit();
>> +if (!engines->rapl_unit)
>> +return -1;
>> +
>> +engines->rapl.config = rapl_gpu_power();
>> +if (!engines->rapl.config)
>> +return -1;
>> +
>> +engines->rapl_fd = igt_perf_open(rapl_type_id(),
>> + engines->rapl.config);
>> +if (engines->rapl_fd < 0)
>> +return -1;
>> +
>> +engines->rapl.present = true;
>>   }
>> 
>> -/* NB: We can't use a field width with utf8 so we manually
>> -* guarantee a field with of 45 chars for any bar. */
>> -printf("%*s", PERCENTAGE_BAR_END - cur_line_len, "");
>> -}
>> +engines->imc_fd = -1;
>> +if (imc_type_id()) {
>> +unsigned int num = 0;
>> 
>> -struct ring {
>> -const char *name;
>> -uint32_t mmio;
>> -int head, tail, size;
>> -uint64_t full;
>> -int idle;
>> -};
>> +engines->imc_reads_scale = imc_data_reads_scale();
>> +engines->imc_writes_scale = imc_data_writes_scale();
>> +
>> +engines->imc_reads_unit = imc_data_reads_unit();
>> +if (!engines->imc_reads_unit)
>> +return -1;
>> +
>> +engines->imc_writes_unit = imc_data_writes_unit();
>> +if (!engines->imc_writes_unit)
>> +return -1;
>> +
>> +engines->imc_reads.config = imc_data_reads();
>> +if (!engines->imc_reads.config)
>> +return -1;
>> +
>> +engines->imc_writes.config = imc_data_writes();
>> +if (!engines->imc_writes.config)
>> +return -1;
>> +
>> +fd = _open_imc(num, &engines->imc_reads, engines->imc_fd);
>> +if (fd < 0)
>> +return -1;
>> +fd = _open_imc(num, &engines->imc_writes, engines->imc_fd);
>> +if (fd < 0)
>> +return -1;
>> +
>> +engines->imc_reads.present = true;
>> +engines->imc_writes.present = true;
>> +}
>> +
>> +return 0;
>> +}
>> 
>> -static uint32_t ring_read(struct ring *ring, uint32_t reg)
>> +static uint64_t pmu_read_multi(int fd, unsigned int num, uint64_t *val)
>>   {
>> -return INREG(ring->mmio + reg);
>> +uint64_t buf[2 + num];
>> +unsigned int i;
>> +ssize_t len;
>> +
>> +memset(buf, 0, sizeof(buf));
>> +
>> +len = read(fd, buf, sizeof(buf));
>> +assert(len == sizeof(buf));
>> +
>> +for (i = 0; i < num; i++)
>> +val[i] = buf[2 + i];
>> +
>> +return buf[1];
>>   }
>> 
>> -static void ring_init(struct ring *ring)
>> +static double __pmu_calc(struct pmu_pair *p, double d, double t, double s)
>>   {
>> -ring->size = (((ring_read(ring, RING_LEN) & RING_NR_PAGES) >> 12) + 1) * 4096;
>> +double v;
>> +
>> +v = p->cur - p->prev;
>> +v /= d;
>> +v /= t;
>> +v *= s;
>> +
>> +if (s == 100.0 && v > 100.0)
>> +v = 100.0;
>> +
>> +return v;
>>   }
>> 
>> -static void ring_reset(struct ring *ring)
>> +static void fill_str(char *buf, unsigned int bufsz, char c, unsigned int num)
>>   {
>> -ring->idle = ring->full = 0;
>> +unsigned int i;
>> +
>> +for (i = 0; i < num && i < (bufsz - 1); i++)
>> +*buf++ = c;
>> +
>> +*buf = 0;
>>   }
>> 
>> -static void ring_sample(struct ring *ring)
>> +static void pmu_calc(struct pmu_counter *cnt,
>> +     char *buf, unsigned int bufsz,
>> +     unsigned int width, unsigned width_dec,
>> +     double d, double t, double s)
>>   {
>> -int full;
>> +double val;
>> +int len;
>> +
>> +assert(bufsz >= (width + width_dec + 1));
>> +
>> +if (!cnt->present) {
>> +fill_str(buf, bufsz, '-', width + width_dec);
>> +return;
>> +}
>> 
>> -if (!ring->size)
>> +val = __pmu_calc(&cnt->val, d, t, s);
>> +
>> +len = snprintf(buf, bufsz, "%*.*f", width + width_dec, width_dec, val);
>> +if (len < 0 || len == bufsz) {
>> +fill_str(buf, bufsz, 'X', width + width_dec);
>>   return;
>> +}
>> +}
>> +
>> +static uint64_t __pmu_read_single(int fd, uint64_t *ts)
>> +{
>> +uint64_t data[2] = { };
>> +ssize_t len;
>> 
>> -ring->head = ring_read(ring, RING_HEAD) & HEAD_ADDR;
>> -ring->tail = ring_read(ring, RING_TAIL) & TAIL_ADDR;
>> +len = read(fd, data, sizeof(data));
>> +assert(len == sizeof(data));
>> 
>> -if (ring->tail == ring->head)
>> -ring->idle++;
>> +if (ts)
>> +*ts = data[1];
>> 
>> -full = ring->tail - ring->head;
>> -if (full < 0)
>> -full += ring->size;
>> -ring->full += full;
>> +return data[0];
>>   }
>> 
>> -static void ring_print_header(FILE *out, struct ring *ring)
>> +static uint64_t pmu_read_single(int fd)
>>   {
>> -    fprintf(out, "%.6s%%\tops\t",
>> -            ring->name
>> -          );
>> +return __pmu_read_single(fd, NULL);
>>   }
>> 
>> -static void ring_print(struct ring *ring, unsigned long samples_per_sec)
>> +static void __update_sample(struct pmu_counter *counter, uint64_t val)
>>   {
>> -int percent_busy, len;
>> +counter->val.prev = counter->val.cur;
>> +counter->val.cur = val;
>> +}
>> 
>> -if (!ring->size)
>> -return;
>> +static void update_sample(struct pmu_counter *counter, uint64_t *val)
>> +{
>> +if (counter->present)
>> +__update_sample(counter, val[counter->idx]);
>> +}
>> +
>> +static void pmu_sample(struct engines *engines)
>> +{
>> +const int num_val = engines->num_counters;
>> +uint64_t val[2 + num_val];
>> +unsigned int i;
>> +
>> +engines->ts.prev = engines->ts.cur;
>> +
>> +if (engines->rapl_fd >= 0)
>> +__update_sample(&engines->rapl,
>> +pmu_read_single(engines->rapl_fd));
>> +
>> +if (engines->imc_fd >= 0) {
>> +pmu_read_multi(engines->imc_fd, 2, val);
>> +update_sample(&engines->imc_reads, val);
>> +update_sample(&engines->imc_writes, val);
>> +}
>> 
>> -percent_busy = 100 - 100 * ring->idle / samples_per_sec;
>> +engines->ts.cur = pmu_read_multi(engines->fd, num_val, val);
>> 
>> -len = printf("%25s busy: %3d%%: ", ring->name, percent_busy);
>> -print_percentage_bar (percent_busy, len);
>> -printf("%24s space: %d/%d\n",
>> -   ring->name,
>> -   (int)(ring->full / samples_per_sec),
>> -   ring->size);
>> +update_sample(&engines->freq_req, val);
>> +update_sample(&engines->freq_act, val);
>> +update_sample(&engines->irq, val);
>> +update_sample(&engines->rc6, val);
>> +
>> +for (i = 0; i < engines->num_engines; i++) {
>> +struct engine *engine = engine_ptr(engines, i);
>> +
>> +update_sample(&engine->busy, val);
>> +update_sample(&engine->sema, val);
>> +update_sample(&engine->wait, val);
>> +}
>>   }
>> 
>> -static void ring_log(struct ring *ring, unsigned long samples_per_sec,
>> -FILE *output)
>> +static const char *bars[] = { " ", "▏", "▎", "▍", "▌", "▋", "▊", "▉", "█" };
>> +
>> +static void
>> +print_percentage_bar(double percent, int max_len)
>>   {
>> -if (ring->size)
>> -fprintf(output, "%3d\t%d\t",
>> -(int)(100 - 100 * ring->idle / samples_per_sec),
>> -(int)(ring->full / samples_per_sec));
>> -else
>> -fprintf(output, "-1\t-1\t");
>> +int bar_len = percent * (8 * (max_len - 2)) / 100.0;
>> +int i;
>> +
>> +putchar('|');
>> +
>> +for (i = bar_len; i >= 8; i -= 8)
>> +printf("%s", bars[8]);
>> +if (i)
>> +printf("%s", bars[i]);
>> +
>> +for (i = 0; i < (max_len - 2 - (bar_len + 7) / 8); i++)
>> +putchar(' ');
>> +
>> +putchar('|');
>>   }
>> 
>> +#define DEFAULT_PERIOD_MS (1000)
>> +
>>   static void
>>   usage(const char *appname)
>>   {
>>   printf("intel_gpu_top - Display a top-like summary of Intel GPU usage\n"
>> -"\n"
>> -"usage: %s [parameters]\n"
>> -"\n"
>> -"The following parameters apply:\n"
>> -"[-s <samples>]       samples per seconds (default %d)\n"
>> -"[-e <command>]       command to profile\n"
>> -"[-o <file>]          output statistics to file. If file is '-',"
>> -"                     run in batch mode and output statistics to stdio only \n"
>> -"[-h]                 show this help screen\n"
>> -"\n",
>> -appname,
>> -SAMPLES_PER_SEC
>> -  );
>> -return;
>> +"\n"
>> +"Usage: %s [parameters]\n"
>> +"\n"
>> +"\tThe following parameters are optional:\n\n"
>> +"\t[-s <ms>]       Refresh period in milliseconds (default %ums).\n"
>> +"\t[-h]            Show this help text.\n"
>> +"\n",
>> +appname, DEFAULT_PERIOD_MS);
>>   }
>> 
>>   int main(int argc, char **argv)
>>   {
>> -uint32_t devid;
>> -struct pci_device *pci_dev;
>> -struct ring render_ring = {
>> -.name = "render",
>> -.mmio = 0x2030,
>> -}, bsd_ring = {
>> -.name = "bitstream",
>> -.mmio = 0x4030,
>> -}, bsd6_ring = {
>> -.name = "bitstream",
>> -.mmio = 0x12030,
>> -}, blt_ring = {
>> -.name = "blitter",
>> -.mmio = 0x22030,
>> -};
>> -int i, ch;
>> -int samples_per_sec = SAMPLES_PER_SEC;
>> -FILE *output = NULL;
>> -double elapsed_time=0;
>> -int print_headers=1;
>> -pid_t child_pid=-1;
>> -int child_stat;
>> -char *cmd=NULL;
>> -int interactive=1;
>> -
>> -/* Parse options? */
>> -while ((ch = getopt(argc, argv, "s:o:e:h")) != -1) {
>> +unsigned int period_us = DEFAULT_PERIOD_MS * 1000;
>> +int con_w = -1, con_h = -1;
>> +struct engines *engines;
>> +unsigned int i;
>> +int ret, ch;
>> +
>> +/* Parse options */
>> +while ((ch = getopt(argc, argv, "s:h")) != -1) {
>>   switch (ch) {
>> -case 'e': cmd = strdup(optarg);
>> -break;
>> -case 's': samples_per_sec = atoi(optarg);
>> -if (samples_per_sec < 100) {
>> -fprintf(stderr, "Error: samples per second must be >= 100\n");
>> -exit(1);
>> -}
>> -break;
>> -case 'o':
>> -if (!strcmp(optarg, "-")) {
>> -/* Running in non-interactive mode */
>> -interactive = 0;
>> -output = stdout;
>> -}
>> -else
>> -output = fopen(optarg, "w");
>> -if (!output)
>> -{
>> -perror("fopen");
>> -exit(1);
>> -}
>> +case 's':
>> +period_us = atoi(optarg) * 1000;
>>   break;
>>   case 'h':
>>   usage(argv[0]);
>>   exit(0);
>> -break;
>>   default:
>> -fprintf(stderr, "Invalid flag %c!\n", (char)optopt);
>> +fprintf(stderr, "Invalid option %c!\n", (char)optopt);
>>   usage(argv[0]);
>>   exit(1);
>> -break;
>>   }
>>   }
>> 
>> -pci_dev = intel_get_pci_device();
>> -devid = pci_dev->device_id;
>> -intel_mmio_use_pci_bar(pci_dev);
>> -init_instdone_definitions(devid);
>> -
>> -/* Do we have a command to run? */
>> -if (cmd != NULL) {
>> -if (output) {
>> -fprintf(output, "# Profiling: %s\n", cmd);
>> -fflush(output);
>> -}
>> -child_pid = fork();
>> -if (child_pid < 0) {
>> -perror("fork");
>> -exit(1);
>> -}
>> -else if (child_pid == 0) {
>> -int res;
>> -res = system(cmd);
>> -if (res < 0)
>> -perror("running command");
>> -if (output) {
>> -fflush(output);
>> -fprintf(output, "# %s exited with status %d\n", cmd, res);
>> -fflush(output);
>> -}
>> -free(cmd);
>> -exit(0);
>> -} else {
>> -free(cmd);
>> -}
>> +engines = discover_engines();
>> +if (!engines) {
>> +fprintf(stderr,
>> +"Failed to detect engines! (%s)\n(Kernel 4.16 or newer is required for i915 PMU support.)\n",
>> +strerror(errno));
>> +return 1;
>>   }
>> 
>> -for (i = 0; i < num_instdone_bits; i++) {
>> -top_bits[i].bit = &instdone_bits[i];
>> -top_bits[i].count = 0;
>> -top_bits_sorted[i] = &top_bits[i];
>> +ret = pmu_init(engines);
>> +if (ret) {
>> +fprintf(stderr,
>> +"Failed to initialize PMU! (%s)\n", strerror(errno));
>> +return 1;
>>   }
>> 
>> -/* Grab access to the registers */
>> -intel_register_access_init(pci_dev, 0, -1);
>> +pmu_sample(engines);
>> 
>> -ring_init(&render_ring);
>> -if (IS_GEN4(devid) || IS_GEN5(devid))
>> -ring_init(&bsd_ring);
>> -if (IS_GEN6(devid) || IS_GEN7(devid)) {
>> -ring_init(&bsd6_ring);
>> -ring_init(&blt_ring);
>> -}
>> +for (;;) {
>> +double t;
>> +#define BUFSZ 16
>> +char freq[BUFSZ];
>> +char fact[BUFSZ];
>> +char irq[BUFSZ];
>> +char rc6[BUFSZ];
>> +char power[BUFSZ];
>> +char reads[BUFSZ];
>> +char writes[BUFSZ];
>> +struct winsize ws;
>> +int lines = 0;
>> 
>> -/* Initialize GPU stats */
>> -if (HAS_STATS_REGS(devid)) {
>> -for (i = 0; i < STATS_COUNT; i++) {
>> -uint32_t stats_high, stats_low, stats_high_2;
>> +/* Update terminal size. */
>> +if (ioctl(0, TIOCGWINSZ, &ws) != -1) {
>> +con_w = ws.ws_col;
>> +con_h = ws.ws_row;
>> +}
>> 
>> -do {
>> -stats_high = INREG(stats_regs[i] + 4);
>> -stats_low = INREG(stats_regs[i]);
>> -stats_high_2 = INREG(stats_regs[i] + 4);
>> -} while (stats_high != stats_high_2);
>> +pmu_sample(engines);
>> +t = (double)(engines->ts.cur - engines->ts.prev) / 1e9;
>> 
>> -last_stats[i] = (uint64_t)stats_high << 32 |
>> -stats_low;
>> -}
>> -}
>> +printf("\033[H\033[J");
>> 
>> -for (;;) {
>> -int j;
>> -unsigned long long t1, ti, tf, t2;
>> -unsigned long long def_sleep = 1000000 / samples_per_sec;
>> -unsigned long long last_samples_per_sec = samples_per_sec;
>> -unsigned short int max_lines;
>> -struct winsize ws;
>> -char clear_screen[] = {0x1b, '[', 'H',
>> -       0x1b, '[', 'J',
>> -       0x0};
>> -int percent;
>> -int len;
>> -
>> -t1 = gettime();
>> -
>> -ring_reset(&render_ring);
>> -ring_reset(&bsd_ring);
>> -ring_reset(&bsd6_ring);
>> -ring_reset(&blt_ring);
>> -
>> -for (i = 0; i < samples_per_sec; i++) {
>> -long long interval;
>> -ti = gettime();
>> -if (IS_965(devid)) {
>> -instdone = INREG(INSTDONE_I965);
>> -instdone1 = INREG(INSTDONE_1);
>> -} else
>> -instdone = INREG(INSTDONE);
>> -
>> -for (j = 0; j < num_instdone_bits; j++)
>> -update_idle_bit(&top_bits[j]);
>> -
>> -ring_sample(&render_ring);
>> -ring_sample(&bsd_ring);
>> -ring_sample(&bsd6_ring);
>> -ring_sample(&blt_ring);
>> -
>> -tf = gettime();
>> -if (tf - t1 >= 1000000) {
>> -/* We are out of sync, bail out */
>> -last_samples_per_sec = i+1;
>> -break;
>> -}
>> -interval = def_sleep - (tf - ti);
>> -if (interval > 0)
>> -usleep(interval);
>> -}
>> +pmu_calc(&engines->freq_req, freq, BUFSZ, 4, 0, 1.0, t, 1);
>> +pmu_calc(&engines->freq_act, fact, BUFSZ, 4, 0, 1.0, t, 1);
>> +pmu_calc(&engines->irq, irq, BUFSZ, 8, 0, 1.0, t, 1);
>> +pmu_calc(&engines->rc6, rc6, BUFSZ, 3, 0, 1e9, t, 100);
>> +pmu_calc(&engines->rapl, power, BUFSZ, 4, 2, 1.0, t,
>> + engines->rapl_scale);
>> +pmu_calc(&engines->imc_reads, reads, BUFSZ, 6, 0, 1.0, t,
>> + engines->imc_reads_scale);
>> +pmu_calc(&engines->imc_writes, writes, BUFSZ, 6, 0, 1.0, t,
>> + engines->imc_writes_scale);
>> 
>> -if (HAS_STATS_REGS(devid)) {
>> -for (i = 0; i < STATS_COUNT; i++) {
>> -uint32_t stats_high, stats_low, stats_high_2;
>> +if (lines++ < con_h)
>> +printf("intel-gpu-top - %s/%s MHz;  %s%% RC6; %s %s; %s irqs/s\n",
>> +       fact, freq, rc6, power, engines->rapl_unit, irq);
>> 
>> -do {
>> -stats_high = INREG(stats_regs[i] + 4);
>> -stats_low = INREG(stats_regs[i]);
>> -stats_high_2 = INREG(stats_regs[i] + 4);
>> -} while (stats_high != stats_high_2);
>> +if (lines++ < con_h)
>> +printf("\n");
>> 
>> -stats[i] = (uint64_t)stats_high << 32 |
>> -stats_low;
>> -}
>> -}
>> +if (engines->imc_fd) {
>> +if (lines++ < con_h)
>> +printf("      IMC reads:   %s %s/s\n",
>> +       reads, engines->imc_reads_unit);
>> +
>> +if (lines++ < con_h)
>> +printf("     IMC writes:   %s %s/s\n",
>> +       writes, engines->imc_writes_unit);
>> 
>> -qsort(top_bits_sorted, num_instdone_bits,
>> -      sizeof(struct top_bit *), top_bits_sort);
>> -
>> -/* Limit the number of lines printed to the terminal height so the
>> - * most important info (at the top) will stay on screen. */
>> -max_lines = -1;
>> -if (ioctl(0, TIOCGWINSZ, &ws) != -1)
>> -max_lines = ws.ws_row - 6; /* exclude header lines */
>> -if (max_lines >= num_instdone_bits)
>> -max_lines = num_instdone_bits;
>> -
>> -t2 = gettime();
>> -elapsed_time += (t2 - t1) / 1000000.0;
>> -
>> -if (interactive) {
>> -printf("%s", clear_screen);
>> -print_clock_info(pci_dev);
>> -
>> -ring_print(&render_ring, last_samples_per_sec);
>> -ring_print(&bsd_ring, last_samples_per_sec);
>> -ring_print(&bsd6_ring, last_samples_per_sec);
>> -ring_print(&blt_ring, last_samples_per_sec);
>> -
>> -printf("\n%30s  %s\n", "task", "percent busy");
>> -for (i = 0; i < max_lines; i++) {
>> -if (top_bits_sorted[i]->count > 0) {
>> -percent = (top_bits_sorted[i]->count * 100) /
>> -last_samples_per_sec;
>> -len = printf("%30s: %3d%%: ",
>> - top_bits_sorted[i]->bit->name,
>> - percent);
>> -print_percentage_bar (percent, len);
>> -} else {
>> -printf("%*s", PERCENTAGE_BAR_END, "");
>> -}
>> -
>> -if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
>> -printf("%13s: %llu (%lld/sec)",
>> -   stats_reg_names[i],
>> -   (long long)stats[i],
>> -   (long long)(stats[i] - last_stats[i]));
>> -last_stats[i] = stats[i];
>> -} else {
>> -if (!top_bits_sorted[i]->count)
>> -break;
>> -}
>> +if (++lines < con_h)
>>   printf("\n");
>> -}
>>   }
>> -if (output) {
>> -/* Print headers for columns at first run */
>> -if (print_headers) {
>> -fprintf(output, "# time\t");
>> -ring_print_header(output, &render_ring);
>> -ring_print_header(output, &bsd_ring);
>> -ring_print_header(output, &bsd6_ring);
>> -ring_print_header(output, &blt_ring);
>> -for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
>> -if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
>> -fprintf(output, "%.6s\t",
>> -   stats_reg_names[i]
>> -   );
>> -}
>> -if (!top_bits[i].count)
>> -continue;
>> -}
>> -fprintf(output, "\n");
>> -print_headers = 0;
>> -}
>> 
>> -/* Print statistics */
>> -fprintf(output, "%.2f\t", elapsed_time);
>> -ring_log(&render_ring, last_samples_per_sec, output);
>> -ring_log(&bsd_ring, last_samples_per_sec, output);
>> -ring_log(&bsd6_ring, last_samples_per_sec, output);
>> -ring_log(&blt_ring, last_samples_per_sec, output);
>> -
>> -for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
>> -if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
>> -fprintf(output, "%"PRIu64"\t",
>> -   stats[i] - last_stats[i]);
>> -last_stats[i] = stats[i];
>> -}
>> -if (!top_bits[i].count)
>> -continue;
>> -}
>> -fprintf(output, "\n");
>> -fflush(output);
>> -}
>> +for (i = 0; i < engines->num_engines; i++) {
>> +struct engine *engine = engine_ptr(engines, i);
>> 
>> -for (i = 0; i < num_instdone_bits; i++) {
>> -top_bits_sorted[i]->count = 0;
>> +if (engine->num_counters && lines < con_h) {
>> +const char *a = "          ENGINE      BUSY ";
>> +const char *b = " MI_SEMA MI_WAIT";
>> 
>> -if (i < STATS_COUNT)
>> -last_stats[i] = stats[i];
>> +printf("\033[7m%s%*s%s\033[0m\n",
>> +       a,
>> +       (int)(con_w - 1 - strlen(a) - strlen(b)),
>> +       " ", b);
>> +lines++;
>> +break;
>> +}
>>   }
>> 
>> -/* Check if child has gone */
>> -if (child_pid > 0) {
>> -int res;
>> -if ((res = waitpid(child_pid, &child_stat, WNOHANG)) == -1) {
>> -perror("waitpid");
>> -exit(1);
>> -}
>> -if (res == 0)
>> +for (i = 0; i < engines->num_engines && lines < con_h; i++) {
>> +struct engine *engine = engine_ptr(engines, i);
>> +unsigned int max_w = con_w - 1;
>> +unsigned int len;
>> +char sema[BUFSZ];
>> +char wait[BUFSZ];
>> +char busy[BUFSZ];
>> +char buf[128];
>> +double val;
>> +
>> +if (!engine->num_counters)
>>   continue;
>> -if (WIFEXITED(child_stat))
>> -break;
>> +
>> +pmu_calc(&engine->sema, sema, BUFSZ, 3, 0, 1e9, t, 100);
>> +pmu_calc(&engine->wait, wait, BUFSZ, 3, 0, 1e9, t, 100);
>> +len = snprintf(buf, sizeof(buf), "    %s%%    %s%%",
>> +       sema, wait);
>> +
>> +pmu_calc(&engine->busy, busy, BUFSZ, 6, 2, 1e9, t,
>> + 100);
>> +len += printf("%16s %s%% ", engine->display_name, busy);
>> +
>> +val = __pmu_calc(&engine->busy.val, 1e9, t, 100);
>> +print_percentage_bar(val, max_w - len);
>> +
>> +printf("%s\n", buf);
>> +
>> +lines++;
>>   }
>> -}
>> 
>> -fclose(output);
>> +if (lines++ < con_h)
>> +printf("\n");
>> +
>> +usleep(period_us);
>> +}
>> 
>> -intel_register_access_fini();
>>   return 0;
>>   }
>> diff --git a/tools/meson.build b/tools/meson.build
>> index bd2d313d5156..a918eeb0bef1 100644
>> --- a/tools/meson.build
>> +++ b/tools/meson.build
>> @@ -23,7 +23,6 @@ tools_progs = [
>>   'intel_gpu_frequency',
>>   'intel_firmware_decode',
>>   'intel_gpu_time',
>> -'intel_gpu_top',
>>   'intel_gtt',
>>   'intel_guc_logger',
>>   'intel_infoframes',
>> @@ -117,6 +116,11 @@ shared_library('intel_aubdump', 'aubdump.c',
>>          name_prefix : '',
>>          install : true)
>> 
>> +executable('intel_gpu_top', 'intel_gpu_top.c',
>> +   install : true,
>> +   install_rpath : rpathdir,
>> +   dependencies : tool_deps + [ lib_igt_perf ])
>> +
>>   conf_data = configuration_data()
>>   conf_data.set('prefix', prefix)
>>   conf_data.set('exec_prefix', '${prefix}')
>> 
>_______________________________________________
>igt-dev mailing list
>igt-dev@lists.freedesktop.org
>https://lists.freedesktop.org/mailman/listinfo/igt-dev


---
Rinat
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [igt-dev] [PATCH i-g-t v6] intel-gpu-top: Rewrite the tool to be safe to use
  2018-04-04 15:26     ` [igt-dev] " Tvrtko Ursulin
@ 2018-05-29  9:58       ` Matthew Auld
  -1 siblings, 0 replies; 57+ messages in thread
From: Matthew Auld @ 2018-05-29  9:58 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev, Intel Graphics Development

On 4 April 2018 at 16:26, Tvrtko Ursulin <tursulin@ursulin.net> wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>
> intel-gpu-top is a dangerous tool which can hang machines due to unsafe mmio
> register access. This patch rewrites it to use only PMU.
>
> Only overall command streamer busyness and GPU global data such as power
> and frequencies are included in this new version.
>
> For access to more GPU functional unit level data, an OA metric based tool
> like gpu-top should be used instead.
>
> v2:
>  * Sort engines by class and instance.
>  * Do not wait for one sampling period to display something on screen.
>  * Move code out of the asserts. (Rinat Ibragimov)
>  * Continuously adapt to terminal size. (Rinat Ibragimov)
>
> v3:
>  * Change layout and precision of some fields. (Chris Wilson)
>  Eero Tamminen:
>  * Use more user friendly engine names.
>  * Don't error out if a counter is missing.
>  * Add IMC read/write bandwidth.
>  * Report minimum required kernel version.
>
> v4:
>  * Really support 4.16 by skipping missing engines.
>  * Simpler and less hacky float printing.
>  * Preserve copyright header. (Antonio Argenziano)
>  * Simplify engines_ptr macro. (Rinat Ibragimov)
>
> v5:
>  * Get RAPL unit from sysfs.
>  * Consolidate sysfs paths with a macro.
>  * Tidy error handling by carrying over and reporting errno.
>  * Check against console height on all prints.
>  * More readable minimum kernel version message. (Eero Tamminen)
>  * Column banner for per engine stats. (Eero Tamminen)
>
> v6:
>  * Man page update. (Eero Tamminen)
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
> Cc: Petri Latvala <petri.latvala@intel.com>
> Cc: Eero Tamminen <eero.t.tamminen@intel.com>
> Cc: Rinat Ibragimov <ibragimovrinat@mail.ru>
> Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> # v1
> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> # v0.5
> ---
>  lib/igt_perf.c        |    6 +
>  lib/igt_perf.h        |    1 +
>  man/intel_gpu_top.rst |   41 +-
>  tools/Makefile.am     |    2 +
>  tools/intel_gpu_top.c | 1250 +++++++++++++++++++++++++++----------------------
>  tools/meson.build     |    6 +-
>  6 files changed, 719 insertions(+), 587 deletions(-)
>
> diff --git a/lib/igt_perf.c b/lib/igt_perf.c
> index 99d82ea51c9b..e3dec2cc29c7 100644
> --- a/lib/igt_perf.c
> +++ b/lib/igt_perf.c
> @@ -69,3 +69,9 @@ int igt_perf_open(uint64_t type, uint64_t config)
>         return _perf_open(type, config, -1,
>                           PERF_FORMAT_TOTAL_TIME_ENABLED);
>  }
> +
> +int igt_perf_open_group(uint64_t type, uint64_t config, int group)
> +{
> +       return _perf_open(type, config, group,
> +                         PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_GROUP);
> +}
> diff --git a/lib/igt_perf.h b/lib/igt_perf.h
> index 614ea5d23fa6..e00718f4769a 100644
> --- a/lib/igt_perf.h
> +++ b/lib/igt_perf.h
> @@ -55,5 +55,6 @@ uint64_t i915_type_id(void);
>  int perf_i915_open(uint64_t config);
>  int perf_i915_open_group(uint64_t config, int group);
>  int igt_perf_open(uint64_t type, uint64_t config);
> +int igt_perf_open_group(uint64_t type, uint64_t config, int group);
>
>  #endif /* I915_PERF_H */
> diff --git a/man/intel_gpu_top.rst b/man/intel_gpu_top.rst
> index a5f7175bb1a0..19c712307d28 100644
> --- a/man/intel_gpu_top.rst
> +++ b/man/intel_gpu_top.rst
> @@ -7,9 +7,9 @@ Display a top-like summary of Intel GPU usage
>  ---------------------------------------------
>  .. include:: defs.rst
>  :Author: IGT Developers <igt-dev@lists.freedesktop.org>
> -:Date: 2016-03-01
> +:Date: 2018-04-04
>  :Version: |PACKAGE_STRING|
> -:Copyright: 2009,2011,2012,2016 Intel Corporation
> +:Copyright: 2009,2011,2012,2016,2018 Intel Corporation
>  :Manual section: |MANUAL_SECTION|
>  :Manual group: |MANUAL_GROUP|
>
> @@ -21,42 +21,25 @@ SYNOPSIS
>  DESCRIPTION
>  ===========
>
> -**intel_gpu_top** is a tool to display usage information of an Intel GPU. It
> -requires root privilege to map the graphics device.
> +**intel_gpu_top** is a tool to display usage information on Intel GPU's.
> +
> +The tool gathers data using perf performance counters (PMU) exposed by i915 and other platform drivers like RAPL (power) and Uncore IMC (memory bandwidth).
>
>  OPTIONS
>  =======
>
> --s SAMPLES
> -    Number of samples to acquire per second.
> -
> --o FILE
> -    Collect usage statistics to FILE. If file is "-", run non-interactively
> -    and output statistics to stdout.
> -
> --e COMMAND
> -    Execute COMMAND to profile, and leave when it is finished. Note that the
> -    entire command with all parameters should be included as one parameter.
> +-s <ms>
> +    Refresh period in milliseconds.
>
>  -h
> -    Show usage notes.
> +    Show help text.
>
> -EXAMPLES
> -========
> -
> -intel_gpu_top -o "cairo-trace-gvim.log" -s 100 -e "cairo-perf-trace /tmp/gvim"
> -    Run cairo-perf-trace with /tmp/gvim trace, non-interactively, saving the
> -    statistics into cairo-trace-gvim.log file, and collecting 100 samples per
> -    second.
> -
> -Note that idle units are not displayed, so an entirely idle GPU will only
> -display the ring status and header.
> +LIMITATIONS
> +===========
>
> -BUGS
> -====
> +* Not all metrics are supported on all platforms. Where a metric is unsupported it's value will be replaced by a dashed line.

s/it's/its/

>
> -Some GPUs report some units as busy when they aren't, such that even when idle
> -and not hung, it will show up as 100% busy.
> +* Non-root access to perf counters is controlled by the *perf_event_paranoid* sysctl.
>
>  REPORTING BUGS
>  ==============
> diff --git a/tools/Makefile.am b/tools/Makefile.am
> index 09b6dbcc3ece..a0b016ddd7ff 100644
> --- a/tools/Makefile.am
> +++ b/tools/Makefile.am
> @@ -28,6 +28,8 @@ intel_aubdump_la_LDFLAGS = -module -avoid-version -no-undefined
>  intel_aubdump_la_SOURCES = aubdump.c
>  intel_aubdump_la_LIBADD = $(top_builddir)/lib/libintel_tools.la -ldl
>
> +intel_gpu_top_LDADD = $(top_builddir)/lib/libigt_perf.la
> +
>  bin_SCRIPTS = intel_aubdump
>  CLEANFILES = $(bin_SCRIPTS)
>
> diff --git a/tools/intel_gpu_top.c b/tools/intel_gpu_top.c
> index 098e6ce3ff86..b923c3cfbe97 100644
> --- a/tools/intel_gpu_top.c
> +++ b/tools/intel_gpu_top.c
> @@ -1,6 +1,5 @@
>  /*
> - * Copyright © 2007 Intel Corporation
> - * Copyright © 2011 Intel Corporation
> + * Copyright © 2007-2018 Intel Corporation
>   *
>   * Permission is hereby granted, free of charge, to any person obtaining a
>   * copy of this software and associated documentation files (the "Software"),
> @@ -24,695 +23,832 @@
>   * Authors:
>   *    Eric Anholt <eric@anholt.net>
>   *    Eugeni Dodonov <eugeni.dodonov@intel.com>
> - *
>   */
>
> -#include "config.h"
> -
> -#include <inttypes.h>
> -#include <unistd.h>
> -#include <stdlib.h>
>  #include <stdio.h>
> -#include <err.h>
> -#include <sys/ioctl.h>
> -#include <sys/time.h>
> -#include <sys/wait.h>
> +#include <sys/types.h>
> +#include <dirent.h>
> +#include <stdint.h>
> +#include <assert.h>
>  #include <string.h>
> -#ifdef HAVE_TERMIOS_H
> -#include <termios.h>
> -#endif
> -#include "intel_io.h"
> -#include "instdone.h"
> -#include "intel_reg.h"
> -#include "intel_chipset.h"
> -#include "drmtest.h"
> -
> -#define  FORCEWAKE         0xA18C
> -#define  FORCEWAKE_ACK     0x130090
> -
> -#define SAMPLES_PER_SEC             10000
> -#define SAMPLES_TO_PERCENT_RATIO    (SAMPLES_PER_SEC / 100)
> -
> -#define MAX_NUM_TOP_BITS            100
> -
> -#define HAS_STATS_REGS(devid)          IS_965(devid)
> -
> -struct top_bit {
> -       struct instdone_bit *bit;
> -       int count;
> -} top_bits[MAX_NUM_TOP_BITS];
> -struct top_bit *top_bits_sorted[MAX_NUM_TOP_BITS];
> -
> -static uint32_t instdone, instdone1;
> -
> -static const char *bars[] = {
> -       " ",
> -       "▏",
> -       "▎",
> -       "▍",
> -       "▌",
> -       "▋",
> -       "▊",
> -       "▉",
> -       "█"
> -};
> +#include <ctype.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <sys/stat.h>
> +#include <fcntl.h>
> +#include <inttypes.h>
> +#include <sys/ioctl.h>
> +#include <errno.h>
> +#include <math.h>
> +#include <locale.h>
> +
> +#include "igt_perf.h"
>
> -enum stats_counts {
> -       IA_VERTICES,
> -       IA_PRIMITIVES,
> -       VS_INVOCATION,
> -       GS_INVOCATION,
> -       GS_PRIMITIVES,
> -       CL_INVOCATION,
> -       CL_PRIMITIVES,
> -       PS_INVOCATION,
> -       PS_DEPTH,
> -       STATS_COUNT
> +struct pmu_pair {
> +       uint64_t cur;
> +       uint64_t prev;
>  };
>
> -const uint32_t stats_regs[STATS_COUNT] = {
> -       IA_VERTICES_COUNT_QW,
> -       IA_PRIMITIVES_COUNT_QW,
> -       VS_INVOCATION_COUNT_QW,
> -       GS_INVOCATION_COUNT_QW,
> -       GS_PRIMITIVES_COUNT_QW,
> -       CL_INVOCATION_COUNT_QW,
> -       CL_PRIMITIVES_COUNT_QW,
> -       PS_INVOCATION_COUNT_QW,
> -       PS_DEPTH_COUNT_QW,
> +struct pmu_counter {
> +       bool present;
> +       uint64_t config;
> +       unsigned int idx;
> +       struct pmu_pair val;
>  };
>
> -const char *stats_reg_names[STATS_COUNT] = {
> -       "vert fetch",
> -       "prim fetch",
> -       "VS invocations",
> -       "GS invocations",
> -       "GS prims",
> -       "CL invocations",
> -       "CL prims",
> -       "PS invocations",
> -       "PS depth pass",
> +struct engine {
> +       const char *name;
> +       const char *display_name;
> +
> +       unsigned int class;
> +       unsigned int instance;
> +
> +       unsigned int num_counters;
> +
> +       struct pmu_counter busy;
> +       struct pmu_counter wait;
> +       struct pmu_counter sema;
>  };
>
> -uint64_t stats[STATS_COUNT];
> -uint64_t last_stats[STATS_COUNT];
> +struct engines {
> +       unsigned int num_engines;
> +       unsigned int num_counters;
> +       DIR *root;
> +       int fd;
> +       struct pmu_pair ts;
> +
> +       int rapl_fd;
> +       double rapl_scale;
> +       const char *rapl_unit;
> +
> +       int imc_fd;
> +       double imc_reads_scale;
> +       const char *imc_reads_unit;
> +       double imc_writes_scale;
> +       const char *imc_writes_unit;
> +
> +       struct pmu_counter freq_req;
> +       struct pmu_counter freq_act;
> +       struct pmu_counter irq;
> +       struct pmu_counter rc6;
> +       struct pmu_counter rapl;
> +       struct pmu_counter imc_reads;
> +       struct pmu_counter imc_writes;
> +
> +       struct engine engine;
> +};
>
> -static unsigned long
> -gettime(void)
> +static uint64_t
> +get_pmu_config(int dirfd, const char *name, const char *counter)
>  {
> -    struct timeval t;
> -    gettimeofday(&t, NULL);
> -    return (t.tv_usec + (t.tv_sec * 1000000));
> -}
> +       char buf[128], *p;
> +       int fd, ret;
>
> -static int
> -top_bits_sort(const void *a, const void *b)
> -{
> -       struct top_bit * const *bit_a = a;
> -       struct top_bit * const *bit_b = b;
> -       int a_count = (*bit_a)->count;
> -       int b_count = (*bit_b)->count;
> +       ret = snprintf(buf, sizeof(buf), "%s-%s", name, counter);
> +       if (ret < 0 || ret == sizeof(buf))
> +               return -1;
>
> -       if (a_count < b_count)
> -               return 1;
> -       else if (a_count == b_count)
> -               return 0;
> -       else
> +       fd = openat(dirfd, buf, O_RDONLY);
> +       if (fd < 0)
>                 return -1;
> -}
>
> -static void
> -update_idle_bit(struct top_bit *top_bit)
> -{
> -       uint32_t reg_val;
> +       ret = read(fd, buf, sizeof(buf));
> +       close(fd);
> +       if (ret <= 0)
> +               return -1;
>
> -       if (top_bit->bit->reg == INSTDONE_1)
> -               reg_val = instdone1;
> -       else
> -               reg_val = instdone;
> +       p = index(buf, '0');
> +       if (!p)
> +               return -1;
>
> -       if ((reg_val & top_bit->bit->bit) == 0)
> -               top_bit->count++;
> +       return strtoul(p, NULL, 0);
>  }
>
> -static void
> -print_clock(const char *name, int clock) {
> -       if (clock == -1)
> -               printf("%s clock: unknown", name);
> +#define engine_ptr(engines, n) (&engines->engine + (n))
> +
> +static const char *class_display_name(unsigned int class)
> +{
> +       switch (class) {
> +       case I915_ENGINE_CLASS_RENDER:
> +               return "Render/3D";
> +       case I915_ENGINE_CLASS_COPY:
> +               return "Blitter";
> +       case I915_ENGINE_CLASS_VIDEO:
> +               return "Video";
> +       case I915_ENGINE_CLASS_VIDEO_ENHANCE:
> +               return "VideoEnhance";
> +       default:
> +               return "[unknown]";
> +       }
> +}
> +
> +static int engine_cmp(const void *__a, const void *__b)
> +{
> +       const struct engine *a = (struct engine *)__a;
> +       const struct engine *b = (struct engine *)__b;
> +
> +       if (a->class != b->class)
> +               return a->class - b->class;
>         else
> -               printf("%s clock: %d Mhz", name, clock);
> +               return a->instance - b->instance;
>  }
>
> -static int
> -print_clock_info(struct pci_device *pci_dev)
> +static struct engines *discover_engines(void)
>  {
> -       uint32_t devid = pci_dev->device_id;
> -       uint16_t gcfgc;
> +       const char *sysfs_root = "/sys/devices/i915/events";
> +       struct engines *engines;
> +       struct dirent *dent;
> +       int ret = 0;
> +       DIR *d;
>
> -       if (IS_GM45(devid)) {
> -               int core_clock = -1;
> +       engines = malloc(sizeof(struct engines));
> +       if (!engines)
> +               return NULL;
>
> -               pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +       memset(engines, 0, sizeof(*engines));
>
> -               switch (gcfgc & 0xf) {
> -               case 8:
> -                       core_clock = 266;
> -                       break;
> -               case 9:
> -                       core_clock = 320;
> -                       break;
> -               case 11:
> -                       core_clock = 400;
> -                       break;
> -               case 13:
> -                       core_clock = 533;
> -                       break;
> -               }
> -               print_clock("core", core_clock);
> -       } else if (IS_965(devid) && IS_MOBILE(devid)) {
> -               int render_clock = -1, sampler_clock = -1;
> +       engines->num_engines = 0;
>
> -               pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +       d = opendir(sysfs_root);
> +       if (!d)
> +               return NULL;
>
> -               switch (gcfgc & 0xf) {
> -               case 2:
> -                       render_clock = 250; sampler_clock = 267;
> -                       break;
> -               case 3:
> -                       render_clock = 320; sampler_clock = 333;
> -                       break;
> -               case 4:
> -                       render_clock = 400; sampler_clock = 444;
> -                       break;
> -               case 5:
> -                       render_clock = 500; sampler_clock = 533;
> +       while ((dent = readdir(d)) != NULL) {
> +               const char *endswith = "-busy";
> +               const unsigned int endlen = strlen(endswith);
> +               struct engine *engine =
> +                               engine_ptr(engines, engines->num_engines);
> +               char buf[256];
> +
> +               if (dent->d_type != DT_REG)
> +                       continue;
> +
> +               if (strlen(dent->d_name) >= sizeof(buf)) {
> +                       ret = ENAMETOOLONG;
>                         break;
>                 }
>
> -               print_clock("render", render_clock);
> -               printf("  ");
> -               print_clock("sampler", sampler_clock);
> -       } else if (IS_945(devid) && IS_MOBILE(devid)) {
> -               int render_clock = -1, display_clock = -1;
> +               strcpy(buf, dent->d_name);
>
> -               pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +               /* xxxN-busy */
> +               if (strlen(buf) < (endlen + 4))
> +                       continue;
> +               if (strcmp(&buf[strlen(buf) - endlen], endswith))
> +                       continue;
>
> -               switch (gcfgc & 0x7) {
> -               case 0:
> -                       render_clock = 166;
> -                       break;
> -               case 1:
> -                       render_clock = 200;
> -                       break;
> -               case 3:
> -                       render_clock = 250;
> -                       break;
> -               case 5:
> -                       render_clock = 400;
> +               memset(engine, 0, sizeof(*engine));
> +
> +               buf[strlen(buf) - endlen] = 0;
> +               engine->name = strdup(buf);
> +               if (!engine->name) {
> +                       ret = errno;
>                         break;
>                 }
>
> -               switch (gcfgc & 0x70) {
> -               case 0:
> -                       display_clock = 200;
> -                       break;
> -               case 4:
> -                       display_clock = 320;
> +               engine->busy.config = get_pmu_config(dirfd(d), engine->name,
> +                                                    "busy");
> +               if (engine->busy.config == -1) {
> +                       ret = ENOENT;
>                         break;
>                 }
> -               if (gcfgc & (1 << 7))
> -                   display_clock = 133;
>
> -               print_clock("render", render_clock);
> -               printf("  ");
> -               print_clock("display", display_clock);
> -       } else if (IS_915(devid) && IS_MOBILE(devid)) {
> -               int render_clock = -1, display_clock = -1;
> +               engine->class = (engine->busy.config &
> +                                (__I915_PMU_OTHER(0) - 1)) >>
> +                               I915_PMU_CLASS_SHIFT;
>
> -               pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +               engine->instance = (engine->busy.config >>
> +                                   I915_PMU_SAMPLE_BITS) &
> +                                   ((1 << I915_PMU_SAMPLE_INSTANCE_BITS) - 1);
>
> -               switch (gcfgc & 0x7) {
> -               case 0:
> -                       render_clock = 160;
> -                       break;
> -               case 1:
> -                       render_clock = 190;
> -                       break;
> -               case 4:
> -                       render_clock = 333;
> +               ret = snprintf(buf, sizeof(buf), "%s/%u",
> +                              class_display_name(engine->class),
> +                              engine->instance);
> +               if (ret < 0 || ret == sizeof(buf)) {
> +                       ret = ENOBUFS;
>                         break;
>                 }
> -               if (gcfgc & (1 << 13))
> -                   render_clock = 133;
> +               ret = 0;
>
> -               switch (gcfgc & 0x70) {
> -               case 0:
> -                       display_clock = 190;
> +               engine->display_name = strdup(buf);
> +               if (!engine->display_name) {
> +                       ret = errno;
>                         break;
> -               case 4:
> -                       display_clock = 333;
> +               }
> +
> +               engines->num_engines++;
> +               engines = realloc(engines, sizeof(struct engines) +
> +                                 engines->num_engines * sizeof(struct engine));
> +               if (!engines) {
> +                       ret = errno;
>                         break;
>                 }
> -               if (gcfgc & (1 << 7))
> -                   display_clock = 133;
> +       }
> +
> +       if (ret) {
> +               free(engines);
> +               errno = ret;
>
> -               print_clock("render", render_clock);
> -               printf("  ");
> -               print_clock("display", display_clock);
> +               return NULL;
>         }
>
> +       qsort(engine_ptr(engines, 0), engines->num_engines,
> +             sizeof(struct engine), engine_cmp);
> +
> +       engines->root = d;
>
> -       printf("\n");
> -       return -1;
> +       return engines;
>  }
>
> -#define STATS_LEN (20)
> -#define PERCENTAGE_BAR_END     (79 - STATS_LEN)
> +static int
> +filename_to_buf(const char *filename, char *buf, unsigned int bufsize)
> +{
> +       int fd, err;
> +       ssize_t ret;
>
> -static void
> -print_percentage_bar(float percent, int cur_line_len)
> +       fd = open(filename, O_RDONLY);
> +       if (fd < 0)
> +               return -1;
> +
> +       ret = read(fd, buf, bufsize - 1);
> +       err = errno;
> +       close(fd);
> +       if (ret < 1) {
> +               errno = ret < 0 ? err : ENOMSG;
> +
> +               return -1;
> +       }
> +
> +       if (ret > 1 && buf[ret - 1] == '\n')
> +               buf[ret - 1] = '\0';
> +       else
> +               buf[ret] = '\0';
> +
> +       return 0;
> +}
> +
> +static uint64_t filename_to_u64(const char *filename, int base)
>  {
> -       int bar_avail_len = (PERCENTAGE_BAR_END - cur_line_len - 1) * 8;
> -       int bar_len = bar_avail_len * (percent + .5) / 100.0;
> -       int i;
> +       char buf[64], *b;
>
> -       for (i = bar_len; i >= 8; i -= 8) {
> -               printf("%s", bars[8]);
> -               cur_line_len++;
> +       if (filename_to_buf(filename, buf, sizeof(buf)))
> +               return 0;
> +
> +       /*
> +        * Handle both single integer and key=value formats by skipping
> +        * leading non-digits.
> +        */
> +       b = buf;
> +       while (*b && !isdigit(*b))
> +               b++;
> +
> +       return strtoull(b, NULL, base);
> +}
> +
> +static double filename_to_double(const char *filename)
> +{
> +       char *oldlocale;
> +       char buf[80];
> +       double v;
> +
> +       if (filename_to_buf(filename, buf, sizeof(buf)))
> +               return 0;
> +
> +       oldlocale = setlocale(LC_ALL, "C");
> +       v = strtod(buf, NULL);
> +       setlocale(LC_ALL, oldlocale);
> +
> +       return v;
> +}
> +
> +#define RAPL_ROOT "/sys/devices/power/"
> +#define RAPL_EVENT "/sys/devices/power/events/"
> +
> +static uint64_t rapl_type_id(void)
> +{
> +       return filename_to_u64(RAPL_ROOT "type", 10);
> +}
> +
> +static uint64_t rapl_gpu_power(void)
> +{
> +       return filename_to_u64(RAPL_EVENT "energy-gpu", 0);
> +}
> +
> +static double rapl_gpu_power_scale(void)
> +{
> +       return filename_to_double(RAPL_EVENT "energy-gpu.scale");
> +}
> +
> +static const char *rapl_gpu_power_unit(void)
> +{
> +       char buf[32];
> +
> +       if (filename_to_buf(RAPL_EVENT "energy-gpu.unit",
> +                           buf, sizeof(buf)) == 0)
> +               if (!strcmp(buf, "Joules"))
> +                       return strdup("Watts");
> +               else
> +                       return strdup(buf);
> +       else
> +               return NULL;
> +}
> +
> +#define IMC_ROOT "/sys/devices/uncore_imc/"
> +#define IMC_EVENT "/sys/devices/uncore_imc/events/"
> +
> +static uint64_t imc_type_id(void)
> +{
> +       return filename_to_u64(IMC_ROOT "type", 10);
> +}
> +
> +static uint64_t imc_data_reads(void)
> +{
> +       return filename_to_u64(IMC_EVENT "data_reads", 0);
> +}
> +
> +static double imc_data_reads_scale(void)
> +{
> +       return filename_to_double(IMC_EVENT "data_reads.scale");
> +}
> +
> +static const char *imc_data_reads_unit(void)
> +{
> +       char buf[32];
> +
> +       if (filename_to_buf(IMC_EVENT "data_reads.unit", buf, sizeof(buf)) == 0)
> +               return strdup(buf);
> +       else
> +               return NULL;
> +}
> +
> +static uint64_t imc_data_writes(void)
> +{
> +       return filename_to_u64(IMC_EVENT "data_writes", 0);
> +}
> +
> +static double imc_data_writes_scale(void)
> +{
> +       return filename_to_double(IMC_EVENT "data_writes.scale");
> +}
> +
> +static const char *imc_data_writes_unit(void)
> +{
> +       char buf[32];
> +
> +       if (filename_to_buf(IMC_EVENT "data_writes.unit",
> +                           buf, sizeof(buf)) == 0)
> +               return strdup(buf);
> +       else
> +               return NULL;
> +}
> +
> +#define _open_pmu(cnt, pmu, fd) \
> +({ \
> +       int fd__; \
> +\
> +       fd__ = perf_i915_open_group((pmu)->config, (fd)); \
> +       if (fd__ >= 0) { \
> +               if ((fd) == -1) \
> +                       (fd) = fd__; \
> +               (pmu)->present = true; \
> +               (pmu)->idx = (cnt)++; \
> +       } \
> +\
> +       fd__; \
> +})
> +
> +#define _open_imc(cnt, pmu, fd) \
> +({ \
> +       int fd__; \
> +\
> +       fd__ = igt_perf_open_group(imc_type_id(), (pmu)->config, (fd)); \
> +       if (fd__ >= 0) { \
> +               if ((fd) == -1) \
> +                       (fd) = fd__; \
> +               (pmu)->present = true; \
> +               (pmu)->idx = (cnt)++; \
> +       } \
> +\
> +       fd__; \
> +})
> +
> +static int pmu_init(struct engines *engines)
> +{
> +       unsigned int i;
> +       int fd;
> +
> +       engines->fd = -1;
> +       engines->num_counters = 0;
> +
> +       engines->irq.config = I915_PMU_INTERRUPTS;
> +       fd = _open_pmu(engines->num_counters, &engines->irq, engines->fd);
> +       if (fd < 0)
> +               return -1;
> +
> +       engines->freq_req.config = I915_PMU_REQUESTED_FREQUENCY;
> +       _open_pmu(engines->num_counters, &engines->freq_req, engines->fd);
> +
> +       engines->freq_act.config = I915_PMU_ACTUAL_FREQUENCY;
> +       _open_pmu(engines->num_counters, &engines->freq_act, engines->fd);
> +
> +       engines->rc6.config = I915_PMU_RC6_RESIDENCY;
> +       _open_pmu(engines->num_counters, &engines->rc6, engines->fd);
> +
> +       for (i = 0; i < engines->num_engines; i++) {
> +               struct engine *engine = engine_ptr(engines, i);
> +               struct {
> +                       struct pmu_counter *pmu;
> +                       const char *counter;
> +               } *cnt, counters[] = {
> +                       { .pmu = &engine->busy, .counter = "busy" },
> +                       { .pmu = &engine->wait, .counter = "wait" },
> +                       { .pmu = &engine->sema, .counter = "sema" },
> +                       { .pmu = NULL, .counter = NULL },
> +               };
> +
> +               for (cnt = counters; cnt->pmu; cnt++) {
> +                       if (!cnt->pmu->config)
> +                               cnt->pmu->config =
> +                                       get_pmu_config(dirfd(engines->root),
> +                                                      engine->name,
> +                                                      cnt->counter);
> +                       fd = _open_pmu(engines->num_counters, cnt->pmu,
> +                                      engines->fd);
> +                       if (fd >= 0)
> +                               engine->num_counters++;
> +               }
>         }
> -       if (i) {
> -               printf("%s", bars[i]);
> -               cur_line_len++;
> +
> +       engines->rapl_fd = -1;
> +       if (rapl_type_id()) {
> +               engines->rapl_scale = rapl_gpu_power_scale();
> +               engines->rapl_unit = rapl_gpu_power_unit();
> +               if (!engines->rapl_unit)
> +                       return -1;
> +
> +               engines->rapl.config = rapl_gpu_power();
> +               if (!engines->rapl.config)
> +                       return -1;
> +
> +               engines->rapl_fd = igt_perf_open(rapl_type_id(),
> +                                                engines->rapl.config);
> +               if (engines->rapl_fd < 0)
> +                       return -1;
> +
> +               engines->rapl.present = true;
>         }
>
> -       /* NB: We can't use a field width with utf8 so we manually
> -       * guarantee a field with of 45 chars for any bar. */
> -       printf("%*s", PERCENTAGE_BAR_END - cur_line_len, "");
> -}
> +       engines->imc_fd = -1;
> +       if (imc_type_id()) {
> +               unsigned int num = 0;
>
> -struct ring {
> -       const char *name;
> -       uint32_t mmio;
> -       int head, tail, size;
> -       uint64_t full;
> -       int idle;
> -};
> +               engines->imc_reads_scale = imc_data_reads_scale();
> +               engines->imc_writes_scale = imc_data_writes_scale();
> +
> +               engines->imc_reads_unit = imc_data_reads_unit();
> +               if (!engines->imc_reads_unit)
> +                       return -1;
> +
> +               engines->imc_writes_unit = imc_data_writes_unit();
> +               if (!engines->imc_writes_unit)
> +                       return -1;
> +
> +               engines->imc_reads.config = imc_data_reads();
> +               if (!engines->imc_reads.config)
> +                       return -1;
> +
> +               engines->imc_writes.config = imc_data_writes();
> +               if (!engines->imc_writes.config)
> +                       return -1;
> +
> +               fd = _open_imc(num, &engines->imc_reads, engines->imc_fd);
> +               if (fd < 0)
> +                       return -1;
> +               fd = _open_imc(num, &engines->imc_writes, engines->imc_fd);
> +               if (fd < 0)
> +                       return -1;
> +
> +               engines->imc_reads.present = true;
> +               engines->imc_writes.present = true;

We can drop these two assignments; _open_imc() already sets ->present on success.

> +       }
> +
> +       return 0;
> +}
>
> -static uint32_t ring_read(struct ring *ring, uint32_t reg)
> +static uint64_t pmu_read_multi(int fd, unsigned int num, uint64_t *val)
>  {
> -       return INREG(ring->mmio + reg);
> +       uint64_t buf[2 + num];
> +       unsigned int i;
> +       ssize_t len;
> +
> +       memset(buf, 0, sizeof(buf));
> +
> +       len = read(fd, buf, sizeof(buf));
> +       assert(len == sizeof(buf));
> +
> +       for (i = 0; i < num; i++)
> +               val[i] = buf[2 + i];
> +
> +       return buf[1];
>  }
>
> -static void ring_init(struct ring *ring)
> +static double __pmu_calc(struct pmu_pair *p, double d, double t, double s)
>  {
> -       ring->size = (((ring_read(ring, RING_LEN) & RING_NR_PAGES) >> 12) + 1) * 4096;
> +       double v;
> +
> +       v = p->cur - p->prev;
> +       v /= d;
> +       v /= t;
> +       v *= s;
> +
> +       if (s == 100.0 && v > 100.0)
> +               v = 100.0;
> +
> +       return v;
>  }
>
> -static void ring_reset(struct ring *ring)
> +static void fill_str(char *buf, unsigned int bufsz, char c, unsigned int num)
>  {
> -       ring->idle = ring->full = 0;
> +       unsigned int i;
> +
> +       for (i = 0; i < num && i < (bufsz - 1); i++)
> +               *buf++ = c;
> +
> +       *buf = 0;
>  }
>
> -static void ring_sample(struct ring *ring)
> +static void pmu_calc(struct pmu_counter *cnt,
> +                    char *buf, unsigned int bufsz,
> +                    unsigned int width, unsigned width_dec,
> +                    double d, double t, double s)
>  {
> -       int full;
> +       double val;
> +       int len;
> +
> +       assert(bufsz >= (width + width_dec + 1));
> +
> +       if (!cnt->present) {
> +               fill_str(buf, bufsz, '-', width + width_dec);
> +               return;
> +       }
>
> -       if (!ring->size)
> +       val = __pmu_calc(&cnt->val, d, t, s);
> +
> +       len = snprintf(buf, bufsz, "%*.*f", width + width_dec, width_dec, val);
> +       if (len < 0 || len == bufsz) {
> +               fill_str(buf, bufsz, 'X', width + width_dec);
>                 return;
> +       }
> +}
> +
> +static uint64_t __pmu_read_single(int fd, uint64_t *ts)
> +{
> +       uint64_t data[2] = { };
> +       ssize_t len;
>
> -       ring->head = ring_read(ring, RING_HEAD) & HEAD_ADDR;
> -       ring->tail = ring_read(ring, RING_TAIL) & TAIL_ADDR;
> +       len = read(fd, data, sizeof(data));
> +       assert(len == sizeof(data));
>
> -       if (ring->tail == ring->head)
> -               ring->idle++;
> +       if (ts)
> +               *ts = data[1];
>
> -       full = ring->tail - ring->head;
> -       if (full < 0)
> -               full += ring->size;
> -       ring->full += full;
> +       return data[0];
>  }
>
> -static void ring_print_header(FILE *out, struct ring *ring)
> +static uint64_t pmu_read_single(int fd)
>  {
> -    fprintf(out, "%.6s%%\tops\t",
> -            ring->name
> -          );
> +       return __pmu_read_single(fd, NULL);
>  }
>
> -static void ring_print(struct ring *ring, unsigned long samples_per_sec)
> +static void __update_sample(struct pmu_counter *counter, uint64_t val)
>  {
> -       int percent_busy, len;
> +       counter->val.prev = counter->val.cur;
> +       counter->val.cur = val;
> +}
>
> -       if (!ring->size)
> -               return;
> +static void update_sample(struct pmu_counter *counter, uint64_t *val)
> +{
> +       if (counter->present)
> +               __update_sample(counter, val[counter->idx]);
> +}
> +
> +static void pmu_sample(struct engines *engines)
> +{
> +       const int num_val = engines->num_counters;
> +       uint64_t val[2 + num_val];
> +       unsigned int i;
> +
> +       engines->ts.prev = engines->ts.cur;
> +
> +       if (engines->rapl_fd >= 0)
> +               __update_sample(&engines->rapl,
> +                               pmu_read_single(engines->rapl_fd));
> +
> +       if (engines->imc_fd >= 0) {
> +               pmu_read_multi(engines->imc_fd, 2, val);
> +               update_sample(&engines->imc_reads, val);
> +               update_sample(&engines->imc_writes, val);
> +       }
>
> -       percent_busy = 100 - 100 * ring->idle / samples_per_sec;
> +       engines->ts.cur = pmu_read_multi(engines->fd, num_val, val);
>
> -       len = printf("%25s busy: %3d%%: ", ring->name, percent_busy);
> -       print_percentage_bar (percent_busy, len);
> -       printf("%24s space: %d/%d\n",
> -                  ring->name,
> -                  (int)(ring->full / samples_per_sec),
> -                  ring->size);
> +       update_sample(&engines->freq_req, val);
> +       update_sample(&engines->freq_act, val);
> +       update_sample(&engines->irq, val);
> +       update_sample(&engines->rc6, val);
> +
> +       for (i = 0; i < engines->num_engines; i++) {
> +               struct engine *engine = engine_ptr(engines, i);
> +
> +               update_sample(&engine->busy, val);
> +               update_sample(&engine->sema, val);
> +               update_sample(&engine->wait, val);
> +       }
>  }
>
> -static void ring_log(struct ring *ring, unsigned long samples_per_sec,
> -               FILE *output)
> +static const char *bars[] = { " ", "▏", "▎", "▍", "▌", "▋", "▊", "▉", "█" };
> +
> +static void
> +print_percentage_bar(double percent, int max_len)
>  {
> -       if (ring->size)
> -               fprintf(output, "%3d\t%d\t",
> -                       (int)(100 - 100 * ring->idle / samples_per_sec),
> -                       (int)(ring->full / samples_per_sec));
> -       else
> -               fprintf(output, "-1\t-1\t");
> +       int bar_len = percent * (8 * (max_len - 2)) / 100.0;
> +       int i;
> +
> +       putchar('|');
> +
> +       for (i = bar_len; i >= 8; i -= 8)
> +               printf("%s", bars[8]);
> +       if (i)
> +               printf("%s", bars[i]);
> +
> +       for (i = 0; i < (max_len - 2 - (bar_len + 7) / 8); i++)
> +               putchar(' ');
> +
> +       putchar('|');
>  }
>
> +#define DEFAULT_PERIOD_MS (1000)
> +
>  static void
>  usage(const char *appname)
>  {
>         printf("intel_gpu_top - Display a top-like summary of Intel GPU usage\n"
> -                       "\n"
> -                       "usage: %s [parameters]\n"
> -                       "\n"
> -                       "The following parameters apply:\n"
> -                       "[-s <samples>]       samples per seconds (default %d)\n"
> -                       "[-e <command>]       command to profile\n"
> -                       "[-o <file>]          output statistics to file. If file is '-',"
> -                       "                     run in batch mode and output statistics to stdio only \n"
> -                       "[-h]                 show this help screen\n"
> -                       "\n",
> -                       appname,
> -                       SAMPLES_PER_SEC
> -                 );
> -       return;
> +               "\n"
> +               "Usage: %s [parameters]\n"
> +               "\n"
> +               "\tThe following parameters are optional:\n\n"
> +               "\t[-s <ms>]       Refresh period in milliseconds (default %ums).\n"
> +               "\t[-h]            Show this help text.\n"
> +               "\n",
> +               appname, DEFAULT_PERIOD_MS);
>  }
>
>  int main(int argc, char **argv)
>  {
> -       uint32_t devid;
> -       struct pci_device *pci_dev;
> -       struct ring render_ring = {
> -               .name = "render",
> -               .mmio = 0x2030,
> -       }, bsd_ring = {
> -               .name = "bitstream",
> -               .mmio = 0x4030,
> -       }, bsd6_ring = {
> -               .name = "bitstream",
> -               .mmio = 0x12030,
> -       }, blt_ring = {
> -               .name = "blitter",
> -               .mmio = 0x22030,
> -       };
> -       int i, ch;
> -       int samples_per_sec = SAMPLES_PER_SEC;
> -       FILE *output = NULL;
> -       double elapsed_time=0;
> -       int print_headers=1;
> -       pid_t child_pid=-1;
> -       int child_stat;
> -       char *cmd=NULL;
> -       int interactive=1;
> -
> -       /* Parse options? */
> -       while ((ch = getopt(argc, argv, "s:o:e:h")) != -1) {
> +       unsigned int period_us = DEFAULT_PERIOD_MS * 1000;
> +       int con_w = -1, con_h = -1;
> +       struct engines *engines;
> +       unsigned int i;
> +       int ret, ch;
> +
> +       /* Parse options */
> +       while ((ch = getopt(argc, argv, "s:h")) != -1) {
>                 switch (ch) {
> -               case 'e': cmd = strdup(optarg);
> -                       break;
> -               case 's': samples_per_sec = atoi(optarg);
> -                       if (samples_per_sec < 100) {
> -                               fprintf(stderr, "Error: samples per second must be >= 100\n");
> -                               exit(1);
> -                       }
> -                       break;
> -               case 'o':
> -                       if (!strcmp(optarg, "-")) {
> -                               /* Running in non-interactive mode */
> -                               interactive = 0;
> -                               output = stdout;
> -                       }
> -                       else
> -                               output = fopen(optarg, "w");
> -                       if (!output)
> -                       {
> -                               perror("fopen");
> -                               exit(1);
> -                       }
> +               case 's':
> +                       period_us = atoi(optarg) * 1000;
>                         break;
>                 case 'h':
>                         usage(argv[0]);
>                         exit(0);
> -                       break;
>                 default:
> -                       fprintf(stderr, "Invalid flag %c!\n", (char)optopt);
> +                       fprintf(stderr, "Invalid option %c!\n", (char)optopt);
>                         usage(argv[0]);
>                         exit(1);
> -                       break;
>                 }
>         }
>
> -       pci_dev = intel_get_pci_device();
> -       devid = pci_dev->device_id;
> -       intel_mmio_use_pci_bar(pci_dev);
> -       init_instdone_definitions(devid);
> -
> -       /* Do we have a command to run? */
> -       if (cmd != NULL) {
> -               if (output) {
> -                       fprintf(output, "# Profiling: %s\n", cmd);
> -                       fflush(output);
> -               }
> -               child_pid = fork();
> -               if (child_pid < 0) {
> -                       perror("fork");
> -                       exit(1);
> -               }
> -               else if (child_pid == 0) {
> -                       int res;
> -                       res = system(cmd);
> -                       if (res < 0)
> -                               perror("running command");
> -                       if (output) {
> -                               fflush(output);
> -                               fprintf(output, "# %s exited with status %d\n", cmd, res);
> -                               fflush(output);
> -                       }
> -                       free(cmd);
> -                       exit(0);
> -               } else {
> -                       free(cmd);
> -               }
> +       engines = discover_engines();
> +       if (!engines) {
> +               fprintf(stderr,
> +                       "Failed to detect engines! (%s)\n(Kernel 4.16 or newer is required for i915 PMU support.)\n",
> +                       strerror(errno));
> +               return 1;
>         }
>
> -       for (i = 0; i < num_instdone_bits; i++) {
> -               top_bits[i].bit = &instdone_bits[i];
> -               top_bits[i].count = 0;
> -               top_bits_sorted[i] = &top_bits[i];
> +       ret = pmu_init(engines);
> +       if (ret) {
> +               fprintf(stderr,
> +                       "Failed to initialize PMU! (%s)\n", strerror(errno));
> +               return 1;
>         }
>
> -       /* Grab access to the registers */
> -       intel_register_access_init(pci_dev, 0, -1);
> +       pmu_sample(engines);
>
> -       ring_init(&render_ring);
> -       if (IS_GEN4(devid) || IS_GEN5(devid))
> -               ring_init(&bsd_ring);
> -       if (IS_GEN6(devid) || IS_GEN7(devid)) {
> -               ring_init(&bsd6_ring);
> -               ring_init(&blt_ring);
> -       }
> +       for (;;) {
> +               double t;
> +#define BUFSZ 16
> +               char freq[BUFSZ];
> +               char fact[BUFSZ];
> +               char irq[BUFSZ];
> +               char rc6[BUFSZ];
> +               char power[BUFSZ];
> +               char reads[BUFSZ];
> +               char writes[BUFSZ];
> +               struct winsize ws;
> +               int lines = 0;
>
> -       /* Initialize GPU stats */
> -       if (HAS_STATS_REGS(devid)) {
> -               for (i = 0; i < STATS_COUNT; i++) {
> -                       uint32_t stats_high, stats_low, stats_high_2;
> +               /* Update terminal size. */
> +               if (ioctl(0, TIOCGWINSZ, &ws) != -1) {
> +                       con_w = ws.ws_col;
> +                       con_h = ws.ws_row;
> +               }
>
> -                       do {
> -                               stats_high = INREG(stats_regs[i] + 4);
> -                               stats_low = INREG(stats_regs[i]);
> -                               stats_high_2 = INREG(stats_regs[i] + 4);
> -                       } while (stats_high != stats_high_2);
> +               pmu_sample(engines);
> +               t = (double)(engines->ts.cur - engines->ts.prev) / 1e9;
>
> -                       last_stats[i] = (uint64_t)stats_high << 32 |
> -                               stats_low;
> -               }
> -       }
> +               printf("\033[H\033[J");
>
> -       for (;;) {
> -               int j;
> -               unsigned long long t1, ti, tf, t2;
> -               unsigned long long def_sleep = 1000000 / samples_per_sec;
> -               unsigned long long last_samples_per_sec = samples_per_sec;
> -               unsigned short int max_lines;
> -               struct winsize ws;
> -               char clear_screen[] = {0x1b, '[', 'H',
> -                                      0x1b, '[', 'J',
> -                                      0x0};
> -               int percent;
> -               int len;
> -
> -               t1 = gettime();
> -
> -               ring_reset(&render_ring);
> -               ring_reset(&bsd_ring);
> -               ring_reset(&bsd6_ring);
> -               ring_reset(&blt_ring);
> -
> -               for (i = 0; i < samples_per_sec; i++) {
> -                       long long interval;
> -                       ti = gettime();
> -                       if (IS_965(devid)) {
> -                               instdone = INREG(INSTDONE_I965);
> -                               instdone1 = INREG(INSTDONE_1);
> -                       } else
> -                               instdone = INREG(INSTDONE);
> -
> -                       for (j = 0; j < num_instdone_bits; j++)
> -                               update_idle_bit(&top_bits[j]);
> -
> -                       ring_sample(&render_ring);
> -                       ring_sample(&bsd_ring);
> -                       ring_sample(&bsd6_ring);
> -                       ring_sample(&blt_ring);
> -
> -                       tf = gettime();
> -                       if (tf - t1 >= 1000000) {
> -                               /* We are out of sync, bail out */
> -                               last_samples_per_sec = i+1;
> -                               break;
> -                       }
> -                       interval = def_sleep - (tf - ti);
> -                       if (interval > 0)
> -                               usleep(interval);
> -               }
> +               pmu_calc(&engines->freq_req, freq, BUFSZ, 4, 0, 1.0, t, 1);
> +               pmu_calc(&engines->freq_act, fact, BUFSZ, 4, 0, 1.0, t, 1);
> +               pmu_calc(&engines->irq, irq, BUFSZ, 8, 0, 1.0, t, 1);
> +               pmu_calc(&engines->rc6, rc6, BUFSZ, 3, 0, 1e9, t, 100);
> +               pmu_calc(&engines->rapl, power, BUFSZ, 4, 2, 1.0, t,
> +                        engines->rapl_scale);
> +               pmu_calc(&engines->imc_reads, reads, BUFSZ, 6, 0, 1.0, t,
> +                        engines->imc_reads_scale);
> +               pmu_calc(&engines->imc_writes, writes, BUFSZ, 6, 0, 1.0, t,
> +                        engines->imc_writes_scale);
>
> -               if (HAS_STATS_REGS(devid)) {
> -                       for (i = 0; i < STATS_COUNT; i++) {
> -                               uint32_t stats_high, stats_low, stats_high_2;
> +               if (lines++ < con_h)
> +                       printf("intel-gpu-top - %s/%s MHz;  %s%% RC6; %s %s; %s irqs/s\n",
> +                              fact, freq, rc6, power, engines->rapl_unit, irq);
>
> -                               do {
> -                                       stats_high = INREG(stats_regs[i] + 4);
> -                                       stats_low = INREG(stats_regs[i]);
> -                                       stats_high_2 = INREG(stats_regs[i] + 4);
> -                               } while (stats_high != stats_high_2);
> +               if (lines++ < con_h)
> +                       printf("\n");
>
> -                               stats[i] = (uint64_t)stats_high << 32 |
> -                                       stats_low;
> -                       }
> -               }
> +               if (engines->imc_fd) {
> +                       if (lines++ < con_h)
> +                               printf("      IMC reads:   %s %s/s\n",
> +                                      reads, engines->imc_reads_unit);
> +
> +                       if (lines++ < con_h)
> +                               printf("     IMC writes:   %s %s/s\n",
> +                                      writes, engines->imc_writes_unit);
>
> -               qsort(top_bits_sorted, num_instdone_bits,
> -                     sizeof(struct top_bit *), top_bits_sort);
> -
> -               /* Limit the number of lines printed to the terminal height so the
> -                * most important info (at the top) will stay on screen. */
> -               max_lines = -1;
> -               if (ioctl(0, TIOCGWINSZ, &ws) != -1)
> -                       max_lines = ws.ws_row - 6; /* exclude header lines */
> -               if (max_lines >= num_instdone_bits)
> -                       max_lines = num_instdone_bits;
> -
> -               t2 = gettime();
> -               elapsed_time += (t2 - t1) / 1000000.0;
> -
> -               if (interactive) {
> -                       printf("%s", clear_screen);
> -                       print_clock_info(pci_dev);
> -
> -                       ring_print(&render_ring, last_samples_per_sec);
> -                       ring_print(&bsd_ring, last_samples_per_sec);
> -                       ring_print(&bsd6_ring, last_samples_per_sec);
> -                       ring_print(&blt_ring, last_samples_per_sec);
> -
> -                       printf("\n%30s  %s\n", "task", "percent busy");
> -                       for (i = 0; i < max_lines; i++) {
> -                               if (top_bits_sorted[i]->count > 0) {
> -                                       percent = (top_bits_sorted[i]->count * 100) /
> -                                               last_samples_per_sec;
> -                                       len = printf("%30s: %3d%%: ",
> -                                                        top_bits_sorted[i]->bit->name,
> -                                                        percent);
> -                                       print_percentage_bar (percent, len);
> -                               } else {
> -                                       printf("%*s", PERCENTAGE_BAR_END, "");
> -                               }
> -
> -                               if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
> -                                       printf("%13s: %llu (%lld/sec)",
> -                                                  stats_reg_names[i],
> -                                                  (long long)stats[i],
> -                                                  (long long)(stats[i] - last_stats[i]));
> -                                       last_stats[i] = stats[i];
> -                               } else {
> -                                       if (!top_bits_sorted[i]->count)
> -                                               break;
> -                               }
> +                       if (++lines < con_h)
>                                 printf("\n");
> -                       }
>                 }
> -               if (output) {
> -                       /* Print headers for columns at first run */
> -                       if (print_headers) {
> -                               fprintf(output, "# time\t");
> -                               ring_print_header(output, &render_ring);
> -                               ring_print_header(output, &bsd_ring);
> -                               ring_print_header(output, &bsd6_ring);
> -                               ring_print_header(output, &blt_ring);
> -                               for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
> -                                       if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
> -                                               fprintf(output, "%.6s\t",
> -                                                          stats_reg_names[i]
> -                                                          );
> -                                       }
> -                                       if (!top_bits[i].count)
> -                                               continue;
> -                               }
> -                               fprintf(output, "\n");
> -                               print_headers = 0;
> -                       }
>
> -                       /* Print statistics */
> -                       fprintf(output, "%.2f\t", elapsed_time);
> -                       ring_log(&render_ring, last_samples_per_sec, output);
> -                       ring_log(&bsd_ring, last_samples_per_sec, output);
> -                       ring_log(&bsd6_ring, last_samples_per_sec, output);
> -                       ring_log(&blt_ring, last_samples_per_sec, output);
> -
> -                       for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
> -                               if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
> -                                       fprintf(output, "%"PRIu64"\t",
> -                                                  stats[i] - last_stats[i]);
> -                                       last_stats[i] = stats[i];
> -                               }
> -                                       if (!top_bits[i].count)
> -                                               continue;
> -                       }
> -                       fprintf(output, "\n");
> -                       fflush(output);
> -               }
> +               for (i = 0; i < engines->num_engines; i++) {
> +                       struct engine *engine = engine_ptr(engines, i);
>
> -               for (i = 0; i < num_instdone_bits; i++) {
> -                       top_bits_sorted[i]->count = 0;
> +                       if (engine->num_counters && lines < con_h) {
> +                               const char *a = "          ENGINE      BUSY ";
> +                               const char *b = " MI_SEMA MI_WAIT";
>
> -                       if (i < STATS_COUNT)
> -                               last_stats[i] = stats[i];
> +                               printf("\033[7m%s%*s%s\033[0m\n",
> +                                      a,
> +                                      (int)(con_w - 1 - strlen(a) - strlen(b)),
> +                                      " ", b);
> +                               lines++;
> +                               break;
> +                       }
>                 }
>
> -               /* Check if child has gone */
> -               if (child_pid > 0) {
> -                       int res;
> -                       if ((res = waitpid(child_pid, &child_stat, WNOHANG)) == -1) {
> -                               perror("waitpid");
> -                               exit(1);
> -                       }
> -                       if (res == 0)
> +               for (i = 0; i < engines->num_engines && lines < con_h; i++) {
> +                       struct engine *engine = engine_ptr(engines, i);
> +                       unsigned int max_w = con_w - 1;
> +                       unsigned int len;
> +                       char sema[BUFSZ];
> +                       char wait[BUFSZ];
> +                       char busy[BUFSZ];
> +                       char buf[128];
> +                       double val;
> +
> +                       if (!engine->num_counters)
>                                 continue;
> -                       if (WIFEXITED(child_stat))
> -                               break;
> +
> +                       pmu_calc(&engine->sema, sema, BUFSZ, 3, 0, 1e9, t, 100);
> +                       pmu_calc(&engine->wait, wait, BUFSZ, 3, 0, 1e9, t, 100);
> +                       len = snprintf(buf, sizeof(buf), "    %s%%    %s%%",
> +                                      sema, wait);
> +
> +                       pmu_calc(&engine->busy, busy, BUFSZ, 6, 2, 1e9, t,
> +                                100);
> +                       len += printf("%16s %s%% ", engine->display_name, busy);
> +
> +                       val = __pmu_calc(&engine->busy.val, 1e9, t, 100);
> +                       print_percentage_bar(val, max_w - len);

if (max_w > len + 2)
     print_percentage_bar(val, max_w - len);

or something? Meh.

fwiw,
Reviewed-by: Matthew Auld <matthew.auld@intel.com>
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [igt-dev] [PATCH i-g-t v6] intel-gpu-top: Rewrite the tool to be safe to use
@ 2018-05-29  9:58       ` Matthew Auld
  0 siblings, 0 replies; 57+ messages in thread
From: Matthew Auld @ 2018-05-29  9:58 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: igt-dev, Intel Graphics Development, Tvrtko Ursulin

On 4 April 2018 at 16:26, Tvrtko Ursulin <tursulin@ursulin.net> wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>
> intel-gpu-top is a dangerous tool which can hang machines due to unsafe mmio
> register access. This patch rewrites it to use only PMU.
>
> Only overall command streamer busyness and GPU global data such as power
> and frequencies are included in this new version.
>
> For access to more GPU functional unit level data, an OA metric based tool
> like gpu-top should be used instead.
>
> v2:
>  * Sort engines by class and instance.
>  * Do not wait for one sampling period to display something on screen.
>  * Move code out of the asserts. (Rinat Ibragimov)
>  * Continuously adapt to terminal size. (Rinat Ibragimov)
>
> v3:
>  * Change layout and precision of some fields. (Chris Wilson)
>  Eero Tamminen:
>  * Use more user friendly engine names.
>  * Don't error out if a counter is missing.
>  * Add IMC read/write bandwidth.
>  * Report minimum required kernel version.
>
> v4:
>  * Really support 4.16 by skipping missing engines.
>  * Simpler and less hacky float printing.
>  * Preserve copyright header. (Antonio Argenziano)
>  * Simplify engines_ptr macro. (Rinat Ibragimov)
>
> v5:
>  * Get RAPL unit from sysfs.
>  * Consolidate sysfs paths with a macro.
>  * Tidy error handling by carrying over and reporting errno.
>  * Check against console height on all prints.
>  * More readable minimum kernel version message. (Eero Tamminen)
>  * Column banner for per engine stats. (Eero Tamminen)
>
> v6:
>  * Man page update. (Eero Tamminen)
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
> Cc: Petri Latvala <petri.latvala@intel.com>
> Cc: Eero Tamminen <eero.t.tamminen@intel.com>
> Cc: Rinat Ibragimov <ibragimovrinat@mail.ru>
> Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> # v1
> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> # v0.5
> ---
>  lib/igt_perf.c        |    6 +
>  lib/igt_perf.h        |    1 +
>  man/intel_gpu_top.rst |   41 +-
>  tools/Makefile.am     |    2 +
>  tools/intel_gpu_top.c | 1250 +++++++++++++++++++++++++++----------------------
>  tools/meson.build     |    6 +-
>  6 files changed, 719 insertions(+), 587 deletions(-)
>
> diff --git a/lib/igt_perf.c b/lib/igt_perf.c
> index 99d82ea51c9b..e3dec2cc29c7 100644
> --- a/lib/igt_perf.c
> +++ b/lib/igt_perf.c
> @@ -69,3 +69,9 @@ int igt_perf_open(uint64_t type, uint64_t config)
>         return _perf_open(type, config, -1,
>                           PERF_FORMAT_TOTAL_TIME_ENABLED);
>  }
> +
> +int igt_perf_open_group(uint64_t type, uint64_t config, int group)
> +{
> +       return _perf_open(type, config, group,
> +                         PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_GROUP);
> +}
> diff --git a/lib/igt_perf.h b/lib/igt_perf.h
> index 614ea5d23fa6..e00718f4769a 100644
> --- a/lib/igt_perf.h
> +++ b/lib/igt_perf.h
> @@ -55,5 +55,6 @@ uint64_t i915_type_id(void);
>  int perf_i915_open(uint64_t config);
>  int perf_i915_open_group(uint64_t config, int group);
>  int igt_perf_open(uint64_t type, uint64_t config);
> +int igt_perf_open_group(uint64_t type, uint64_t config, int group);
>
>  #endif /* I915_PERF_H */
> diff --git a/man/intel_gpu_top.rst b/man/intel_gpu_top.rst
> index a5f7175bb1a0..19c712307d28 100644
> --- a/man/intel_gpu_top.rst
> +++ b/man/intel_gpu_top.rst
> @@ -7,9 +7,9 @@ Display a top-like summary of Intel GPU usage
>  ---------------------------------------------
>  .. include:: defs.rst
>  :Author: IGT Developers <igt-dev@lists.freedesktop.org>
> -:Date: 2016-03-01
> +:Date: 2018-04-04
>  :Version: |PACKAGE_STRING|
> -:Copyright: 2009,2011,2012,2016 Intel Corporation
> +:Copyright: 2009,2011,2012,2016,2018 Intel Corporation
>  :Manual section: |MANUAL_SECTION|
>  :Manual group: |MANUAL_GROUP|
>
> @@ -21,42 +21,25 @@ SYNOPSIS
>  DESCRIPTION
>  ===========
>
> -**intel_gpu_top** is a tool to display usage information of an Intel GPU. It
> -requires root privilege to map the graphics device.
> +**intel_gpu_top** is a tool to display usage information on Intel GPU's.
> +
> +The tool gathers data using perf performance counters (PMU) exposed by i915 and other platform drivers like RAPL (power) and Uncore IMC (memory bandwidth).
>
>  OPTIONS
>  =======
>
> --s SAMPLES
> -    Number of samples to acquire per second.
> -
> --o FILE
> -    Collect usage statistics to FILE. If file is "-", run non-interactively
> -    and output statistics to stdout.
> -
> --e COMMAND
> -    Execute COMMAND to profile, and leave when it is finished. Note that the
> -    entire command with all parameters should be included as one parameter.
> +-s <ms>
> +    Refresh period in milliseconds.
>
>  -h
> -    Show usage notes.
> +    Show help text.
>
> -EXAMPLES
> -========
> -
> -intel_gpu_top -o "cairo-trace-gvim.log" -s 100 -e "cairo-perf-trace /tmp/gvim"
> -    Run cairo-perf-trace with /tmp/gvim trace, non-interactively, saving the
> -    statistics into cairo-trace-gvim.log file, and collecting 100 samples per
> -    second.
> -
> -Note that idle units are not displayed, so an entirely idle GPU will only
> -display the ring status and header.
> +LIMITATIONS
> +===========
>
> -BUGS
> -====
> +* Not all metrics are supported on all platforms. Where a metric is unsupported it's value will be replaced by a dashed line.

s/it's/its/

>
> -Some GPUs report some units as busy when they aren't, such that even when idle
> -and not hung, it will show up as 100% busy.
> +* Non-root access to perf counters is controlled by the *perf_event_paranoid* sysctl.
>
>  REPORTING BUGS
>  ==============
> diff --git a/tools/Makefile.am b/tools/Makefile.am
> index 09b6dbcc3ece..a0b016ddd7ff 100644
> --- a/tools/Makefile.am
> +++ b/tools/Makefile.am
> @@ -28,6 +28,8 @@ intel_aubdump_la_LDFLAGS = -module -avoid-version -no-undefined
>  intel_aubdump_la_SOURCES = aubdump.c
>  intel_aubdump_la_LIBADD = $(top_builddir)/lib/libintel_tools.la -ldl
>
> +intel_gpu_top_LDADD = $(top_builddir)/lib/libigt_perf.la
> +
>  bin_SCRIPTS = intel_aubdump
>  CLEANFILES = $(bin_SCRIPTS)
>
> diff --git a/tools/intel_gpu_top.c b/tools/intel_gpu_top.c
> index 098e6ce3ff86..b923c3cfbe97 100644
> --- a/tools/intel_gpu_top.c
> +++ b/tools/intel_gpu_top.c
> @@ -1,6 +1,5 @@
>  /*
> - * Copyright © 2007 Intel Corporation
> - * Copyright © 2011 Intel Corporation
> + * Copyright © 2007-2018 Intel Corporation
>   *
>   * Permission is hereby granted, free of charge, to any person obtaining a
>   * copy of this software and associated documentation files (the "Software"),
> @@ -24,695 +23,832 @@
>   * Authors:
>   *    Eric Anholt <eric@anholt.net>
>   *    Eugeni Dodonov <eugeni.dodonov@intel.com>
> - *
>   */
>
> -#include "config.h"
> -
> -#include <inttypes.h>
> -#include <unistd.h>
> -#include <stdlib.h>
>  #include <stdio.h>
> -#include <err.h>
> -#include <sys/ioctl.h>
> -#include <sys/time.h>
> -#include <sys/wait.h>
> +#include <sys/types.h>
> +#include <dirent.h>
> +#include <stdint.h>
> +#include <assert.h>
>  #include <string.h>
> -#ifdef HAVE_TERMIOS_H
> -#include <termios.h>
> -#endif
> -#include "intel_io.h"
> -#include "instdone.h"
> -#include "intel_reg.h"
> -#include "intel_chipset.h"
> -#include "drmtest.h"
> -
> -#define  FORCEWAKE         0xA18C
> -#define  FORCEWAKE_ACK     0x130090
> -
> -#define SAMPLES_PER_SEC             10000
> -#define SAMPLES_TO_PERCENT_RATIO    (SAMPLES_PER_SEC / 100)
> -
> -#define MAX_NUM_TOP_BITS            100
> -
> -#define HAS_STATS_REGS(devid)          IS_965(devid)
> -
> -struct top_bit {
> -       struct instdone_bit *bit;
> -       int count;
> -} top_bits[MAX_NUM_TOP_BITS];
> -struct top_bit *top_bits_sorted[MAX_NUM_TOP_BITS];
> -
> -static uint32_t instdone, instdone1;
> -
> -static const char *bars[] = {
> -       " ",
> -       "▏",
> -       "▎",
> -       "▍",
> -       "▌",
> -       "▋",
> -       "▊",
> -       "▉",
> -       "█"
> -};
> +#include <ctype.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <sys/stat.h>
> +#include <fcntl.h>
> +#include <inttypes.h>
> +#include <sys/ioctl.h>
> +#include <errno.h>
> +#include <math.h>
> +#include <locale.h>
> +
> +#include "igt_perf.h"
>
> -enum stats_counts {
> -       IA_VERTICES,
> -       IA_PRIMITIVES,
> -       VS_INVOCATION,
> -       GS_INVOCATION,
> -       GS_PRIMITIVES,
> -       CL_INVOCATION,
> -       CL_PRIMITIVES,
> -       PS_INVOCATION,
> -       PS_DEPTH,
> -       STATS_COUNT
> +struct pmu_pair {
> +       uint64_t cur;
> +       uint64_t prev;
>  };
>
> -const uint32_t stats_regs[STATS_COUNT] = {
> -       IA_VERTICES_COUNT_QW,
> -       IA_PRIMITIVES_COUNT_QW,
> -       VS_INVOCATION_COUNT_QW,
> -       GS_INVOCATION_COUNT_QW,
> -       GS_PRIMITIVES_COUNT_QW,
> -       CL_INVOCATION_COUNT_QW,
> -       CL_PRIMITIVES_COUNT_QW,
> -       PS_INVOCATION_COUNT_QW,
> -       PS_DEPTH_COUNT_QW,
> +struct pmu_counter {
> +       bool present;
> +       uint64_t config;
> +       unsigned int idx;
> +       struct pmu_pair val;
>  };
>
> -const char *stats_reg_names[STATS_COUNT] = {
> -       "vert fetch",
> -       "prim fetch",
> -       "VS invocations",
> -       "GS invocations",
> -       "GS prims",
> -       "CL invocations",
> -       "CL prims",
> -       "PS invocations",
> -       "PS depth pass",
> +struct engine {
> +       const char *name;
> +       const char *display_name;
> +
> +       unsigned int class;
> +       unsigned int instance;
> +
> +       unsigned int num_counters;
> +
> +       struct pmu_counter busy;
> +       struct pmu_counter wait;
> +       struct pmu_counter sema;
>  };
>
> -uint64_t stats[STATS_COUNT];
> -uint64_t last_stats[STATS_COUNT];
> +struct engines {
> +       unsigned int num_engines;
> +       unsigned int num_counters;
> +       DIR *root;
> +       int fd;
> +       struct pmu_pair ts;
> +
> +       int rapl_fd;
> +       double rapl_scale;
> +       const char *rapl_unit;
> +
> +       int imc_fd;
> +       double imc_reads_scale;
> +       const char *imc_reads_unit;
> +       double imc_writes_scale;
> +       const char *imc_writes_unit;
> +
> +       struct pmu_counter freq_req;
> +       struct pmu_counter freq_act;
> +       struct pmu_counter irq;
> +       struct pmu_counter rc6;
> +       struct pmu_counter rapl;
> +       struct pmu_counter imc_reads;
> +       struct pmu_counter imc_writes;
> +
> +       struct engine engine;
> +};
>
> -static unsigned long
> -gettime(void)
> +static uint64_t
> +get_pmu_config(int dirfd, const char *name, const char *counter)
>  {
> -    struct timeval t;
> -    gettimeofday(&t, NULL);
> -    return (t.tv_usec + (t.tv_sec * 1000000));
> -}
> +       char buf[128], *p;
> +       int fd, ret;
>
> -static int
> -top_bits_sort(const void *a, const void *b)
> -{
> -       struct top_bit * const *bit_a = a;
> -       struct top_bit * const *bit_b = b;
> -       int a_count = (*bit_a)->count;
> -       int b_count = (*bit_b)->count;
> +       ret = snprintf(buf, sizeof(buf), "%s-%s", name, counter);
> +       if (ret < 0 || ret == sizeof(buf))
> +               return -1;
>
> -       if (a_count < b_count)
> -               return 1;
> -       else if (a_count == b_count)
> -               return 0;
> -       else
> +       fd = openat(dirfd, buf, O_RDONLY);
> +       if (fd < 0)
>                 return -1;
> -}
>
> -static void
> -update_idle_bit(struct top_bit *top_bit)
> -{
> -       uint32_t reg_val;
> +       ret = read(fd, buf, sizeof(buf));
> +       close(fd);
> +       if (ret <= 0)
> +               return -1;
>
> -       if (top_bit->bit->reg == INSTDONE_1)
> -               reg_val = instdone1;
> -       else
> -               reg_val = instdone;
> +       p = index(buf, '0');
> +       if (!p)
> +               return -1;
>
> -       if ((reg_val & top_bit->bit->bit) == 0)
> -               top_bit->count++;
> +       return strtoul(p, NULL, 0);
>  }
>
> -static void
> -print_clock(const char *name, int clock) {
> -       if (clock == -1)
> -               printf("%s clock: unknown", name);
> +#define engine_ptr(engines, n) (&engines->engine + (n))
> +
> +static const char *class_display_name(unsigned int class)
> +{
> +       switch (class) {
> +       case I915_ENGINE_CLASS_RENDER:
> +               return "Render/3D";
> +       case I915_ENGINE_CLASS_COPY:
> +               return "Blitter";
> +       case I915_ENGINE_CLASS_VIDEO:
> +               return "Video";
> +       case I915_ENGINE_CLASS_VIDEO_ENHANCE:
> +               return "VideoEnhance";
> +       default:
> +               return "[unknown]";
> +       }
> +}
> +
> +static int engine_cmp(const void *__a, const void *__b)
> +{
> +       const struct engine *a = (struct engine *)__a;
> +       const struct engine *b = (struct engine *)__b;
> +
> +       if (a->class != b->class)
> +               return a->class - b->class;
>         else
> -               printf("%s clock: %d Mhz", name, clock);
> +               return a->instance - b->instance;
>  }
>
> -static int
> -print_clock_info(struct pci_device *pci_dev)
> +static struct engines *discover_engines(void)
>  {
> -       uint32_t devid = pci_dev->device_id;
> -       uint16_t gcfgc;
> +       const char *sysfs_root = "/sys/devices/i915/events";
> +       struct engines *engines;
> +       struct dirent *dent;
> +       int ret = 0;
> +       DIR *d;
>
> -       if (IS_GM45(devid)) {
> -               int core_clock = -1;
> +       engines = malloc(sizeof(struct engines));
> +       if (!engines)
> +               return NULL;
>
> -               pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +       memset(engines, 0, sizeof(*engines));
>
> -               switch (gcfgc & 0xf) {
> -               case 8:
> -                       core_clock = 266;
> -                       break;
> -               case 9:
> -                       core_clock = 320;
> -                       break;
> -               case 11:
> -                       core_clock = 400;
> -                       break;
> -               case 13:
> -                       core_clock = 533;
> -                       break;
> -               }
> -               print_clock("core", core_clock);
> -       } else if (IS_965(devid) && IS_MOBILE(devid)) {
> -               int render_clock = -1, sampler_clock = -1;
> +       engines->num_engines = 0;
>
> -               pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +       d = opendir(sysfs_root);
> +       if (!d)
> +               return NULL;
>
> -               switch (gcfgc & 0xf) {
> -               case 2:
> -                       render_clock = 250; sampler_clock = 267;
> -                       break;
> -               case 3:
> -                       render_clock = 320; sampler_clock = 333;
> -                       break;
> -               case 4:
> -                       render_clock = 400; sampler_clock = 444;
> -                       break;
> -               case 5:
> -                       render_clock = 500; sampler_clock = 533;
> +       while ((dent = readdir(d)) != NULL) {
> +               const char *endswith = "-busy";
> +               const unsigned int endlen = strlen(endswith);
> +               struct engine *engine =
> +                               engine_ptr(engines, engines->num_engines);
> +               char buf[256];
> +
> +               if (dent->d_type != DT_REG)
> +                       continue;
> +
> +               if (strlen(dent->d_name) >= sizeof(buf)) {
> +                       ret = ENAMETOOLONG;
>                         break;
>                 }
>
> -               print_clock("render", render_clock);
> -               printf("  ");
> -               print_clock("sampler", sampler_clock);
> -       } else if (IS_945(devid) && IS_MOBILE(devid)) {
> -               int render_clock = -1, display_clock = -1;
> +               strcpy(buf, dent->d_name);
>
> -               pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +               /* xxxN-busy */
> +               if (strlen(buf) < (endlen + 4))
> +                       continue;
> +               if (strcmp(&buf[strlen(buf) - endlen], endswith))
> +                       continue;
>
> -               switch (gcfgc & 0x7) {
> -               case 0:
> -                       render_clock = 166;
> -                       break;
> -               case 1:
> -                       render_clock = 200;
> -                       break;
> -               case 3:
> -                       render_clock = 250;
> -                       break;
> -               case 5:
> -                       render_clock = 400;
> +               memset(engine, 0, sizeof(*engine));
> +
> +               buf[strlen(buf) - endlen] = 0;
> +               engine->name = strdup(buf);
> +               if (!engine->name) {
> +                       ret = errno;
>                         break;
>                 }
>
> -               switch (gcfgc & 0x70) {
> -               case 0:
> -                       display_clock = 200;
> -                       break;
> -               case 4:
> -                       display_clock = 320;
> +               engine->busy.config = get_pmu_config(dirfd(d), engine->name,
> +                                                    "busy");
> +               if (engine->busy.config == -1) {
> +                       ret = ENOENT;
>                         break;
>                 }
> -               if (gcfgc & (1 << 7))
> -                   display_clock = 133;
>
> -               print_clock("render", render_clock);
> -               printf("  ");
> -               print_clock("display", display_clock);
> -       } else if (IS_915(devid) && IS_MOBILE(devid)) {
> -               int render_clock = -1, display_clock = -1;
> +               engine->class = (engine->busy.config &
> +                                (__I915_PMU_OTHER(0) - 1)) >>
> +                               I915_PMU_CLASS_SHIFT;
>
> -               pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +               engine->instance = (engine->busy.config >>
> +                                   I915_PMU_SAMPLE_BITS) &
> +                                   ((1 << I915_PMU_SAMPLE_INSTANCE_BITS) - 1);
>
> -               switch (gcfgc & 0x7) {
> -               case 0:
> -                       render_clock = 160;
> -                       break;
> -               case 1:
> -                       render_clock = 190;
> -                       break;
> -               case 4:
> -                       render_clock = 333;
> +               ret = snprintf(buf, sizeof(buf), "%s/%u",
> +                              class_display_name(engine->class),
> +                              engine->instance);
> +               if (ret < 0 || ret == sizeof(buf)) {
> +                       ret = ENOBUFS;
>                         break;
>                 }
> -               if (gcfgc & (1 << 13))
> -                   render_clock = 133;
> +               ret = 0;
>
> -               switch (gcfgc & 0x70) {
> -               case 0:
> -                       display_clock = 190;
> +               engine->display_name = strdup(buf);
> +               if (!engine->display_name) {
> +                       ret = errno;
>                         break;
> -               case 4:
> -                       display_clock = 333;
> +               }
> +
> +               engines->num_engines++;
> +               engines = realloc(engines, sizeof(struct engines) +
> +                                 engines->num_engines * sizeof(struct engine));
> +               if (!engines) {
> +                       ret = errno;
>                         break;
>                 }
> -               if (gcfgc & (1 << 7))
> -                   display_clock = 133;
> +       }
> +
> +       if (ret) {
> +               free(engines);
> +               errno = ret;
>
> -               print_clock("render", render_clock);
> -               printf("  ");
> -               print_clock("display", display_clock);
> +               return NULL;
>         }
>
> +       qsort(engine_ptr(engines, 0), engines->num_engines,
> +             sizeof(struct engine), engine_cmp);
> +
> +       engines->root = d;
>
> -       printf("\n");
> -       return -1;
> +       return engines;
>  }
>
> -#define STATS_LEN (20)
> -#define PERCENTAGE_BAR_END     (79 - STATS_LEN)
> +static int
> +filename_to_buf(const char *filename, char *buf, unsigned int bufsize)
> +{
> +       int fd, err;
> +       ssize_t ret;
>
> -static void
> -print_percentage_bar(float percent, int cur_line_len)
> +       fd = open(filename, O_RDONLY);
> +       if (fd < 0)
> +               return -1;
> +
> +       ret = read(fd, buf, bufsize - 1);
> +       err = errno;
> +       close(fd);
> +       if (ret < 1) {
> +               errno = ret < 0 ? err : ENOMSG;
> +
> +               return -1;
> +       }
> +
> +       if (ret > 1 && buf[ret - 1] == '\n')
> +               buf[ret - 1] = '\0';
> +       else
> +               buf[ret] = '\0';
> +
> +       return 0;
> +}
> +
> +static uint64_t filename_to_u64(const char *filename, int base)
>  {
> -       int bar_avail_len = (PERCENTAGE_BAR_END - cur_line_len - 1) * 8;
> -       int bar_len = bar_avail_len * (percent + .5) / 100.0;
> -       int i;
> +       char buf[64], *b;
>
> -       for (i = bar_len; i >= 8; i -= 8) {
> -               printf("%s", bars[8]);
> -               cur_line_len++;
> +       if (filename_to_buf(filename, buf, sizeof(buf)))
> +               return 0;
> +
> +       /*
> +        * Handle both single integer and key=value formats by skipping
> +        * leading non-digits.
> +        */
> +       b = buf;
> +       while (*b && !isdigit(*b))
> +               b++;
> +
> +       return strtoull(b, NULL, base);
> +}
> +
> +static double filename_to_double(const char *filename)
> +{
> +       char *oldlocale;
> +       char buf[80];
> +       double v;
> +
> +       if (filename_to_buf(filename, buf, sizeof(buf)))
> +               return 0;
> +
> +       oldlocale = setlocale(LC_ALL, "C");
> +       v = strtod(buf, NULL);
> +       setlocale(LC_ALL, oldlocale);
> +
> +       return v;
> +}
> +
> +#define RAPL_ROOT "/sys/devices/power/"
> +#define RAPL_EVENT "/sys/devices/power/events/"
> +
> +static uint64_t rapl_type_id(void)
> +{
> +       return filename_to_u64(RAPL_ROOT "type", 10);
> +}
> +
> +static uint64_t rapl_gpu_power(void)
> +{
> +       return filename_to_u64(RAPL_EVENT "energy-gpu", 0);
> +}
> +
> +static double rapl_gpu_power_scale(void)
> +{
> +       return filename_to_double(RAPL_EVENT "energy-gpu.scale");
> +}
> +
> +static const char *rapl_gpu_power_unit(void)
> +{
> +       char buf[32];
> +
> +       if (filename_to_buf(RAPL_EVENT "energy-gpu.unit",
> +                           buf, sizeof(buf)) == 0)
> +               if (!strcmp(buf, "Joules"))
> +                       return strdup("Watts");
> +               else
> +                       return strdup(buf);
> +       else
> +               return NULL;
> +}
> +
> +#define IMC_ROOT "/sys/devices/uncore_imc/"
> +#define IMC_EVENT "/sys/devices/uncore_imc/events/"
> +
> +static uint64_t imc_type_id(void)
> +{
> +       return filename_to_u64(IMC_ROOT "type", 10);
> +}
> +
> +static uint64_t imc_data_reads(void)
> +{
> +       return filename_to_u64(IMC_EVENT "data_reads", 0);
> +}
> +
> +static double imc_data_reads_scale(void)
> +{
> +       return filename_to_double(IMC_EVENT "data_reads.scale");
> +}
> +
> +static const char *imc_data_reads_unit(void)
> +{
> +       char buf[32];
> +
> +       if (filename_to_buf(IMC_EVENT "data_reads.unit", buf, sizeof(buf)) == 0)
> +               return strdup(buf);
> +       else
> +               return NULL;
> +}
> +
> +static uint64_t imc_data_writes(void)
> +{
> +       return filename_to_u64(IMC_EVENT "data_writes", 0);
> +}
> +
> +static double imc_data_writes_scale(void)
> +{
> +       return filename_to_double(IMC_EVENT "data_writes.scale");
> +}
> +
> +static const char *imc_data_writes_unit(void)
> +{
> +       char buf[32];
> +
> +       if (filename_to_buf(IMC_EVENT "data_writes.unit",
> +                           buf, sizeof(buf)) == 0)
> +               return strdup(buf);
> +       else
> +               return NULL;
> +}
> +
> +#define _open_pmu(cnt, pmu, fd) \
> +({ \
> +       int fd__; \
> +\
> +       fd__ = perf_i915_open_group((pmu)->config, (fd)); \
> +       if (fd__ >= 0) { \
> +               if ((fd) == -1) \
> +                       (fd) = fd__; \
> +               (pmu)->present = true; \
> +               (pmu)->idx = (cnt)++; \
> +       } \
> +\
> +       fd__; \
> +})
> +
> +#define _open_imc(cnt, pmu, fd) \
> +({ \
> +       int fd__; \
> +\
> +       fd__ = igt_perf_open_group(imc_type_id(), (pmu)->config, (fd)); \
> +       if (fd__ >= 0) { \
> +               if ((fd) == -1) \
> +                       (fd) = fd__; \
> +               (pmu)->present = true; \
> +               (pmu)->idx = (cnt)++; \
> +       } \
> +\
> +       fd__; \
> +})
> +
> +static int pmu_init(struct engines *engines)
> +{
> +       unsigned int i;
> +       int fd;
> +
> +       engines->fd = -1;
> +       engines->num_counters = 0;
> +
> +       engines->irq.config = I915_PMU_INTERRUPTS;
> +       fd = _open_pmu(engines->num_counters, &engines->irq, engines->fd);
> +       if (fd < 0)
> +               return -1;
> +
> +       engines->freq_req.config = I915_PMU_REQUESTED_FREQUENCY;
> +       _open_pmu(engines->num_counters, &engines->freq_req, engines->fd);
> +
> +       engines->freq_act.config = I915_PMU_ACTUAL_FREQUENCY;
> +       _open_pmu(engines->num_counters, &engines->freq_act, engines->fd);
> +
> +       engines->rc6.config = I915_PMU_RC6_RESIDENCY;
> +       _open_pmu(engines->num_counters, &engines->rc6, engines->fd);
> +
> +       for (i = 0; i < engines->num_engines; i++) {
> +               struct engine *engine = engine_ptr(engines, i);
> +               struct {
> +                       struct pmu_counter *pmu;
> +                       const char *counter;
> +               } *cnt, counters[] = {
> +                       { .pmu = &engine->busy, .counter = "busy" },
> +                       { .pmu = &engine->wait, .counter = "wait" },
> +                       { .pmu = &engine->sema, .counter = "sema" },
> +                       { .pmu = NULL, .counter = NULL },
> +               };
> +
> +               for (cnt = counters; cnt->pmu; cnt++) {
> +                       if (!cnt->pmu->config)
> +                               cnt->pmu->config =
> +                                       get_pmu_config(dirfd(engines->root),
> +                                                      engine->name,
> +                                                      cnt->counter);
> +                       fd = _open_pmu(engines->num_counters, cnt->pmu,
> +                                      engines->fd);
> +                       if (fd >= 0)
> +                               engine->num_counters++;
> +               }
>         }
> -       if (i) {
> -               printf("%s", bars[i]);
> -               cur_line_len++;
> +
> +       engines->rapl_fd = -1;
> +       if (rapl_type_id()) {
> +               engines->rapl_scale = rapl_gpu_power_scale();
> +               engines->rapl_unit = rapl_gpu_power_unit();
> +               if (!engines->rapl_unit)
> +                       return -1;
> +
> +               engines->rapl.config = rapl_gpu_power();
> +               if (!engines->rapl.config)
> +                       return -1;
> +
> +               engines->rapl_fd = igt_perf_open(rapl_type_id(),
> +                                                engines->rapl.config);
> +               if (engines->rapl_fd < 0)
> +                       return -1;
> +
> +               engines->rapl.present = true;
>         }
>
> -       /* NB: We can't use a field width with utf8 so we manually
> -       * guarantee a field with of 45 chars for any bar. */
> -       printf("%*s", PERCENTAGE_BAR_END - cur_line_len, "");
> -}
> +       engines->imc_fd = -1;
> +       if (imc_type_id()) {
> +               unsigned int num = 0;
>
> -struct ring {
> -       const char *name;
> -       uint32_t mmio;
> -       int head, tail, size;
> -       uint64_t full;
> -       int idle;
> -};
> +               engines->imc_reads_scale = imc_data_reads_scale();
> +               engines->imc_writes_scale = imc_data_writes_scale();
> +
> +               engines->imc_reads_unit = imc_data_reads_unit();
> +               if (!engines->imc_reads_unit)
> +                       return -1;
> +
> +               engines->imc_writes_unit = imc_data_writes_unit();
> +               if (!engines->imc_writes_unit)
> +                       return -1;
> +
> +               engines->imc_reads.config = imc_data_reads();
> +               if (!engines->imc_reads.config)
> +                       return -1;
> +
> +               engines->imc_writes.config = imc_data_writes();
> +               if (!engines->imc_writes.config)
> +                       return -1;
> +
> +               fd = _open_imc(num, &engines->imc_reads, engines->imc_fd);
> +               if (fd < 0)
> +                       return -1;
> +               fd = _open_imc(num, &engines->imc_writes, engines->imc_fd);
> +               if (fd < 0)
> +                       return -1;
> +
> +               engines->imc_reads.present = true;
> +               engines->imc_writes.present = true;

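We can drop these two assignments: _open_imc() already sets ->present to true when the open succeeds, and we return early if either open fails.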

> +       }
> +
> +       return 0;
> +}
>
> -static uint32_t ring_read(struct ring *ring, uint32_t reg)
> +static uint64_t pmu_read_multi(int fd, unsigned int num, uint64_t *val)
>  {
> -       return INREG(ring->mmio + reg);
> +       uint64_t buf[2 + num];
> +       unsigned int i;
> +       ssize_t len;
> +
> +       memset(buf, 0, sizeof(buf));
> +
> +       len = read(fd, buf, sizeof(buf));
> +       assert(len == sizeof(buf));
> +
> +       for (i = 0; i < num; i++)
> +               val[i] = buf[2 + i];
> +
> +       return buf[1];
>  }
>
> -static void ring_init(struct ring *ring)
> +static double __pmu_calc(struct pmu_pair *p, double d, double t, double s)
>  {
> -       ring->size = (((ring_read(ring, RING_LEN) & RING_NR_PAGES) >> 12) + 1) * 4096;
> +       double v;
> +
> +       v = p->cur - p->prev;
> +       v /= d;
> +       v /= t;
> +       v *= s;
> +
> +       if (s == 100.0 && v > 100.0)
> +               v = 100.0;
> +
> +       return v;
>  }
>
> -static void ring_reset(struct ring *ring)
> +static void fill_str(char *buf, unsigned int bufsz, char c, unsigned int num)
>  {
> -       ring->idle = ring->full = 0;
> +       unsigned int i;
> +
> +       for (i = 0; i < num && i < (bufsz - 1); i++)
> +               *buf++ = c;
> +
> +       *buf = 0;
>  }
>
> -static void ring_sample(struct ring *ring)
> +static void pmu_calc(struct pmu_counter *cnt,
> +                    char *buf, unsigned int bufsz,
> +                    unsigned int width, unsigned width_dec,
> +                    double d, double t, double s)
>  {
> -       int full;
> +       double val;
> +       int len;
> +
> +       assert(bufsz >= (width + width_dec + 1));
> +
> +       if (!cnt->present) {
> +               fill_str(buf, bufsz, '-', width + width_dec);
> +               return;
> +       }
>
> -       if (!ring->size)
> +       val = __pmu_calc(&cnt->val, d, t, s);
> +
> +       len = snprintf(buf, bufsz, "%*.*f", width + width_dec, width_dec, val);
> +       if (len < 0 || len == bufsz) {
> +               fill_str(buf, bufsz, 'X', width + width_dec);
>                 return;
> +       }
> +}
> +
> +static uint64_t __pmu_read_single(int fd, uint64_t *ts)
> +{
> +       uint64_t data[2] = { };
> +       ssize_t len;
>
> -       ring->head = ring_read(ring, RING_HEAD) & HEAD_ADDR;
> -       ring->tail = ring_read(ring, RING_TAIL) & TAIL_ADDR;
> +       len = read(fd, data, sizeof(data));
> +       assert(len == sizeof(data));
>
> -       if (ring->tail == ring->head)
> -               ring->idle++;
> +       if (ts)
> +               *ts = data[1];
>
> -       full = ring->tail - ring->head;
> -       if (full < 0)
> -               full += ring->size;
> -       ring->full += full;
> +       return data[0];
>  }
>
> -static void ring_print_header(FILE *out, struct ring *ring)
> +static uint64_t pmu_read_single(int fd)
>  {
> -    fprintf(out, "%.6s%%\tops\t",
> -            ring->name
> -          );
> +       return __pmu_read_single(fd, NULL);
>  }
>
> -static void ring_print(struct ring *ring, unsigned long samples_per_sec)
> +static void __update_sample(struct pmu_counter *counter, uint64_t val)
>  {
> -       int percent_busy, len;
> +       counter->val.prev = counter->val.cur;
> +       counter->val.cur = val;
> +}
>
> -       if (!ring->size)
> -               return;
> +static void update_sample(struct pmu_counter *counter, uint64_t *val)
> +{
> +       if (counter->present)
> +               __update_sample(counter, val[counter->idx]);
> +}
> +
> +static void pmu_sample(struct engines *engines)
> +{
> +       const int num_val = engines->num_counters;
> +       uint64_t val[2 + num_val];
> +       unsigned int i;
> +
> +       engines->ts.prev = engines->ts.cur;
> +
> +       if (engines->rapl_fd >= 0)
> +               __update_sample(&engines->rapl,
> +                               pmu_read_single(engines->rapl_fd));
> +
> +       if (engines->imc_fd >= 0) {
> +               pmu_read_multi(engines->imc_fd, 2, val);
> +               update_sample(&engines->imc_reads, val);
> +               update_sample(&engines->imc_writes, val);
> +       }
>
> -       percent_busy = 100 - 100 * ring->idle / samples_per_sec;
> +       engines->ts.cur = pmu_read_multi(engines->fd, num_val, val);
>
> -       len = printf("%25s busy: %3d%%: ", ring->name, percent_busy);
> -       print_percentage_bar (percent_busy, len);
> -       printf("%24s space: %d/%d\n",
> -                  ring->name,
> -                  (int)(ring->full / samples_per_sec),
> -                  ring->size);
> +       update_sample(&engines->freq_req, val);
> +       update_sample(&engines->freq_act, val);
> +       update_sample(&engines->irq, val);
> +       update_sample(&engines->rc6, val);
> +
> +       for (i = 0; i < engines->num_engines; i++) {
> +               struct engine *engine = engine_ptr(engines, i);
> +
> +               update_sample(&engine->busy, val);
> +               update_sample(&engine->sema, val);
> +               update_sample(&engine->wait, val);
> +       }
>  }
>
> -static void ring_log(struct ring *ring, unsigned long samples_per_sec,
> -               FILE *output)
> +static const char *bars[] = { " ", "▏", "▎", "▍", "▌", "▋", "▊", "▉", "█" };
> +
> +static void
> +print_percentage_bar(double percent, int max_len)
>  {
> -       if (ring->size)
> -               fprintf(output, "%3d\t%d\t",
> -                       (int)(100 - 100 * ring->idle / samples_per_sec),
> -                       (int)(ring->full / samples_per_sec));
> -       else
> -               fprintf(output, "-1\t-1\t");
> +       int bar_len = percent * (8 * (max_len - 2)) / 100.0;
> +       int i;
> +
> +       putchar('|');
> +
> +       for (i = bar_len; i >= 8; i -= 8)
> +               printf("%s", bars[8]);
> +       if (i)
> +               printf("%s", bars[i]);
> +
> +       for (i = 0; i < (max_len - 2 - (bar_len + 7) / 8); i++)
> +               putchar(' ');
> +
> +       putchar('|');
>  }
>
> +#define DEFAULT_PERIOD_MS (1000)
> +
>  static void
>  usage(const char *appname)
>  {
>         printf("intel_gpu_top - Display a top-like summary of Intel GPU usage\n"
> -                       "\n"
> -                       "usage: %s [parameters]\n"
> -                       "\n"
> -                       "The following parameters apply:\n"
> -                       "[-s <samples>]       samples per seconds (default %d)\n"
> -                       "[-e <command>]       command to profile\n"
> -                       "[-o <file>]          output statistics to file. If file is '-',"
> -                       "                     run in batch mode and output statistics to stdio only \n"
> -                       "[-h]                 show this help screen\n"
> -                       "\n",
> -                       appname,
> -                       SAMPLES_PER_SEC
> -                 );
> -       return;
> +               "\n"
> +               "Usage: %s [parameters]\n"
> +               "\n"
> +               "\tThe following parameters are optional:\n\n"
> +               "\t[-s <ms>]       Refresh period in milliseconds (default %ums).\n"
> +               "\t[-h]            Show this help text.\n"
> +               "\n",
> +               appname, DEFAULT_PERIOD_MS);
>  }
>
>  int main(int argc, char **argv)
>  {
> -       uint32_t devid;
> -       struct pci_device *pci_dev;
> -       struct ring render_ring = {
> -               .name = "render",
> -               .mmio = 0x2030,
> -       }, bsd_ring = {
> -               .name = "bitstream",
> -               .mmio = 0x4030,
> -       }, bsd6_ring = {
> -               .name = "bitstream",
> -               .mmio = 0x12030,
> -       }, blt_ring = {
> -               .name = "blitter",
> -               .mmio = 0x22030,
> -       };
> -       int i, ch;
> -       int samples_per_sec = SAMPLES_PER_SEC;
> -       FILE *output = NULL;
> -       double elapsed_time=0;
> -       int print_headers=1;
> -       pid_t child_pid=-1;
> -       int child_stat;
> -       char *cmd=NULL;
> -       int interactive=1;
> -
> -       /* Parse options? */
> -       while ((ch = getopt(argc, argv, "s:o:e:h")) != -1) {
> +       unsigned int period_us = DEFAULT_PERIOD_MS * 1000;
> +       int con_w = -1, con_h = -1;
> +       struct engines *engines;
> +       unsigned int i;
> +       int ret, ch;
> +
> +       /* Parse options */
> +       while ((ch = getopt(argc, argv, "s:h")) != -1) {
>                 switch (ch) {
> -               case 'e': cmd = strdup(optarg);
> -                       break;
> -               case 's': samples_per_sec = atoi(optarg);
> -                       if (samples_per_sec < 100) {
> -                               fprintf(stderr, "Error: samples per second must be >= 100\n");
> -                               exit(1);
> -                       }
> -                       break;
> -               case 'o':
> -                       if (!strcmp(optarg, "-")) {
> -                               /* Running in non-interactive mode */
> -                               interactive = 0;
> -                               output = stdout;
> -                       }
> -                       else
> -                               output = fopen(optarg, "w");
> -                       if (!output)
> -                       {
> -                               perror("fopen");
> -                               exit(1);
> -                       }
> +               case 's':
> +                       period_us = atoi(optarg) * 1000;
>                         break;
>                 case 'h':
>                         usage(argv[0]);
>                         exit(0);
> -                       break;
>                 default:
> -                       fprintf(stderr, "Invalid flag %c!\n", (char)optopt);
> +                       fprintf(stderr, "Invalid option %c!\n", (char)optopt);
>                         usage(argv[0]);
>                         exit(1);
> -                       break;
>                 }
>         }
>
> -       pci_dev = intel_get_pci_device();
> -       devid = pci_dev->device_id;
> -       intel_mmio_use_pci_bar(pci_dev);
> -       init_instdone_definitions(devid);
> -
> -       /* Do we have a command to run? */
> -       if (cmd != NULL) {
> -               if (output) {
> -                       fprintf(output, "# Profiling: %s\n", cmd);
> -                       fflush(output);
> -               }
> -               child_pid = fork();
> -               if (child_pid < 0) {
> -                       perror("fork");
> -                       exit(1);
> -               }
> -               else if (child_pid == 0) {
> -                       int res;
> -                       res = system(cmd);
> -                       if (res < 0)
> -                               perror("running command");
> -                       if (output) {
> -                               fflush(output);
> -                               fprintf(output, "# %s exited with status %d\n", cmd, res);
> -                               fflush(output);
> -                       }
> -                       free(cmd);
> -                       exit(0);
> -               } else {
> -                       free(cmd);
> -               }
> +       engines = discover_engines();
> +       if (!engines) {
> +               fprintf(stderr,
> +                       "Failed to detect engines! (%s)\n(Kernel 4.16 or newer is required for i915 PMU support.)\n",
> +                       strerror(errno));
> +               return 1;
>         }
>
> -       for (i = 0; i < num_instdone_bits; i++) {
> -               top_bits[i].bit = &instdone_bits[i];
> -               top_bits[i].count = 0;
> -               top_bits_sorted[i] = &top_bits[i];
> +       ret = pmu_init(engines);
> +       if (ret) {
> +               fprintf(stderr,
> +                       "Failed to initialize PMU! (%s)\n", strerror(errno));
> +               return 1;
>         }
>
> -       /* Grab access to the registers */
> -       intel_register_access_init(pci_dev, 0, -1);
> +       pmu_sample(engines);
>
> -       ring_init(&render_ring);
> -       if (IS_GEN4(devid) || IS_GEN5(devid))
> -               ring_init(&bsd_ring);
> -       if (IS_GEN6(devid) || IS_GEN7(devid)) {
> -               ring_init(&bsd6_ring);
> -               ring_init(&blt_ring);
> -       }
> +       for (;;) {
> +               double t;
> +#define BUFSZ 16
> +               char freq[BUFSZ];
> +               char fact[BUFSZ];
> +               char irq[BUFSZ];
> +               char rc6[BUFSZ];
> +               char power[BUFSZ];
> +               char reads[BUFSZ];
> +               char writes[BUFSZ];
> +               struct winsize ws;
> +               int lines = 0;
>
> -       /* Initialize GPU stats */
> -       if (HAS_STATS_REGS(devid)) {
> -               for (i = 0; i < STATS_COUNT; i++) {
> -                       uint32_t stats_high, stats_low, stats_high_2;
> +               /* Update terminal size. */
> +               if (ioctl(0, TIOCGWINSZ, &ws) != -1) {
> +                       con_w = ws.ws_col;
> +                       con_h = ws.ws_row;
> +               }
>
> -                       do {
> -                               stats_high = INREG(stats_regs[i] + 4);
> -                               stats_low = INREG(stats_regs[i]);
> -                               stats_high_2 = INREG(stats_regs[i] + 4);
> -                       } while (stats_high != stats_high_2);
> +               pmu_sample(engines);
> +               t = (double)(engines->ts.cur - engines->ts.prev) / 1e9;
>
> -                       last_stats[i] = (uint64_t)stats_high << 32 |
> -                               stats_low;
> -               }
> -       }
> +               printf("\033[H\033[J");
>
> -       for (;;) {
> -               int j;
> -               unsigned long long t1, ti, tf, t2;
> -               unsigned long long def_sleep = 1000000 / samples_per_sec;
> -               unsigned long long last_samples_per_sec = samples_per_sec;
> -               unsigned short int max_lines;
> -               struct winsize ws;
> -               char clear_screen[] = {0x1b, '[', 'H',
> -                                      0x1b, '[', 'J',
> -                                      0x0};
> -               int percent;
> -               int len;
> -
> -               t1 = gettime();
> -
> -               ring_reset(&render_ring);
> -               ring_reset(&bsd_ring);
> -               ring_reset(&bsd6_ring);
> -               ring_reset(&blt_ring);
> -
> -               for (i = 0; i < samples_per_sec; i++) {
> -                       long long interval;
> -                       ti = gettime();
> -                       if (IS_965(devid)) {
> -                               instdone = INREG(INSTDONE_I965);
> -                               instdone1 = INREG(INSTDONE_1);
> -                       } else
> -                               instdone = INREG(INSTDONE);
> -
> -                       for (j = 0; j < num_instdone_bits; j++)
> -                               update_idle_bit(&top_bits[j]);
> -
> -                       ring_sample(&render_ring);
> -                       ring_sample(&bsd_ring);
> -                       ring_sample(&bsd6_ring);
> -                       ring_sample(&blt_ring);
> -
> -                       tf = gettime();
> -                       if (tf - t1 >= 1000000) {
> -                               /* We are out of sync, bail out */
> -                               last_samples_per_sec = i+1;
> -                               break;
> -                       }
> -                       interval = def_sleep - (tf - ti);
> -                       if (interval > 0)
> -                               usleep(interval);
> -               }
> +               pmu_calc(&engines->freq_req, freq, BUFSZ, 4, 0, 1.0, t, 1);
> +               pmu_calc(&engines->freq_act, fact, BUFSZ, 4, 0, 1.0, t, 1);
> +               pmu_calc(&engines->irq, irq, BUFSZ, 8, 0, 1.0, t, 1);
> +               pmu_calc(&engines->rc6, rc6, BUFSZ, 3, 0, 1e9, t, 100);
> +               pmu_calc(&engines->rapl, power, BUFSZ, 4, 2, 1.0, t,
> +                        engines->rapl_scale);
> +               pmu_calc(&engines->imc_reads, reads, BUFSZ, 6, 0, 1.0, t,
> +                        engines->imc_reads_scale);
> +               pmu_calc(&engines->imc_writes, writes, BUFSZ, 6, 0, 1.0, t,
> +                        engines->imc_writes_scale);
>
> -               if (HAS_STATS_REGS(devid)) {
> -                       for (i = 0; i < STATS_COUNT; i++) {
> -                               uint32_t stats_high, stats_low, stats_high_2;
> +               if (lines++ < con_h)
> +                       printf("intel-gpu-top - %s/%s MHz;  %s%% RC6; %s %s; %s irqs/s\n",
> +                              fact, freq, rc6, power, engines->rapl_unit, irq);
>
> -                               do {
> -                                       stats_high = INREG(stats_regs[i] + 4);
> -                                       stats_low = INREG(stats_regs[i]);
> -                                       stats_high_2 = INREG(stats_regs[i] + 4);
> -                               } while (stats_high != stats_high_2);
> +               if (lines++ < con_h)
> +                       printf("\n");
>
> -                               stats[i] = (uint64_t)stats_high << 32 |
> -                                       stats_low;
> -                       }
> -               }
> +               if (engines->imc_fd) {
> +                       if (lines++ < con_h)
> +                               printf("      IMC reads:   %s %s/s\n",
> +                                      reads, engines->imc_reads_unit);
> +
> +                       if (lines++ < con_h)
> +                               printf("     IMC writes:   %s %s/s\n",
> +                                      writes, engines->imc_writes_unit);
>
> -               qsort(top_bits_sorted, num_instdone_bits,
> -                     sizeof(struct top_bit *), top_bits_sort);
> -
> -               /* Limit the number of lines printed to the terminal height so the
> -                * most important info (at the top) will stay on screen. */
> -               max_lines = -1;
> -               if (ioctl(0, TIOCGWINSZ, &ws) != -1)
> -                       max_lines = ws.ws_row - 6; /* exclude header lines */
> -               if (max_lines >= num_instdone_bits)
> -                       max_lines = num_instdone_bits;
> -
> -               t2 = gettime();
> -               elapsed_time += (t2 - t1) / 1000000.0;
> -
> -               if (interactive) {
> -                       printf("%s", clear_screen);
> -                       print_clock_info(pci_dev);
> -
> -                       ring_print(&render_ring, last_samples_per_sec);
> -                       ring_print(&bsd_ring, last_samples_per_sec);
> -                       ring_print(&bsd6_ring, last_samples_per_sec);
> -                       ring_print(&blt_ring, last_samples_per_sec);
> -
> -                       printf("\n%30s  %s\n", "task", "percent busy");
> -                       for (i = 0; i < max_lines; i++) {
> -                               if (top_bits_sorted[i]->count > 0) {
> -                                       percent = (top_bits_sorted[i]->count * 100) /
> -                                               last_samples_per_sec;
> -                                       len = printf("%30s: %3d%%: ",
> -                                                        top_bits_sorted[i]->bit->name,
> -                                                        percent);
> -                                       print_percentage_bar (percent, len);
> -                               } else {
> -                                       printf("%*s", PERCENTAGE_BAR_END, "");
> -                               }
> -
> -                               if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
> -                                       printf("%13s: %llu (%lld/sec)",
> -                                                  stats_reg_names[i],
> -                                                  (long long)stats[i],
> -                                                  (long long)(stats[i] - last_stats[i]));
> -                                       last_stats[i] = stats[i];
> -                               } else {
> -                                       if (!top_bits_sorted[i]->count)
> -                                               break;
> -                               }
> +                       if (++lines < con_h)
>                                 printf("\n");
> -                       }
>                 }
> -               if (output) {
> -                       /* Print headers for columns at first run */
> -                       if (print_headers) {
> -                               fprintf(output, "# time\t");
> -                               ring_print_header(output, &render_ring);
> -                               ring_print_header(output, &bsd_ring);
> -                               ring_print_header(output, &bsd6_ring);
> -                               ring_print_header(output, &blt_ring);
> -                               for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
> -                                       if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
> -                                               fprintf(output, "%.6s\t",
> -                                                          stats_reg_names[i]
> -                                                          );
> -                                       }
> -                                       if (!top_bits[i].count)
> -                                               continue;
> -                               }
> -                               fprintf(output, "\n");
> -                               print_headers = 0;
> -                       }
>
> -                       /* Print statistics */
> -                       fprintf(output, "%.2f\t", elapsed_time);
> -                       ring_log(&render_ring, last_samples_per_sec, output);
> -                       ring_log(&bsd_ring, last_samples_per_sec, output);
> -                       ring_log(&bsd6_ring, last_samples_per_sec, output);
> -                       ring_log(&blt_ring, last_samples_per_sec, output);
> -
> -                       for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
> -                               if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
> -                                       fprintf(output, "%"PRIu64"\t",
> -                                                  stats[i] - last_stats[i]);
> -                                       last_stats[i] = stats[i];
> -                               }
> -                                       if (!top_bits[i].count)
> -                                               continue;
> -                       }
> -                       fprintf(output, "\n");
> -                       fflush(output);
> -               }
> +               for (i = 0; i < engines->num_engines; i++) {
> +                       struct engine *engine = engine_ptr(engines, i);
>
> -               for (i = 0; i < num_instdone_bits; i++) {
> -                       top_bits_sorted[i]->count = 0;
> +                       if (engine->num_counters && lines < con_h) {
> +                               const char *a = "          ENGINE      BUSY ";
> +                               const char *b = " MI_SEMA MI_WAIT";
>
> -                       if (i < STATS_COUNT)
> -                               last_stats[i] = stats[i];
> +                               printf("\033[7m%s%*s%s\033[0m\n",
> +                                      a,
> +                                      (int)(con_w - 1 - strlen(a) - strlen(b)),
> +                                      " ", b);
> +                               lines++;
> +                               break;
> +                       }
>                 }
>
> -               /* Check if child has gone */
> -               if (child_pid > 0) {
> -                       int res;
> -                       if ((res = waitpid(child_pid, &child_stat, WNOHANG)) == -1) {
> -                               perror("waitpid");
> -                               exit(1);
> -                       }
> -                       if (res == 0)
> +               for (i = 0; i < engines->num_engines && lines < con_h; i++) {
> +                       struct engine *engine = engine_ptr(engines, i);
> +                       unsigned int max_w = con_w - 1;
> +                       unsigned int len;
> +                       char sema[BUFSZ];
> +                       char wait[BUFSZ];
> +                       char busy[BUFSZ];
> +                       char buf[128];
> +                       double val;
> +
> +                       if (!engine->num_counters)
>                                 continue;
> -                       if (WIFEXITED(child_stat))
> -                               break;
> +
> +                       pmu_calc(&engine->sema, sema, BUFSZ, 3, 0, 1e9, t, 100);
> +                       pmu_calc(&engine->wait, wait, BUFSZ, 3, 0, 1e9, t, 100);
> +                       len = snprintf(buf, sizeof(buf), "    %s%%    %s%%",
> +                                      sema, wait);
> +
> +                       pmu_calc(&engine->busy, busy, BUFSZ, 6, 2, 1e9, t,
> +                                100);
> +                       len += printf("%16s %s%% ", engine->display_name, busy);
> +
> +                       val = __pmu_calc(&engine->busy.val, 1e9, t, 100);
> +                       print_percentage_bar(val, max_w - len);

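Both max_w and len are unsigned, so max_w - len wraps around to a huge
value when the terminal is narrower than what has already been printed.
Maybe guard the call:

if (max_w > len + 2)
     print_percentage_bar(val, max_w - len);

or something? Meh.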

fwiw,
Reviewed-by: Matthew Auld <matthew.auld@intel.com>
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 57+ messages in thread

end of thread, other threads:[~2018-05-29  9:58 UTC | newest]

Thread overview: 57+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-03-28 18:29 [RFC i-g-t] intel-gpu-top: Rewrite the tool to be safe to use Tvrtko Ursulin
2018-03-28 18:29 ` [Intel-gfx] " Tvrtko Ursulin
2018-03-28 18:35 ` Chris Wilson
2018-03-28 18:35   ` [igt-dev] " Chris Wilson
2018-03-28 18:56 ` Lionel Landwerlin
2018-03-28 18:56   ` [igt-dev] " Lionel Landwerlin
2018-03-28 20:11 ` Rinat Ibragimov
2018-03-29 10:49   ` Tvrtko Ursulin
2018-03-29 10:49     ` [Intel-gfx] " Tvrtko Ursulin
2018-03-29  0:40 ` [igt-dev] ✓ Fi.CI.BAT: success for " Patchwork
2018-03-29  8:20 ` [RFC i-g-t] " Petri Latvala
2018-03-29  8:20   ` [Intel-gfx] " Petri Latvala
2018-03-29  9:46 ` [igt-dev] ✓ Fi.CI.IGT: success for " Patchwork
2018-03-29 10:33 ` [PATCH i-g-t v2] " Tvrtko Ursulin
2018-03-29 10:33   ` [igt-dev] " Tvrtko Ursulin
2018-03-29 14:30   ` Eero Tamminen
2018-03-29 14:30     ` [Intel-gfx] " Eero Tamminen
2018-03-29 18:46     ` [PATCH i-g-t v3] " Tvrtko Ursulin
2018-03-29 18:46       ` [igt-dev] " Tvrtko Ursulin
2018-03-30 19:15       ` Rinat Ibragimov
2018-04-03  9:14         ` [PATCH i-g-t v4] " Tvrtko Ursulin
2018-04-03  9:14           ` [Intel-gfx] " Tvrtko Ursulin
2018-04-03  9:38         ` [igt-dev] [PATCH i-g-t v3] " Tvrtko Ursulin
2018-04-03  9:38           ` Tvrtko Ursulin
2018-03-29 19:10     ` [igt-dev] ✓ Fi.CI.BAT: success for intel-gpu-top: Rewrite the tool to be safe to use (rev3) Patchwork
2018-03-29 23:29     ` [igt-dev] ✗ Fi.CI.IGT: warning " Patchwork
2018-04-03  9:36     ` [igt-dev] [PATCH i-g-t v2] intel-gpu-top: Rewrite the tool to be safe to use Tvrtko Ursulin
2018-04-03  9:36       ` [Intel-gfx] " Tvrtko Ursulin
2018-04-03 14:06       ` Eero Tamminen
2018-04-03 14:06         ` [Intel-gfx] " Eero Tamminen
2018-04-03 17:18         ` Tvrtko Ursulin
2018-04-03 17:18           ` Tvrtko Ursulin
2018-04-04 12:15           ` Eero Tamminen
2018-04-04 12:15             ` [Intel-gfx] " Eero Tamminen
2018-04-04 12:42             ` Tvrtko Ursulin
2018-04-04 12:42               ` [Intel-gfx] " Tvrtko Ursulin
2018-04-04 14:23               ` Eero Tamminen
2018-04-04 14:23                 ` Eero Tamminen
2018-04-04 15:24                 ` Tvrtko Ursulin
2018-04-04 15:24                   ` Tvrtko Ursulin
2018-03-29 16:27   ` Chris Wilson
2018-03-29 16:27     ` [igt-dev] " Chris Wilson
2018-03-29 15:59 ` [igt-dev] ✓ Fi.CI.BAT: success for intel-gpu-top: Rewrite the tool to be safe to use (rev2) Patchwork
2018-03-29 20:23 ` [igt-dev] ✗ Fi.CI.IGT: failure " Patchwork
2018-04-04  9:48 ` [PATCH i-g-t v5] intel-gpu-top: Rewrite the tool to be safe to use Tvrtko Ursulin
2018-04-04  9:48   ` [igt-dev] " Tvrtko Ursulin
2018-04-04 12:48   ` Rinat Ibragimov
2018-04-04 15:26   ` [PATCH i-g-t v6] " Tvrtko Ursulin
2018-04-04 15:26     ` [igt-dev] " Tvrtko Ursulin
2018-04-09 12:26     ` Tvrtko Ursulin
2018-04-09 12:26       ` Tvrtko Ursulin
2018-04-23 12:18       ` Rinat Ibragimov
2018-05-29  9:58     ` Matthew Auld
2018-05-29  9:58       ` Matthew Auld
2018-04-04 13:27 ` [igt-dev] ✓ Fi.CI.BAT: success for intel-gpu-top: Rewrite the tool to be safe to use (rev4) Patchwork
2018-04-04 15:53 ` [igt-dev] ✗ Fi.CI.BAT: failure for intel-gpu-top: Rewrite the tool to be safe to use (rev5) Patchwork
2018-04-04 16:56 ` [igt-dev] ✗ Fi.CI.IGT: warning for intel-gpu-top: Rewrite the tool to be safe to use (rev4) Patchwork

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.