All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH i-g-t v2 0/7] IGT PMU support
@ 2017-09-25 15:14 Tvrtko Ursulin
  2017-09-25 15:14 ` [PATCH i-g-t 1/7] intel-gpu-overlay: Move local perf implementation to a library Tvrtko Ursulin
                   ` (8 more replies)
  0 siblings, 9 replies; 25+ messages in thread
From: Tvrtko Ursulin @ 2017-09-25 15:14 UTC (permalink / raw)
  To: Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

1.
Fixes for intel-gpu-overlay to work on top of the proposed i915 PMU perf API.

2.
New test to exercise the same API.

3.
Update to gem_wsim and media-bench.pl to be able to use engine busyness via PMU
for making balancing decisions.

v2:
 * Added gem_wsim and media-bench.pl patches.
 * Comments and fixes for the perf_pmu test.

Tvrtko Ursulin (7):
  intel-gpu-overlay: Move local perf implementation to a library
  intel-gpu-overlay: Consolidate perf PMU access to library
  intel-gpu-overlay: Fix interrupts PMU readout
  intel-gpu-overlay: Catch-up to new i915 PMU
  tests/perf_pmu: Tests for i915 PMU API
  gem_wsim: Busy stats balancers
  media-bench.pl: Add busy balancers to the list

 benchmarks/gem_wsim.c    | 140 ++++++++
 lib/Makefile.sources     |   2 +
 lib/igt_gt.c             |  50 +++
 lib/igt_gt.h             |  38 +++
 lib/igt_perf.c           |  59 ++++
 lib/igt_perf.h           | 100 ++++++
 overlay/Makefile.am      |   6 +-
 overlay/gem-interrupts.c |  25 +-
 overlay/gpu-freq.c       |  25 +-
 overlay/gpu-perf.c       |   3 +-
 overlay/gpu-top.c        |  87 +++--
 overlay/perf.c           |  26 --
 overlay/perf.h           |  64 ----
 overlay/power.c          |  22 +-
 overlay/rc6.c            |  27 +-
 scripts/media-bench.pl   |   5 +-
 tests/Makefile.sources   |   1 +
 tests/perf_pmu.c         | 869 +++++++++++++++++++++++++++++++++++++++++++++++
 18 files changed, 1326 insertions(+), 223 deletions(-)
 create mode 100644 lib/igt_perf.c
 create mode 100644 lib/igt_perf.h
 delete mode 100644 overlay/perf.c
 delete mode 100644 overlay/perf.h
 create mode 100644 tests/perf_pmu.c

-- 
2.9.5

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH i-g-t 1/7] intel-gpu-overlay: Move local perf implementation to a library
  2017-09-25 15:14 [PATCH i-g-t v2 0/7] IGT PMU support Tvrtko Ursulin
@ 2017-09-25 15:14 ` Tvrtko Ursulin
  2017-09-25 15:22   ` Chris Wilson
  2017-09-25 15:14 ` [PATCH i-g-t 2/7] intel-gpu-overlay: Consolidate perf PMU access to library Tvrtko Ursulin
                   ` (7 subsequent siblings)
  8 siblings, 1 reply; 25+ messages in thread
From: Tvrtko Ursulin @ 2017-09-25 15:14 UTC (permalink / raw)
  To: Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 lib/Makefile.sources             | 2 ++
 overlay/perf.c => lib/igt_perf.c | 2 +-
 overlay/perf.h => lib/igt_perf.h | 2 ++
 overlay/Makefile.am              | 6 ++----
 overlay/gem-interrupts.c         | 3 ++-
 overlay/gpu-freq.c               | 3 ++-
 overlay/gpu-perf.c               | 3 ++-
 overlay/gpu-top.c                | 3 ++-
 overlay/power.c                  | 3 ++-
 overlay/rc6.c                    | 3 ++-
 10 files changed, 19 insertions(+), 11 deletions(-)
 rename overlay/perf.c => lib/igt_perf.c (94%)
 rename overlay/perf.h => lib/igt_perf.h (99%)

diff --git a/lib/Makefile.sources b/lib/Makefile.sources
index 53fdb54cbfa5..c031cb502469 100644
--- a/lib/Makefile.sources
+++ b/lib/Makefile.sources
@@ -16,6 +16,8 @@ lib_source_list =	 	\
 	igt_gt.h		\
 	igt_gvt.c		\
 	igt_gvt.h		\
+	igt_perf.c		\
+	igt_perf.h		\
 	igt_primes.c		\
 	igt_primes.h		\
 	igt_rand.c		\
diff --git a/overlay/perf.c b/lib/igt_perf.c
similarity index 94%
rename from overlay/perf.c
rename to lib/igt_perf.c
index b8fdc675c587..45cccff0ae53 100644
--- a/overlay/perf.c
+++ b/lib/igt_perf.c
@@ -3,7 +3,7 @@
 #include <unistd.h>
 #include <stdlib.h>
 
-#include "perf.h"
+#include "igt_perf.h"
 
 uint64_t i915_type_id(void)
 {
diff --git a/overlay/perf.h b/lib/igt_perf.h
similarity index 99%
rename from overlay/perf.h
rename to lib/igt_perf.h
index c44e65f9734c..a80b311cd1d1 100644
--- a/overlay/perf.h
+++ b/lib/igt_perf.h
@@ -1,6 +1,8 @@
 #ifndef I915_PERF_H
 #define I915_PERF_H
 
+#include <stdint.h>
+
 #include <linux/perf_event.h>
 
 #define I915_SAMPLE_BUSY	0
diff --git a/overlay/Makefile.am b/overlay/Makefile.am
index 5472514efc16..c66a80f4e571 100644
--- a/overlay/Makefile.am
+++ b/overlay/Makefile.am
@@ -4,8 +4,8 @@ endif
 
 AM_CPPFLAGS = -I.
 AM_CFLAGS = $(DRM_CFLAGS) $(PCIACCESS_CFLAGS) $(CWARNFLAGS) \
-	$(CAIRO_CFLAGS) $(OVERLAY_CFLAGS) $(WERROR_CFLAGS)
-LDADD = $(DRM_LIBS) $(PCIACCESS_LIBS) $(CAIRO_LIBS) $(OVERLAY_LIBS)
+	$(CAIRO_CFLAGS) $(OVERLAY_CFLAGS) $(WERROR_CFLAGS) -I$(srcdir)/../lib
+LDADD = $(DRM_LIBS) $(PCIACCESS_LIBS) $(CAIRO_LIBS) $(OVERLAY_LIBS) $(top_builddir)/lib/libintel_tools.la
 
 intel_gpu_overlay_SOURCES = \
 	chart.h \
@@ -29,8 +29,6 @@ intel_gpu_overlay_SOURCES = \
 	igfx.c \
 	overlay.h \
 	overlay.c \
-	perf.h \
-	perf.c \
 	power.h \
 	power.c \
 	rc6.h \
diff --git a/overlay/gem-interrupts.c b/overlay/gem-interrupts.c
index 0150a1d03825..7ba54fcd487d 100644
--- a/overlay/gem-interrupts.c
+++ b/overlay/gem-interrupts.c
@@ -31,9 +31,10 @@
 #include <string.h>
 #include <ctype.h>
 
+#include "igt_perf.h"
+
 #include "gem-interrupts.h"
 #include "debugfs.h"
-#include "perf.h"
 
 static int perf_open(void)
 {
diff --git a/overlay/gpu-freq.c b/overlay/gpu-freq.c
index 321c93882238..7f29b1aa986e 100644
--- a/overlay/gpu-freq.c
+++ b/overlay/gpu-freq.c
@@ -28,9 +28,10 @@
 #include <string.h>
 #include <stdio.h>
 
+#include "igt_perf.h"
+
 #include "gpu-freq.h"
 #include "debugfs.h"
-#include "perf.h"
 
 static int perf_i915_open(int config, int group)
 {
diff --git a/overlay/gpu-perf.c b/overlay/gpu-perf.c
index f557b9f06a17..3d4a9be91a94 100644
--- a/overlay/gpu-perf.c
+++ b/overlay/gpu-perf.c
@@ -34,7 +34,8 @@
 #include <fcntl.h>
 #include <errno.h>
 
-#include "perf.h"
+#include "igt_perf.h"
+
 #include "gpu-perf.h"
 #include "debugfs.h"
 
diff --git a/overlay/gpu-top.c b/overlay/gpu-top.c
index 891a7ea7c0b1..06f489dfdc83 100644
--- a/overlay/gpu-top.c
+++ b/overlay/gpu-top.c
@@ -31,7 +31,8 @@
 #include <errno.h>
 #include <assert.h>
 
-#include "perf.h"
+#include "igt_perf.h"
+
 #include "igfx.h"
 #include "gpu-top.h"
 
diff --git a/overlay/power.c b/overlay/power.c
index 2f1521b82cd6..84d860cae40c 100644
--- a/overlay/power.c
+++ b/overlay/power.c
@@ -31,7 +31,8 @@
 #include <time.h>
 #include <errno.h>
 
-#include "perf.h"
+#include "igt_perf.h"
+
 #include "power.h"
 #include "debugfs.h"
 
diff --git a/overlay/rc6.c b/overlay/rc6.c
index d7047c2f4880..3175bb22308f 100644
--- a/overlay/rc6.c
+++ b/overlay/rc6.c
@@ -31,8 +31,9 @@
 #include <time.h>
 #include <errno.h>
 
+#include "igt_perf.h"
+
 #include "rc6.h"
-#include "perf.h"
 
 static int perf_i915_open(int config, int group)
 {
-- 
2.9.5

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH i-g-t 2/7] intel-gpu-overlay: Consolidate perf PMU access to library
  2017-09-25 15:14 [PATCH i-g-t v2 0/7] IGT PMU support Tvrtko Ursulin
  2017-09-25 15:14 ` [PATCH i-g-t 1/7] intel-gpu-overlay: Move local perf implementation to a library Tvrtko Ursulin
@ 2017-09-25 15:14 ` Tvrtko Ursulin
  2017-09-25 15:25   ` Chris Wilson
  2017-09-25 15:14 ` [PATCH i-g-t 3/7] intel-gpu-overlay: Fix interrupts PMU readout Tvrtko Ursulin
                   ` (6 subsequent siblings)
  8 siblings, 1 reply; 25+ messages in thread
From: Tvrtko Ursulin @ 2017-09-25 15:14 UTC (permalink / raw)
  To: Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 lib/igt_perf.c           | 33 +++++++++++++++++++++++++++++++++
 lib/igt_perf.h           |  2 ++
 overlay/gem-interrupts.c | 16 +---------------
 overlay/gpu-freq.c       | 22 ++--------------------
 overlay/gpu-top.c        | 32 ++++++++------------------------
 overlay/power.c          | 17 +----------------
 overlay/rc6.c            | 24 +++---------------------
 7 files changed, 50 insertions(+), 96 deletions(-)

diff --git a/lib/igt_perf.c b/lib/igt_perf.c
index 45cccff0ae53..0fa5ae3acb66 100644
--- a/lib/igt_perf.c
+++ b/lib/igt_perf.c
@@ -2,6 +2,8 @@
 #include <fcntl.h>
 #include <unistd.h>
 #include <stdlib.h>
+#include <string.h>
+#include <errno.h>
 
 #include "igt_perf.h"
 
@@ -24,3 +26,34 @@ uint64_t i915_type_id(void)
 	return strtoull(buf, 0, 0);
 }
 
+static int _perf_open(int config, int group, int format)
+{
+	struct perf_event_attr attr;
+
+	memset(&attr, 0, sizeof (attr));
+
+	attr.type = i915_type_id();
+	if (attr.type == 0)
+		return -ENOENT;
+
+	attr.config = config;
+
+	if (group >= 0)
+		format &= ~PERF_FORMAT_GROUP;
+
+	attr.read_format = format;
+
+	return perf_event_open(&attr, -1, 0, group, 0);
+
+}
+
+int perf_i915_open(int config)
+{
+	return _perf_open(config, -1, PERF_FORMAT_TOTAL_TIME_ENABLED);
+}
+
+int perf_i915_open_group(int config, int group)
+{
+	return _perf_open(config, group,
+			  PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_GROUP);
+}
diff --git a/lib/igt_perf.h b/lib/igt_perf.h
index a80b311cd1d1..8e674c3a3755 100644
--- a/lib/igt_perf.h
+++ b/lib/igt_perf.h
@@ -62,5 +62,7 @@ perf_event_open(struct perf_event_attr *attr,
 }
 
 uint64_t i915_type_id(void);
+int perf_i915_open(int config);
+int perf_i915_open_group(int config, int group);
 
 #endif /* I915_PERF_H */
diff --git a/overlay/gem-interrupts.c b/overlay/gem-interrupts.c
index 7ba54fcd487d..a84aef0398a7 100644
--- a/overlay/gem-interrupts.c
+++ b/overlay/gem-interrupts.c
@@ -36,20 +36,6 @@
 #include "gem-interrupts.h"
 #include "debugfs.h"
 
-static int perf_open(void)
-{
-	struct perf_event_attr attr;
-
-	memset(&attr, 0, sizeof (attr));
-
-	attr.type = i915_type_id();
-	if (attr.type == 0)
-		return -ENOENT;
-	attr.config = I915_PERF_INTERRUPTS;
-
-	return perf_event_open(&attr, -1, 0, -1, 0);
-}
-
 static long long debugfs_read(void)
 {
 	char buf[8192], *b;
@@ -127,7 +113,7 @@ int gem_interrupts_init(struct gem_interrupts *irqs)
 {
 	memset(irqs, 0, sizeof(*irqs));
 
-	irqs->fd = perf_open();
+	irqs->fd = perf_i915_open(I915_PERF_INTERRUPTS);
 	if (irqs->fd < 0 && interrupts_read() < 0)
 		irqs->error = ENODEV;
 
diff --git a/overlay/gpu-freq.c b/overlay/gpu-freq.c
index 7f29b1aa986e..76c5ed9acfd1 100644
--- a/overlay/gpu-freq.c
+++ b/overlay/gpu-freq.c
@@ -33,30 +33,12 @@
 #include "gpu-freq.h"
 #include "debugfs.h"
 
-static int perf_i915_open(int config, int group)
-{
-	struct perf_event_attr attr;
-
-	memset(&attr, 0, sizeof (attr));
-
-	attr.type = i915_type_id();
-	if (attr.type == 0)
-		return -ENOENT;
-	attr.config = config;
-
-	attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED;
-	if (group == -1)
-		attr.read_format |= PERF_FORMAT_GROUP;
-
-	return perf_event_open(&attr, -1, 0, group, 0);
-}
-
 static int perf_open(void)
 {
 	int fd;
 
-	fd = perf_i915_open(I915_PERF_ACTUAL_FREQUENCY, -1);
-	if (perf_i915_open(I915_PERF_REQUESTED_FREQUENCY, fd) < 0) {
+	fd = perf_i915_open_group(I915_PERF_ACTUAL_FREQUENCY, -1);
+	if (perf_i915_open_group(I915_PERF_REQUESTED_FREQUENCY, fd) < 0) {
 		close(fd);
 		fd = -1;
 	}
diff --git a/overlay/gpu-top.c b/overlay/gpu-top.c
index 06f489dfdc83..812f47d5aced 100644
--- a/overlay/gpu-top.c
+++ b/overlay/gpu-top.c
@@ -48,24 +48,6 @@
 #define I915_PERF_RING_WAIT(n) (__I915_PERF_RING(n) + 1)
 #define I915_PERF_RING_SEMA(n) (__I915_PERF_RING(n) + 2)
 
-static int perf_i915_open(int config, int group)
-{
-	struct perf_event_attr attr;
-
-	memset(&attr, 0, sizeof (attr));
-
-	attr.type = i915_type_id();
-	if (attr.type == 0)
-		return -ENOENT;
-	attr.config = config;
-
-	attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED;
-	if (group == -1)
-		attr.read_format |= PERF_FORMAT_GROUP;
-
-	return perf_event_open(&attr, -1, 0, group, 0);
-}
-
 static int perf_init(struct gpu_top *gt)
 {
 	const char *names[] = {
@@ -77,27 +59,29 @@ static int perf_init(struct gpu_top *gt)
 	};
 	int n;
 
-	gt->fd = perf_i915_open(I915_PERF_RING_BUSY(0), -1);
+	gt->fd = perf_i915_open_group(I915_PERF_RING_BUSY(0), -1);
 	if (gt->fd < 0)
 		return -1;
 
-	if (perf_i915_open(I915_PERF_RING_WAIT(0), gt->fd) >= 0)
+	if (perf_i915_open_group(I915_PERF_RING_WAIT(0), gt->fd) >= 0)
 		gt->have_wait = 1;
 
-	if (perf_i915_open(I915_PERF_RING_SEMA(0), gt->fd) >= 0)
+	if (perf_i915_open_group(I915_PERF_RING_SEMA(0), gt->fd) >= 0)
 		gt->have_sema = 1;
 
 	gt->ring[0].name = names[0];
 	gt->num_rings = 1;
 
 	for (n = 1; names[n]; n++) {
-		if (perf_i915_open(I915_PERF_RING_BUSY(n), gt->fd) >= 0) {
+		if (perf_i915_open_group(I915_PERF_RING_BUSY(n), gt->fd) >= 0) {
 			if (gt->have_wait &&
-			    perf_i915_open(I915_PERF_RING_WAIT(n), gt->fd) < 0)
+			    perf_i915_open_group(I915_PERF_RING_WAIT(n),
+						 gt->fd) < 0)
 				return -1;
 
 			if (gt->have_sema &&
-			    perf_i915_open(I915_PERF_RING_SEMA(n), gt->fd) < 0)
+			    perf_i915_open_group(I915_PERF_RING_SEMA(n),
+						 gt->fd) < 0)
 				return -1;
 
 			gt->ring[gt->num_rings++].name = names[n];
diff --git a/overlay/power.c b/overlay/power.c
index 84d860cae40c..dd4aec6bffd9 100644
--- a/overlay/power.c
+++ b/overlay/power.c
@@ -38,21 +38,6 @@
 
 /* XXX Is this exposed through RAPL? */
 
-static int perf_open(void)
-{
-	struct perf_event_attr attr;
-
-	memset(&attr, 0, sizeof (attr));
-
-	attr.type = i915_type_id();
-	if (attr.type == 0)
-		return -1;
-	attr.config = I915_PERF_ENERGY;
-
-	attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED;
-	return perf_event_open(&attr, -1, 0, -1, 0);
-}
-
 int power_init(struct power *power)
 {
 	char buf[4096];
@@ -60,7 +45,7 @@ int power_init(struct power *power)
 
 	memset(power, 0, sizeof(*power));
 
-	power->fd = perf_open();
+	power->fd = perf_i915_open(I915_PERF_ENERGY);
 	if (power->fd != -1)
 		return 0;
 
diff --git a/overlay/rc6.c b/overlay/rc6.c
index 3175bb22308f..46c975a557ff 100644
--- a/overlay/rc6.c
+++ b/overlay/rc6.c
@@ -35,24 +35,6 @@
 
 #include "rc6.h"
 
-static int perf_i915_open(int config, int group)
-{
-	struct perf_event_attr attr;
-
-	memset(&attr, 0, sizeof (attr));
-
-	attr.type = i915_type_id();
-	if (attr.type == 0)
-		return -ENOENT;
-	attr.config = config;
-
-	attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED;
-	if (group == -1)
-		attr.read_format |= PERF_FORMAT_GROUP;
-
-	return perf_event_open(&attr, -1, 0, group, 0);
-}
-
 #define RC6	(1<<0)
 #define RC6p	(1<<1)
 #define RC6pp	(1<<2)
@@ -61,15 +43,15 @@ static int perf_open(unsigned *flags)
 {
 	int fd;
 
-	fd = perf_i915_open(I915_PERF_RC6_RESIDENCY, -1);
+	fd = perf_i915_open_group(I915_PERF_RC6_RESIDENCY, -1);
 	if (fd < 0)
 		return -1;
 
 	*flags |= RC6;
-	if (perf_i915_open(I915_PERF_RC6p_RESIDENCY, fd) >= 0)
+	if (perf_i915_open_group(I915_PERF_RC6p_RESIDENCY, fd) >= 0)
 		*flags |= RC6p;
 
-	if (perf_i915_open(I915_PERF_RC6pp_RESIDENCY, fd) >= 0)
+	if (perf_i915_open_group(I915_PERF_RC6pp_RESIDENCY, fd) >= 0)
 		*flags |= RC6pp;
 
 	return fd;
-- 
2.9.5

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH i-g-t 3/7] intel-gpu-overlay: Fix interrupts PMU readout
  2017-09-25 15:14 [PATCH i-g-t v2 0/7] IGT PMU support Tvrtko Ursulin
  2017-09-25 15:14 ` [PATCH i-g-t 1/7] intel-gpu-overlay: Move local perf implementation to a library Tvrtko Ursulin
  2017-09-25 15:14 ` [PATCH i-g-t 2/7] intel-gpu-overlay: Consolidate perf PMU access to library Tvrtko Ursulin
@ 2017-09-25 15:14 ` Tvrtko Ursulin
  2017-09-25 15:14 ` [PATCH i-g-t 4/7] intel-gpu-overlay: Catch-up to new i915 PMU Tvrtko Ursulin
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 25+ messages in thread
From: Tvrtko Ursulin @ 2017-09-25 15:14 UTC (permalink / raw)
  To: Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 overlay/gem-interrupts.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/overlay/gem-interrupts.c b/overlay/gem-interrupts.c
index a84aef0398a7..3eda24f4d7eb 100644
--- a/overlay/gem-interrupts.c
+++ b/overlay/gem-interrupts.c
@@ -136,8 +136,12 @@ int gem_interrupts_update(struct gem_interrupts *irqs)
 		else
 			val = ret;
 	} else {
-		if (read(irqs->fd, &val, sizeof(val)) < 0)
+		uint64_t data[2];
+
+		if (read(irqs->fd, &data, sizeof(data)) < 0)
 			return irqs->error = errno;
+
+		val = data[0];
 	}
 
 	update = irqs->last_count == 0;
-- 
2.9.5

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH i-g-t 4/7] intel-gpu-overlay: Catch-up to new i915 PMU
  2017-09-25 15:14 [PATCH i-g-t v2 0/7] IGT PMU support Tvrtko Ursulin
                   ` (2 preceding siblings ...)
  2017-09-25 15:14 ` [PATCH i-g-t 3/7] intel-gpu-overlay: Fix interrupts PMU readout Tvrtko Ursulin
@ 2017-09-25 15:14 ` Tvrtko Ursulin
  2017-09-25 15:31   ` Chris Wilson
  2017-09-25 15:15 ` [PATCH i-g-t 5/7] tests/perf_pmu: Tests for i915 PMU API Tvrtko Ursulin
                   ` (4 subsequent siblings)
  8 siblings, 1 reply; 25+ messages in thread
From: Tvrtko Ursulin @ 2017-09-25 15:14 UTC (permalink / raw)
  To: Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 lib/igt_perf.h           | 93 ++++++++++++++++++++++++++++++++++--------------
 overlay/gem-interrupts.c |  2 +-
 overlay/gpu-freq.c       |  4 +--
 overlay/gpu-top.c        | 68 +++++++++++++++++++----------------
 overlay/power.c          |  4 +--
 overlay/rc6.c            |  6 ++--
 6 files changed, 111 insertions(+), 66 deletions(-)

diff --git a/lib/igt_perf.h b/lib/igt_perf.h
index 8e674c3a3755..e29216f0500a 100644
--- a/lib/igt_perf.h
+++ b/lib/igt_perf.h
@@ -1,3 +1,27 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
 #ifndef I915_PERF_H
 #define I915_PERF_H
 
@@ -5,41 +29,56 @@
 
 #include <linux/perf_event.h>
 
-#define I915_SAMPLE_BUSY	0
-#define I915_SAMPLE_WAIT	1
-#define I915_SAMPLE_SEMA	2
+enum drm_i915_gem_engine_class {
+	I915_ENGINE_CLASS_OTHER = 0,
+	I915_ENGINE_CLASS_RENDER = 1,
+	I915_ENGINE_CLASS_COPY = 2,
+	I915_ENGINE_CLASS_VIDEO = 3,
+	I915_ENGINE_CLASS_VIDEO_ENHANCE = 4,
+	I915_ENGINE_CLASS_MAX /* non-ABI */
+};
+
+enum drm_i915_pmu_engine_sample {
+	I915_SAMPLE_QUEUED = 0,
+	I915_SAMPLE_BUSY = 1,
+	I915_SAMPLE_WAIT = 2,
+	I915_SAMPLE_SEMA = 3,
+	I915_ENGINE_SAMPLE_MAX /* non-ABI */
+};
+
+#define I915_PMU_SAMPLE_BITS (4)
+#define I915_PMU_SAMPLE_MASK (0xf)
+#define I915_PMU_SAMPLE_INSTANCE_BITS (8)
+#define I915_PMU_CLASS_SHIFT \
+	(I915_PMU_SAMPLE_BITS + I915_PMU_SAMPLE_INSTANCE_BITS)
 
-#define I915_SAMPLE_RCS		0
-#define I915_SAMPLE_VCS		1
-#define I915_SAMPLE_BCS		2
-#define I915_SAMPLE_VECS	3
+#define __I915_PMU_ENGINE(class, instance, sample) \
+	((class) << I915_PMU_CLASS_SHIFT | \
+	(instance) << I915_PMU_SAMPLE_BITS | \
+	(sample))
 
-#define __I915_PERF_COUNT(ring, id) ((ring) << 4 | (id))
+#define I915_PMU_ENGINE_QUEUED(class, instance) \
+	__I915_PMU_ENGINE(class, instance, I915_SAMPLE_QUEUED)
 
-#define I915_PERF_COUNT_RCS_BUSY __I915_PERF_COUNT(I915_SAMPLE_RCS, I915_SAMPLE_BUSY)
-#define I915_PERF_COUNT_RCS_WAIT __I915_PERF_COUNT(I915_SAMPLE_RCS, I915_SAMPLE_WAIT)
-#define I915_PERF_COUNT_RCS_SEMA __I915_PERF_COUNT(I915_SAMPLE_RCS, I915_SAMPLE_SEMA)
+#define I915_PMU_ENGINE_BUSY(class, instance) \
+	__I915_PMU_ENGINE(class, instance, I915_SAMPLE_BUSY)
 
-#define I915_PERF_COUNT_VCS_BUSY __I915_PERF_COUNT(I915_SAMPLE_VCS, I915_SAMPLE_BUSY)
-#define I915_PERF_COUNT_VCS_WAIT __I915_PERF_COUNT(I915_SAMPLE_VCS, I915_SAMPLE_WAIT)
-#define I915_PERF_COUNT_VCS_SEMA __I915_PERF_COUNT(I915_SAMPLE_VCS, I915_SAMPLE_SEMA)
+#define I915_PMU_ENGINE_WAIT(class, instance) \
+	__I915_PMU_ENGINE(class, instance, I915_SAMPLE_WAIT)
 
-#define I915_PERF_COUNT_BCS_BUSY __I915_PERF_COUNT(I915_SAMPLE_BCS, I915_SAMPLE_BUSY)
-#define I915_PERF_COUNT_BCS_WAIT __I915_PERF_COUNT(I915_SAMPLE_BCS, I915_SAMPLE_WAIT)
-#define I915_PERF_COUNT_BCS_SEMA __I915_PERF_COUNT(I915_SAMPLE_BCS, I915_SAMPLE_SEMA)
+#define I915_PMU_ENGINE_SEMA(class, instance) \
+	__I915_PMU_ENGINE(class, instance, I915_SAMPLE_SEMA)
 
-#define I915_PERF_COUNT_VECS_BUSY __I915_PERF_COUNT(I915_SAMPLE_VECS, I915_SAMPLE_BUSY)
-#define I915_PERF_COUNT_VECS_WAIT __I915_PERF_COUNT(I915_SAMPLE_VECS, I915_SAMPLE_WAIT)
-#define I915_PERF_COUNT_VECS_SEMA __I915_PERF_COUNT(I915_SAMPLE_VECS, I915_SAMPLE_SEMA)
+#define __I915_PMU_OTHER(x) (__I915_PMU_ENGINE(0xff, 0xff, 0xf) + 1 + (x))
 
-#define I915_PERF_ACTUAL_FREQUENCY 32
-#define I915_PERF_REQUESTED_FREQUENCY 33
-#define I915_PERF_ENERGY 34
-#define I915_PERF_INTERRUPTS 35
+#define I915_PMU_ACTUAL_FREQUENCY	__I915_PMU_OTHER(0)
+#define I915_PMU_REQUESTED_FREQUENCY	__I915_PMU_OTHER(1)
+#define I915_PMU_INTERRUPTS		__I915_PMU_OTHER(2)
+#define I915_PMU_RC6_RESIDENCY		__I915_PMU_OTHER(3)
+#define I915_PMU_RC6p_RESIDENCY		__I915_PMU_OTHER(4)
+#define I915_PMU_RC6pp_RESIDENCY	__I915_PMU_OTHER(5)
 
-#define I915_PERF_RC6_RESIDENCY		40
-#define I915_PERF_RC6p_RESIDENCY	41
-#define I915_PERF_RC6pp_RESIDENCY	42
+#define I915_PMU_LAST I915_PMU_RC6pp_RESIDENCY
 
 static inline int
 perf_event_open(struct perf_event_attr *attr,
diff --git a/overlay/gem-interrupts.c b/overlay/gem-interrupts.c
index 3eda24f4d7eb..add4a9dfd725 100644
--- a/overlay/gem-interrupts.c
+++ b/overlay/gem-interrupts.c
@@ -113,7 +113,7 @@ int gem_interrupts_init(struct gem_interrupts *irqs)
 {
 	memset(irqs, 0, sizeof(*irqs));
 
-	irqs->fd = perf_i915_open(I915_PERF_INTERRUPTS);
+	irqs->fd = perf_i915_open(I915_PMU_INTERRUPTS);
 	if (irqs->fd < 0 && interrupts_read() < 0)
 		irqs->error = ENODEV;
 
diff --git a/overlay/gpu-freq.c b/overlay/gpu-freq.c
index 76c5ed9acfd1..c4619b87242a 100644
--- a/overlay/gpu-freq.c
+++ b/overlay/gpu-freq.c
@@ -37,8 +37,8 @@ static int perf_open(void)
 {
 	int fd;
 
-	fd = perf_i915_open_group(I915_PERF_ACTUAL_FREQUENCY, -1);
-	if (perf_i915_open_group(I915_PERF_REQUESTED_FREQUENCY, fd) < 0) {
+	fd = perf_i915_open_group(I915_PMU_ACTUAL_FREQUENCY, -1);
+	if (perf_i915_open_group(I915_PMU_REQUESTED_FREQUENCY, fd) < 0) {
 		close(fd);
 		fd = -1;
 	}
diff --git a/overlay/gpu-top.c b/overlay/gpu-top.c
index 812f47d5aced..61b8f62fd78c 100644
--- a/overlay/gpu-top.c
+++ b/overlay/gpu-top.c
@@ -43,49 +43,57 @@
 #define   RING_WAIT		(1<<11)
 #define   RING_WAIT_SEMAPHORE	(1<<10)
 
-#define __I915_PERF_RING(n) (4*n)
-#define I915_PERF_RING_BUSY(n) (__I915_PERF_RING(n) + 0)
-#define I915_PERF_RING_WAIT(n) (__I915_PERF_RING(n) + 1)
-#define I915_PERF_RING_SEMA(n) (__I915_PERF_RING(n) + 2)
-
 static int perf_init(struct gpu_top *gt)
 {
-	const char *names[] = {
-		"RCS",
-		"BCS",
-		"VCS0",
-		"VCS1",
-		NULL,
+	struct engine_desc {
+		unsigned class, inst;
+		const char *name;
+	} *d, engines[] = {
+		{ I915_ENGINE_CLASS_RENDER, 0, "rcs0" },
+		{ I915_ENGINE_CLASS_COPY, 0, "bcs0" },
+		{ I915_ENGINE_CLASS_VIDEO, 0, "vcs0" },
+		{ I915_ENGINE_CLASS_VIDEO, 1, "vcs1" },
+		{ I915_ENGINE_CLASS_VIDEO_ENHANCE, 0, "vecs0" },
+		{ 0, 0, NULL }
 	};
-	int n;
 
-	gt->fd = perf_i915_open_group(I915_PERF_RING_BUSY(0), -1);
+	d = &engines[0];
+
+	gt->fd = perf_i915_open_group(I915_PMU_ENGINE_BUSY(d->class, d->inst),
+				      -1);
 	if (gt->fd < 0)
 		return -1;
 
-	if (perf_i915_open_group(I915_PERF_RING_WAIT(0), gt->fd) >= 0)
+	if (perf_i915_open_group(I915_PMU_ENGINE_WAIT(d->class, d->inst),
+				 gt->fd) >= 0)
 		gt->have_wait = 1;
 
-	if (perf_i915_open_group(I915_PERF_RING_SEMA(0), gt->fd) >= 0)
+	if (perf_i915_open_group(I915_PMU_ENGINE_SEMA(d->class, d->inst),
+				 gt->fd) >= 0)
 		gt->have_sema = 1;
 
-	gt->ring[0].name = names[0];
+	gt->ring[0].name = d->name;
 	gt->num_rings = 1;
 
-	for (n = 1; names[n]; n++) {
-		if (perf_i915_open_group(I915_PERF_RING_BUSY(n), gt->fd) >= 0) {
-			if (gt->have_wait &&
-			    perf_i915_open_group(I915_PERF_RING_WAIT(n),
-						 gt->fd) < 0)
-				return -1;
-
-			if (gt->have_sema &&
-			    perf_i915_open_group(I915_PERF_RING_SEMA(n),
-						 gt->fd) < 0)
-				return -1;
-
-			gt->ring[gt->num_rings++].name = names[n];
-		}
+	for (d++; d->name; d++) {
+		if (perf_i915_open_group(I915_PMU_ENGINE_BUSY(d->class,
+							      d->inst),
+					gt->fd) < 0)
+			continue;
+
+		if (gt->have_wait &&
+		    perf_i915_open_group(I915_PMU_ENGINE_WAIT(d->class,
+							      d->inst),
+					 gt->fd) < 0)
+			return -1;
+
+		if (gt->have_sema &&
+		    perf_i915_open_group(I915_PMU_ENGINE_SEMA(d->class,
+							      d->inst),
+				   gt->fd) < 0)
+			return -1;
+
+		gt->ring[gt->num_rings++].name = d->name;
 	}
 
 	return 0;
diff --git a/overlay/power.c b/overlay/power.c
index dd4aec6bffd9..805f4ca7805c 100644
--- a/overlay/power.c
+++ b/overlay/power.c
@@ -45,9 +45,7 @@ int power_init(struct power *power)
 
 	memset(power, 0, sizeof(*power));
 
-	power->fd = perf_i915_open(I915_PERF_ENERGY);
-	if (power->fd != -1)
-		return 0;
+	power->fd = -1;
 
 	sprintf(buf, "%s/i915_energy_uJ", debugfs_dri_path);
 	fd = open(buf, 0);
diff --git a/overlay/rc6.c b/overlay/rc6.c
index 46c975a557ff..29aa29dbaf72 100644
--- a/overlay/rc6.c
+++ b/overlay/rc6.c
@@ -43,15 +43,15 @@ static int perf_open(unsigned *flags)
 {
 	int fd;
 
-	fd = perf_i915_open_group(I915_PERF_RC6_RESIDENCY, -1);
+	fd = perf_i915_open_group(I915_PMU_RC6_RESIDENCY, -1);
 	if (fd < 0)
 		return -1;
 
 	*flags |= RC6;
-	if (perf_i915_open_group(I915_PERF_RC6p_RESIDENCY, fd) >= 0)
+	if (perf_i915_open_group(I915_PMU_RC6p_RESIDENCY, fd) >= 0)
 		*flags |= RC6p;
 
-	if (perf_i915_open_group(I915_PERF_RC6pp_RESIDENCY, fd) >= 0)
+	if (perf_i915_open_group(I915_PMU_RC6pp_RESIDENCY, fd) >= 0)
 		*flags |= RC6pp;
 
 	return fd;
-- 
2.9.5

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH i-g-t 5/7] tests/perf_pmu: Tests for i915 PMU API
  2017-09-25 15:14 [PATCH i-g-t v2 0/7] IGT PMU support Tvrtko Ursulin
                   ` (3 preceding siblings ...)
  2017-09-25 15:14 ` [PATCH i-g-t 4/7] intel-gpu-overlay: Catch-up to new i915 PMU Tvrtko Ursulin
@ 2017-09-25 15:15 ` Tvrtko Ursulin
  2017-09-25 16:21   ` Chris Wilson
  2017-09-25 15:15 ` [PATCH i-g-t 6/7] gem_wsim: Busy stats balancers Tvrtko Ursulin
                   ` (3 subsequent siblings)
  8 siblings, 1 reply; 25+ messages in thread
From: Tvrtko Ursulin @ 2017-09-25 15:15 UTC (permalink / raw)
  To: Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

A bunch of tests for the new i915 PMU feature.

Parts of the code were initially sketched by Dmitry Rogozhkin.

v2: (Most suggestions by Chris Wilson)
 * Add new class/instance based engine list.
 * Add gem_has_engine/gem_require_engine to work with class/instance.
 * Use the above two throughout the test.
 * Shorten tests to 100ms busy batches, seems enough.
 * Add queued counter sanity checks.
 * Use igt_nsec_elapsed.
 * Skip on perf -ENODEV in some tests instead of embedding knowledge locally.
 * Fix multi ordering for busy accounting.
 * Use new guranteed_usleep when sleep time is asserted on.
 * Check for no queued when idle/busy.
 * Add queued counter init test.
 * Add queued tests.
 * Consolidate and increase multiple busy engines tests to most-busy and
   all-busy tests.
 * Guarantee interrupts by using fences.
 * Test RC6 via forcewake.

v3:
 * Tweak assert in interrupts subtest.
 * Sprinkle of comments.
 * Fix multi-client test which got broken in v2.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
---
 lib/igt_gt.c           |  50 +++
 lib/igt_gt.h           |  38 +++
 lib/igt_perf.h         |   9 +-
 tests/Makefile.sources |   1 +
 tests/perf_pmu.c       | 869 +++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 959 insertions(+), 8 deletions(-)
 create mode 100644 tests/perf_pmu.c

diff --git a/lib/igt_gt.c b/lib/igt_gt.c
index b3f3b3809eee..4c75811fb1b3 100644
--- a/lib/igt_gt.c
+++ b/lib/igt_gt.c
@@ -568,3 +568,53 @@ bool gem_can_store_dword(int fd, unsigned int engine)
 
 	return true;
 }
+
+const struct intel_execution_engine2 intel_execution_engines2[] = {
+	{ "rcs0", I915_ENGINE_CLASS_RENDER, 0 },
+	{ "bcs0", I915_ENGINE_CLASS_COPY, 0 },
+	{ "vcs0", I915_ENGINE_CLASS_VIDEO, 0 },
+	{ "vcs1", I915_ENGINE_CLASS_VIDEO, 1 },
+	{ "vecs0", I915_ENGINE_CLASS_VIDEO_ENHANCE, 0 },
+};
+
+unsigned int
+gem_class_instance_to_eb_flags(int gem_fd,
+			       enum drm_i915_gem_engine_class class,
+			       unsigned int instance)
+{
+	if (class != I915_ENGINE_CLASS_VIDEO)
+		igt_assert(instance == 0);
+	else
+		igt_assert(instance >= 0 && instance <= 1);
+
+	switch (class) {
+	case I915_ENGINE_CLASS_RENDER:
+		return I915_EXEC_RENDER;
+	case I915_ENGINE_CLASS_COPY:
+		return I915_EXEC_BLT;
+	case I915_ENGINE_CLASS_VIDEO:
+		if (instance == 0) {
+			if (gem_has_bsd2(gem_fd))
+				return I915_EXEC_BSD | I915_EXEC_BSD_RING1;
+			else
+				return I915_EXEC_BSD;
+
+		} else {
+			return I915_EXEC_BSD | I915_EXEC_BSD_RING2;
+		}
+	case I915_ENGINE_CLASS_VIDEO_ENHANCE:
+		return I915_EXEC_VEBOX;
+	case I915_ENGINE_CLASS_OTHER:
+	default:
+		igt_assert(0);
+	};
+}
+
+bool gem_has_engine(int gem_fd,
+		    enum drm_i915_gem_engine_class class,
+		    unsigned int instance)
+{
+	return gem_has_ring(gem_fd,
+			    gem_class_instance_to_eb_flags(gem_fd, class,
+							   instance));
+}
diff --git a/lib/igt_gt.h b/lib/igt_gt.h
index 2579cbd37be7..fb67ae1a7d1f 100644
--- a/lib/igt_gt.h
+++ b/lib/igt_gt.h
@@ -25,6 +25,7 @@
 #define IGT_GT_H
 
 #include "igt_debugfs.h"
+#include "igt_core.h"
 
 void igt_require_hang_ring(int fd, int ring);
 
@@ -80,4 +81,41 @@ extern const struct intel_execution_engine {
 
 bool gem_can_store_dword(int fd, unsigned int engine);
 
+extern const struct intel_execution_engine2 {
+	const char *name;
+	int class;
+	int instance;
+} intel_execution_engines2[];
+
+#define for_each_engine_class_instance(fd__, e__) \
+	for ((e__) = intel_execution_engines2;\
+	     (e__)->name; \
+	     (e__)++)
+
+enum drm_i915_gem_engine_class {
+	I915_ENGINE_CLASS_OTHER = 0,
+	I915_ENGINE_CLASS_RENDER = 1,
+	I915_ENGINE_CLASS_COPY = 2,
+	I915_ENGINE_CLASS_VIDEO = 3,
+	I915_ENGINE_CLASS_VIDEO_ENHANCE = 4,
+	I915_ENGINE_CLASS_MAX /* non-ABI */
+};
+
+unsigned int
+gem_class_instance_to_eb_flags(int gem_fd,
+			       enum drm_i915_gem_engine_class class,
+			       unsigned int instance);
+
+bool gem_has_engine(int gem_fd,
+		    enum drm_i915_gem_engine_class class,
+		    unsigned int instance);
+
+static inline
+void gem_require_engine(int gem_fd,
+			enum drm_i915_gem_engine_class class,
+			unsigned int instance)
+{
+	igt_require(gem_has_engine(gem_fd, class, instance));
+}
+
 #endif /* IGT_GT_H */
diff --git a/lib/igt_perf.h b/lib/igt_perf.h
index e29216f0500a..d64e0bd7a06a 100644
--- a/lib/igt_perf.h
+++ b/lib/igt_perf.h
@@ -29,14 +29,7 @@
 
 #include <linux/perf_event.h>
 
-enum drm_i915_gem_engine_class {
-	I915_ENGINE_CLASS_OTHER = 0,
-	I915_ENGINE_CLASS_RENDER = 1,
-	I915_ENGINE_CLASS_COPY = 2,
-	I915_ENGINE_CLASS_VIDEO = 3,
-	I915_ENGINE_CLASS_VIDEO_ENHANCE = 4,
-	I915_ENGINE_CLASS_MAX /* non-ABI */
-};
+#include "igt_gt.h"
 
 enum drm_i915_pmu_engine_sample {
 	I915_SAMPLE_QUEUED = 0,
diff --git a/tests/Makefile.sources b/tests/Makefile.sources
index 0adc28a014d2..7d1fdf16892d 100644
--- a/tests/Makefile.sources
+++ b/tests/Makefile.sources
@@ -217,6 +217,7 @@ TESTS_progs = \
 	kms_vblank \
 	meta_test \
 	perf \
+	perf_pmu \
 	pm_backlight \
 	pm_lpsp \
 	pm_rc6_residency \
diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
new file mode 100644
index 000000000000..79b444977128
--- /dev/null
+++ b/tests/perf_pmu.c
@@ -0,0 +1,869 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/times.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <time.h>
+#include <poll.h>
+
+#include "igt.h"
+#include "igt_core.h"
+#include "igt_perf.h"
+
+IGT_TEST_DESCRIPTION("Test the i915 pmu perf interface");
+
+const double tolerance = 0.02f;
+const unsigned long batch_duration_ns = 100 * 1000 * 1000;
+
+static int open_pmu(uint64_t config)
+{
+	int fd;
+
+	fd = perf_i915_open(config);
+	igt_require(fd >= 0 || (fd < 0 && errno != ENODEV));
+	igt_assert(fd >= 0);
+
+	return fd;
+}
+
+static int open_group(uint64_t config, int group)
+{
+	int fd;
+
+	fd = perf_i915_open_group(config, group);
+	igt_require(fd >= 0 || (fd < 0 && errno != ENODEV));
+	igt_assert(fd >= 0);
+
+	return fd;
+}
+
+static void
+init(int gem_fd, const struct intel_execution_engine2 *e, uint8_t sample)
+{
+	int fd;
+
+	fd = open_pmu(__I915_PMU_ENGINE(e->class, e->instance, sample));
+
+	close(fd);
+}
+
+static uint64_t pmu_read_single(int fd)
+{
+	uint64_t data[2];
+	ssize_t len;
+
+	len = read(fd, data, sizeof(data));
+	igt_assert_eq(len, sizeof(data));
+
+	return data[0];
+}
+
+static void pmu_read_multi(int fd, unsigned int num, uint64_t *val)
+{
+	uint64_t buf[2 + num];
+	unsigned int i;
+	ssize_t len;
+
+	len = read(fd, buf, sizeof(buf));
+	igt_assert_eq(len, sizeof(buf));
+	for (i = 0; i < num; i++)
+		val[i] = buf[2 + i];
+}
+
+#define assert_within_epsilon(x, ref, tolerance) \
+	igt_assert_f((double)(x) <= (1.0 + tolerance) * (double)ref && \
+		     (double)(x) >= (1.0 - tolerance) * (double)ref, \
+		     "'%s' != '%s' (%f not within %f%% tolerance of %f)\n",\
+		     #x, #ref, (double)x, tolerance * 100.0, (double)ref)
+
+/*
+ * Helper for cases where we assert on time spent sleeping (directly or
+ * indirectly), so make it more robust by ensuring the system sleep time
+ * is within test tolerance to start with.
+ */
+static void guaranteed_usleep(unsigned int usec)
+{
+	uint64_t slept = 0, to_sleep = usec;
+
+	while (usec > 0) {
+		struct timespec start = { };
+		uint64_t this_sleep;
+
+		igt_nsec_elapsed(&start);
+		usleep(usec);
+		this_sleep = igt_nsec_elapsed(&start) / 1000;
+		slept += this_sleep;
+		if (this_sleep > usec)
+			break;
+		usec -= this_sleep;
+	}
+
+	assert_within_epsilon(slept, to_sleep, tolerance);
+}
+
+static unsigned int e2ring(int gem_fd, const struct intel_execution_engine2 *e)
+{
+	return gem_class_instance_to_eb_flags(gem_fd, e->class, e->instance);
+}
+
+static void
+single(int gem_fd, const struct intel_execution_engine2 *e, uint8_t sample,
+       bool busy)
+{
+	double ref = busy && sample == I915_SAMPLE_BUSY ?
+		     batch_duration_ns : 0.0f;
+	igt_spin_t *spin;
+	uint64_t val;
+	int fd;
+
+	fd = open_pmu(__I915_PMU_ENGINE(e->class, e->instance, sample));
+
+	if (busy) {
+		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+		igt_spin_batch_set_timeout(spin, batch_duration_ns);
+	} else {
+		guaranteed_usleep(batch_duration_ns / 1000);
+	}
+
+	if (busy)
+		gem_sync(gem_fd, spin->handle);
+
+	val = pmu_read_single(fd);
+
+	assert_within_epsilon(val, ref, tolerance);
+
+	if (busy)
+		igt_spin_batch_free(gem_fd, spin);
+	close(fd);
+}
+
+static void
+queued(int gem_fd, const struct intel_execution_engine2 *e)
+{
+	igt_spin_t *spin[2];
+	uint64_t val;
+	int fd;
+
+	fd = open_pmu(I915_PMU_ENGINE_QUEUED(e->class, e->instance));
+
+	/*
+	 * First spin batch will be immediately executing.
+	 */
+	spin[0] = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	igt_spin_batch_set_timeout(spin[0], batch_duration_ns);
+
+	/*
+	 * Second spin batch will sit in the execution queue behind the
+	 * first one so must cause the PMU to correctly report the queued
+	 * counter.
+	 */
+	spin[1] = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	igt_spin_batch_set_timeout(spin[1], batch_duration_ns);
+
+	gem_sync(gem_fd, spin[0]->handle);
+
+	val = pmu_read_single(fd);
+	assert_within_epsilon(val, batch_duration_ns, tolerance);
+
+	gem_sync(gem_fd, spin[1]->handle);
+
+	igt_spin_batch_free(gem_fd, spin[0]);
+	igt_spin_batch_free(gem_fd, spin[1]);
+	close(fd);
+}
+
+static void
+busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
+	       const unsigned int num_engines)
+{
+	const struct intel_execution_engine2 *e_;
+	uint64_t val[num_engines];
+	int fd[num_engines];
+	igt_spin_t *spin;
+	unsigned int busy_idx, i;
+
+	i = 0;
+	fd[0] = -1;
+	for_each_engine_class_instance(fd, e_) {
+		if (!gem_has_engine(gem_fd, e_->class, e_->instance))
+			continue;
+		else if (e == e_)
+			busy_idx = i;
+
+		fd[i++] = open_group(I915_PMU_ENGINE_BUSY(e_->class,
+							  e_->instance),
+				     fd[0]);
+	}
+
+	spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	igt_spin_batch_set_timeout(spin, batch_duration_ns);
+
+	gem_sync(gem_fd, spin->handle);
+
+	pmu_read_multi(fd[0], num_engines, val);
+
+	assert_within_epsilon(val[busy_idx], batch_duration_ns, tolerance);
+	for (i = 0; i < num_engines; i++) {
+		if (i == busy_idx)
+			continue;
+		assert_within_epsilon(val[i], 0.0f, tolerance);
+	}
+
+	igt_spin_batch_free(gem_fd, spin);
+	close(fd[0]);
+}
+
+static void
+most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
+		    const unsigned int num_engines)
+{
+	const struct intel_execution_engine2 *e_;
+	uint64_t val[num_engines];
+	int fd[num_engines];
+	igt_spin_t *spin[num_engines];
+	unsigned int idle_idx, i;
+
+	gem_require_engine(gem_fd, e->class, e->instance);
+
+	i = 0;
+	fd[0] = -1;
+	for_each_engine_class_instance(fd, e_) {
+		if (!gem_has_engine(gem_fd, e_->class, e_->instance))
+			continue;
+
+		fd[i] = open_group(I915_PMU_ENGINE_BUSY(e_->class,
+							e_->instance),
+				   fd[0]);
+
+		if (e == e_) {
+			idle_idx = i;
+		} else {
+			spin[i] = igt_spin_batch_new(gem_fd, 0,
+						     e2ring(gem_fd, e_), 0);
+			igt_spin_batch_set_timeout(spin[i], batch_duration_ns);
+		}
+
+		i++;
+	}
+
+	for (i = 0; i < num_engines; i++) {
+		if (i != idle_idx)
+			gem_sync(gem_fd, spin[i]->handle);
+	}
+
+	pmu_read_multi(fd[0], num_engines, val);
+
+	for (i = 0; i < num_engines; i++) {
+		if (i == idle_idx)
+			assert_within_epsilon(val[i], 0.0f, tolerance);
+		else
+			assert_within_epsilon(val[i], batch_duration_ns,
+					      tolerance);
+	}
+
+	for (i = 0; i < num_engines; i++) {
+		if (i != idle_idx)
+			igt_spin_batch_free(gem_fd, spin[i]);
+	}
+	close(fd[0]);
+}
+
+static void
+all_busy_check_all(int gem_fd, const unsigned int num_engines)
+{
+	const struct intel_execution_engine2 *e;
+	uint64_t val[num_engines];
+	int fd[num_engines];
+	igt_spin_t *spin[num_engines];
+	unsigned int i;
+
+	i = 0;
+	fd[0] = -1;
+	for_each_engine_class_instance(fd, e) {
+		if (!gem_has_engine(gem_fd, e->class, e->instance))
+			continue;
+
+		fd[i] = open_group(I915_PMU_ENGINE_BUSY(e->class, e->instance),
+				   fd[0]);
+
+		spin[i] = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+		igt_spin_batch_set_timeout(spin[i], batch_duration_ns);
+
+		i++;
+	}
+
+	for (i = 0; i < num_engines; i++)
+		gem_sync(gem_fd, spin[i]->handle);
+
+	pmu_read_multi(fd[0], num_engines, val);
+
+	for (i = 0; i < num_engines; i++)
+		assert_within_epsilon(val[i], batch_duration_ns, tolerance);
+
+	for (i = 0; i < num_engines; i++)
+		igt_spin_batch_free(gem_fd, spin[i]);
+	close(fd[0]);
+}
+
+static void
+no_sema(int gem_fd, const struct intel_execution_engine2 *e, bool busy)
+{
+	igt_spin_t *spin;
+	uint64_t val[2];
+	int fd;
+
+	fd = open_group(I915_PMU_ENGINE_SEMA(e->class, e->instance), -1);
+	open_group(I915_PMU_ENGINE_WAIT(e->class, e->instance), fd);
+
+	if (busy) {
+		spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+		igt_spin_batch_set_timeout(spin, batch_duration_ns);
+	} else {
+		usleep(batch_duration_ns / 1000);
+	}
+
+	pmu_read_multi(fd, 2, val);
+
+	assert_within_epsilon(val[0], 0.0f, tolerance);
+	assert_within_epsilon(val[1], 0.0f, tolerance);
+
+	if (busy)
+		igt_spin_batch_free(gem_fd, spin);
+	close(fd);
+}
+
+static void
+multi_client(int gem_fd, const struct intel_execution_engine2 *e)
+{
+	uint64_t config = I915_PMU_ENGINE_BUSY(e->class, e->instance);
+	igt_spin_t *spin;
+	uint64_t val[2];
+	int fd[2];
+
+	fd[0] = open_pmu(config);
+
+	spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
+	igt_spin_batch_set_timeout(spin, batch_duration_ns);
+
+	guaranteed_usleep(batch_duration_ns / 3000);
+
+	/*
+	 * Second PMU client which is initialized after the first one,
+	 * and exits before it, should not affect accounting as reported
+	 * in the first client.
+	 */
+	fd[1] = open_pmu(config);
+	guaranteed_usleep(batch_duration_ns / 3000);
+	val[1] = pmu_read_single(fd[1]);
+	close(fd[1]);
+
+	gem_sync(gem_fd, spin->handle);
+
+	val[0] = pmu_read_single(fd[0]);
+
+	assert_within_epsilon(val[0], batch_duration_ns, tolerance);
+	assert_within_epsilon(val[1], batch_duration_ns / 3, tolerance);
+
+	igt_spin_batch_free(gem_fd, spin);
+	close(fd[0]);
+}
+
+/**
+ * Tests that i915 PMU correctly errors out on invalid initialization.
+ * i915 PMU is uncore PMU, thus:
+ *  - sampling period is not supported
+ *  - pid > 0 is not supported since we can't count per-process (we count
+ *    per whole system)
+ *  - cpu != 0 is not supported since i915 PMU exposes cpumask for CPU0
+ */
+static void invalid_init(void)
+{
+	struct perf_event_attr attr;
+	int pid, cpu;
+
+#define ATTR_INIT() \
+do { \
+	memset(&attr, 0, sizeof (attr)); \
+	attr.config = I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_RENDER, 0); \
+	attr.type = i915_type_id(); \
+	igt_assert(attr.type != 0); \
+} while(0)
+
+	ATTR_INIT();
+	attr.sample_period = 100;
+	pid = -1;
+	cpu = 0;
+	igt_assert_eq(perf_event_open(&attr, pid, cpu, -1, 0), -1);
+	igt_assert_eq(errno, EINVAL);
+
+	ATTR_INIT();
+	pid = 0;
+	cpu = 0;
+	igt_assert_eq(perf_event_open(&attr, pid, cpu, -1, 0), -1);
+	igt_assert_eq(errno, EINVAL);
+
+	ATTR_INIT();
+	pid = -1;
+	cpu = 1;
+	igt_assert_eq(perf_event_open(&attr, pid, cpu, -1, 0), -1);
+	igt_assert_eq(errno, ENODEV);
+}
+
+static void init_other(unsigned int i, bool valid)
+{
+	int fd;
+
+	fd = perf_i915_open(__I915_PMU_OTHER(i));
+	igt_require(!(fd < 0 && errno == ENODEV));
+	if (valid) {
+		igt_assert(fd >= 0);
+	} else {
+		igt_assert(fd < 0);
+		return;
+	}
+
+	close(fd);
+}
+
+static void read_other(unsigned int i, bool valid)
+{
+	int fd;
+
+	fd = perf_i915_open(__I915_PMU_OTHER(i));
+	igt_require(!(fd < 0 && errno == ENODEV));
+	if (valid) {
+		igt_assert(fd >= 0);
+	} else {
+		igt_assert(fd < 0);
+		return;
+	}
+
+	(void)pmu_read_single(fd);
+
+	close(fd);
+}
+
+static bool cpu0_hotplug_support(void)
+{
+	int fd = open("/sys/devices/system/cpu/cpu0/online", O_WRONLY);
+
+	close(fd);
+
+	return fd > 0;
+}
+
+static void cpu_hotplug(int gem_fd)
+{
+	struct timespec start = { };
+	igt_spin_t *spin;
+	uint64_t val, ref;
+	int fd;
+
+	igt_require(cpu0_hotplug_support());
+
+	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	fd = perf_i915_open(I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_RENDER, 0));
+	igt_assert(fd >= 0);
+
+	igt_nsec_elapsed(&start);
+
+	/*
+	 * Toggle online status of all the CPUs in a child process and ensure
+	 * this has not affected busyness stats in the parent.
+	 */
+	igt_fork(child, 1) {
+		int cpu = 0;
+
+		for (;;) {
+			char name[128];
+			int cpufd;
+
+			sprintf(name, "/sys/devices/system/cpu/cpu%d/online",
+				cpu);
+			cpufd = open(name, O_WRONLY);
+			if (cpufd == -1) {
+				igt_assert(cpu > 0);
+				break;
+			}
+			igt_assert_eq(write(cpufd, "0", 2), 2);
+
+			usleep(1000 * 1000);
+
+			igt_assert_eq(write(cpufd, "1", 2), 2);
+
+			close(cpufd);
+			cpu++;
+		}
+	}
+
+	igt_waitchildren();
+
+	igt_spin_batch_end(spin);
+	gem_sync(gem_fd, spin->handle);
+
+	ref = igt_nsec_elapsed(&start);
+	val = pmu_read_single(fd);
+
+	assert_within_epsilon(val, ref, tolerance);
+
+	igt_spin_batch_free(gem_fd, spin);
+	close(fd);
+}
+
+static int chain_nop(int gem_fd, int in_fence, bool sync)
+{
+	struct drm_i915_gem_exec_object2 obj = {};
+	struct drm_i915_gem_execbuffer2 eb =
+		{ .buffer_count = 1, .buffers_ptr = (uintptr_t)&obj};
+	const uint32_t bbe = 0xa << 23;
+
+	obj.handle = gem_create(gem_fd, sizeof(bbe));
+	gem_write(gem_fd, obj.handle, 0, &bbe, sizeof(bbe));
+
+	eb.flags = I915_EXEC_RENDER | I915_EXEC_FENCE_OUT;
+
+	if (in_fence >= 0) {
+		eb.flags |= I915_EXEC_FENCE_IN;
+		eb.rsvd2 = in_fence;
+	}
+
+	gem_execbuf_wr(gem_fd, &eb);
+
+	if (sync)
+		gem_sync(gem_fd, obj.handle);
+
+	gem_close(gem_fd, obj.handle);
+	if (in_fence >= 0)
+		close(in_fence);
+
+	return eb.rsvd2 >> 32;
+}
+
+static void
+test_interrupts(int gem_fd)
+{
+	uint64_t idle, busy, prev;
+	int fd, fence = -1;
+	const unsigned int count = 1000;
+	unsigned int i;
+
+	fd = open_pmu(I915_PMU_INTERRUPTS);
+
+	gem_quiescent_gpu(gem_fd);
+
+	/* Wait for idle state. */
+	prev = pmu_read_single(fd);
+	idle = prev + 1;
+	while (idle != prev) {
+		usleep(batch_duration_ns / 1000);
+		prev = idle;
+		idle = pmu_read_single(fd);
+	}
+
+	igt_assert_eq(idle - prev, 0);
+
+	/* Send some no-op batches with chained fences to ensure interrupts. */
+	for (i = 1; i <= count; i++)
+		fence = chain_nop(gem_fd, fence, i < count ? false : true);
+	close(fence);
+
+	/* Check at least as many interrupts have been generated. */
+	busy = pmu_read_single(fd);
+	igt_assert(busy > count / 20);
+
+	close(fd);
+}
+
+static void
+test_frequency(int gem_fd)
+{
+	igt_spin_t *spin;
+	uint64_t idle[2], busy[2];
+	int fd;
+
+	fd = open_group(I915_PMU_REQUESTED_FREQUENCY, -1);
+	open_group(I915_PMU_ACTUAL_FREQUENCY, fd);
+
+	gem_quiescent_gpu(gem_fd);
+	usleep(batch_duration_ns / 1000);
+
+	pmu_read_multi(fd, 2, idle);
+
+	/*
+	 * Put some load on the render engine and check that the requested
+	 * and actual frequencies have gone up.
+	 *
+	 * FIXME This might not be guaranteed to work on all platforms.
+	 * How to detect those?
+	 */
+	spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+	igt_spin_batch_set_timeout(spin, batch_duration_ns);
+	gem_sync(gem_fd, spin->handle);
+
+	pmu_read_multi(fd, 2, busy);
+
+	igt_assert(busy[0] > idle[0]);
+	igt_assert(busy[1] > idle[1]);
+
+	igt_spin_batch_free(gem_fd, spin);
+	close(fd);
+}
+
+static void
+test_rc6(int gem_fd)
+{
+	int64_t duration_ns = 2 * 1000 * 1000 * 1000;
+	uint64_t idle, busy, prev;
+	int fd, fw;
+
+	fd = open_pmu(I915_PMU_RC6_RESIDENCY);
+
+	gem_quiescent_gpu(gem_fd);
+
+	/* Go idle and check full RC6. */
+	prev = pmu_read_single(fd);
+	guaranteed_usleep(duration_ns / 1000);
+	idle = pmu_read_single(fd);
+
+	assert_within_epsilon(idle - prev, duration_ns, tolerance);
+
+	/* Wake up device and check no RC6. */
+	fw = igt_open_forcewake_handle(gem_fd);
+	igt_assert(fw >= 0);
+
+	prev = pmu_read_single(fd);
+	guaranteed_usleep(duration_ns / 1000);
+	busy = pmu_read_single(fd);
+
+	assert_within_epsilon(busy - prev, 0.0, tolerance);
+
+	close(fw);
+	close(fd);
+}
+
+static void
+test_rc6p(int gem_fd)
+{
+	int64_t duration_ns = 2 * 1000 * 1000 * 1000;
+	unsigned int num_pmu = 1;
+	uint64_t idle[3], busy[3], prev[3];
+	unsigned int i;
+	int fd, ret, fw;
+
+	fd = open_group(I915_PMU_RC6_RESIDENCY, -1);
+	ret = perf_i915_open_group(I915_PMU_RC6p_RESIDENCY, fd);
+	if (ret > 0) {
+		num_pmu++;
+		ret = perf_i915_open_group(I915_PMU_RC6pp_RESIDENCY, fd);
+		if (ret > 0)
+			num_pmu++;
+	}
+
+	igt_require(num_pmu == 3);
+
+	gem_quiescent_gpu(gem_fd);
+
+	/* Go idle and check full RC6. */
+	pmu_read_multi(fd, num_pmu, prev);
+	guaranteed_usleep(duration_ns / 1000);
+	pmu_read_multi(fd, num_pmu, idle);
+
+	for (i = 0; i < num_pmu; i++)
+		assert_within_epsilon(idle[i] - prev[i], duration_ns,
+				      tolerance);
+
+	/* Wake up device and check no RC6. */
+	fw = igt_open_forcewake_handle(gem_fd);
+	igt_assert(fw >= 0);
+
+	pmu_read_multi(fd, num_pmu, prev);
+	guaranteed_usleep(duration_ns / 1000);
+	pmu_read_multi(fd, num_pmu, busy);
+
+	for (i = 0; i < num_pmu; i++)
+		assert_within_epsilon(busy[i] - prev[i], 0.0, tolerance);
+
+	close(fw);
+	close(fd);
+}
+
+igt_main
+{
+	const unsigned int num_other_metrics =
+				I915_PMU_LAST - __I915_PMU_OTHER(0) + 1;
+	unsigned int num_engines = 0;
+	int fd = -1;
+	const struct intel_execution_engine2 *e;
+	unsigned int i;
+
+	igt_fixture {
+		fd = drm_open_driver_master(DRIVER_INTEL);
+
+		igt_require_gem(fd);
+		igt_require(i915_type_id() > 0);
+
+		for_each_engine_class_instance(fd, e) {
+			if (gem_has_engine(fd, e->class, e->instance))
+				num_engines++;
+		}
+	}
+
+	/**
+	 * Test invalid access via perf API is rejected.
+	 */
+	igt_subtest("invalid-init")
+		invalid_init();
+
+	for_each_engine_class_instance(fd, e) {
+		/**
+		 * Test that a single engine metric can be initialized.
+		 */
+		igt_subtest_f("init-queued-%s", e->name)
+			init(fd, e, I915_SAMPLE_QUEUED);
+
+		igt_subtest_f("init-busy-%s", e->name)
+			init(fd, e, I915_SAMPLE_BUSY);
+
+		igt_subtest_f("init-wait-%s", e->name)
+			init(fd, e, I915_SAMPLE_WAIT);
+
+		igt_subtest_f("init-sema-%s", e->name)
+			init(fd, e, I915_SAMPLE_SEMA);
+
+		/**
+		 * Test that queued metric works.
+		 */
+		igt_subtest_f("queued-%s", e->name)
+			queued(fd, e);
+
+		/**
+		 * Test that engines show nothing queued when idle or busy.
+		 */
+		igt_subtest_f("idle-no-queued-%s", e->name)
+			single(fd, e, I915_SAMPLE_QUEUED, false);
+
+		igt_subtest_f("busy-no-queued-%s", e->name)
+			single(fd, e, I915_SAMPLE_QUEUED, true);
+
+		/**
+		 * Test that engines show no load when idle.
+		 */
+		igt_subtest_f("idle-%s", e->name)
+			single(fd, e, I915_SAMPLE_BUSY, false);
+
+		/**
+		 * Test that a single engine reports load correctly.
+		 */
+		igt_subtest_f("busy-%s", e->name)
+			single(fd, e, I915_SAMPLE_BUSY, true);
+
+		/**
+		 * Test that when one engine is loaded other report no load.
+		 */
+		igt_subtest_f("busy-check-all-%s", e->name)
+			busy_check_all(fd, e, num_engines);
+
+		/**
+		 * Test that when all except one engine are loaded all loads
+		 * are correctly reported.
+		 */
+		igt_subtest_f("most-busy-check-all-%s", e->name)
+			most_busy_check_all(fd, e, num_engines);
+
+		/**
+		 * Test that semaphore counters report no activity on idle
+		 * or busy engines.
+		 */
+		igt_subtest_f("idle-no-semaphores-%s", e->name)
+			no_sema(fd, e, false);
+
+		igt_subtest_f("busy-no-semaphores-%s", e->name)
+			no_sema(fd, e, true);
+
+		/**
+		 * Check that two perf clients do not influence each others
+		 * observations.
+		 */
+		igt_subtest_f("multi-client-%s", e->name)
+			multi_client(fd, e);
+	}
+
+	/**
+	 * Test that when all engines are loaded all loads are
+	 * correctly reported.
+	 */
+	igt_subtest("all-busy-check-all")
+		all_busy_check_all(fd, num_engines);
+
+	/**
+	 * Test that non-engine counters can be initialized and read. Apart
+	 * from the invalid metric which should fail.
+	 */
+	for (i = 0; i < num_other_metrics + 1; i++) {
+		igt_subtest_f("other-init-%u", i)
+			init_other(i, i < num_other_metrics);
+
+		igt_subtest_f("other-read-%u", i)
+			read_other(i, i < num_other_metrics);
+	}
+
+	/**
+	 * Test counters are not affected by CPU offline/online events.
+	 */
+	igt_subtest("cpu-hotplug")
+		cpu_hotplug(fd);
+
+	/**
+	 * Test GPU frequency.
+	 */
+	igt_subtest("frequency")
+		test_frequency(fd);
+
+	/**
+	 * Test interrupt count reporting.
+	 */
+	igt_subtest("interrupts")
+		test_interrupts(fd);
+
+	/**
+	 * Test RC6 residency reporting.
+	 */
+	igt_subtest("rc6")
+		test_rc6(fd);
+
+	/**
+	 * Test RC6p residency reporting.
+	 */
+	igt_subtest("rc6p")
+		test_rc6p(fd);
+}
-- 
2.9.5

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH i-g-t 6/7] gem_wsim: Busy stats balancers
  2017-09-25 15:14 [PATCH i-g-t v2 0/7] IGT PMU support Tvrtko Ursulin
                   ` (4 preceding siblings ...)
  2017-09-25 15:15 ` [PATCH i-g-t 5/7] tests/perf_pmu: Tests for i915 PMU API Tvrtko Ursulin
@ 2017-09-25 15:15 ` Tvrtko Ursulin
  2017-09-25 16:44   ` Chris Wilson
  2017-09-25 15:15 ` [PATCH i-g-t 7/7] media-bench.pl: Add busy balancers to the list Tvrtko Ursulin
                   ` (2 subsequent siblings)
  8 siblings, 1 reply; 25+ messages in thread
From: Tvrtko Ursulin @ 2017-09-25 15:15 UTC (permalink / raw)
  To: Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Add busy and busy-avg balancers which make balancing
decisions by looking at engine busyness via the i915 PMU.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 benchmarks/gem_wsim.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 140 insertions(+)

diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c
index 82fe6ba9ec5f..9ee91fabb220 100644
--- a/benchmarks/gem_wsim.c
+++ b/benchmarks/gem_wsim.c
@@ -50,6 +50,7 @@
 #include "intel_io.h"
 #include "igt_aux.h"
 #include "igt_rand.h"
+#include "igt_perf.h"
 #include "sw_sync.h"
 
 #include "ewma.h"
@@ -188,6 +189,16 @@ struct workload
 			uint32_t last[NUM_ENGINES];
 		} rt;
 	};
+
+	struct busy_balancer {
+		int fd;
+		bool first;
+		unsigned int num_engines;
+		unsigned int engine_map[5];
+		uint64_t t_prev;
+		uint64_t prev[5];
+		double busy[5];
+	} busy_balancer;
 };
 
 static const unsigned int nop_calibration_us = 1000;
@@ -993,6 +1004,8 @@ struct workload_balancer {
 	unsigned int flags;
 	unsigned int min_gen;
 
+	int (*init)(const struct workload_balancer *balancer,
+		    struct workload *wrk);
 	unsigned int (*get_qd)(const struct workload_balancer *balancer,
 			       struct workload *wrk,
 			       enum intel_engine_id engine);
@@ -1242,6 +1255,106 @@ context_balance(const struct workload_balancer *balancer,
 	return get_vcs_engine(wrk->ctx_list[w->context].static_vcs);
 }
 
+static unsigned int
+get_engine_busy(const struct workload_balancer *balancer,
+		struct workload *wrk, enum intel_engine_id engine)
+{
+	struct busy_balancer *bb = &wrk->busy_balancer;
+
+	if (engine == VCS2 && (wrk->flags & VCS2REMAP))
+		engine = BCS;
+
+	return bb->busy[bb->engine_map[engine]];
+}
+
+static void get_stats(const struct workload_balancer *b, struct workload *wrk)
+{
+	struct busy_balancer *bb = &wrk->busy_balancer;
+	uint64_t val[7];
+	unsigned int i;
+
+	igt_assert_eq(read(bb->fd, val, sizeof(val)), sizeof(val));
+
+	if (!bb->first) {
+		for (i = 0; i < bb->num_engines; i++) {
+			double d;
+
+			d = (val[2 + i] - bb->prev[i]) * 100;
+			d /= val[1] - bb->t_prev;
+			bb->busy[i] = d;
+		}
+	}
+
+	for (i = 0; i < bb->num_engines; i++)
+		bb->prev[i] = val[2 + i];
+
+	bb->t_prev = val[1];
+	bb->first = false;
+}
+
+static enum intel_engine_id
+busy_avg_balance(const struct workload_balancer *balancer,
+		 struct workload *wrk, struct w_step *w)
+{
+	get_stats(balancer, wrk);
+
+	return qdavg_balance(balancer, wrk, w);
+}
+
+static enum intel_engine_id
+busy_balance(const struct workload_balancer *balancer,
+	     struct workload *wrk, struct w_step *w)
+{
+	get_stats(balancer, wrk);
+
+	return qd_balance(balancer, wrk, w);
+}
+
+static int
+busy_init(const struct workload_balancer *balancer, struct workload *wrk)
+{
+	struct busy_balancer *bb = &wrk->busy_balancer;
+	struct engine_desc {
+		unsigned class, inst;
+		enum intel_engine_id id;
+	} *d, engines[] = {
+		{ I915_ENGINE_CLASS_RENDER, 0, RCS },
+		{ I915_ENGINE_CLASS_COPY, 0, BCS },
+		{ I915_ENGINE_CLASS_VIDEO, 0, VCS1 },
+		{ I915_ENGINE_CLASS_VIDEO, 1, VCS2 },
+		{ I915_ENGINE_CLASS_VIDEO_ENHANCE, 0, VECS },
+		{ 0, 0, VCS }
+	};
+
+	bb->num_engines = 0;
+	bb->first = true;
+	bb->fd = -1;
+
+	for (d = &engines[0]; d->id != VCS; d++) {
+		int pfd;
+
+		pfd = perf_i915_open_group(I915_PMU_ENGINE_BUSY(d->class,
+							        d->inst),
+					   bb->fd);
+		if (pfd < 0) {
+			if (d->id != VCS2)
+				return -(10 + bb->num_engines);
+			else
+				continue;
+		}
+
+		if (bb->num_engines == 0)
+			bb->fd = pfd;
+
+		bb->engine_map[d->id] = bb->num_engines++;
+	}
+
+	if (bb->num_engines < 5 && !(wrk->flags & VCS2REMAP))
+		return -1;
+
+	return 0;
+}
+
 static const struct workload_balancer all_balancers[] = {
 	{
 		.id = 0,
@@ -1315,6 +1428,22 @@ static const struct workload_balancer all_balancers[] = {
 		.desc = "Static round-robin VCS assignment at context creation.",
 		.balance = context_balance,
 	},
+	{
+		.id = 9,
+		.name = "busy",
+		.desc = "Engine busyness based balancing.",
+		.init = busy_init,
+		.get_qd = get_engine_busy,
+		.balance = busy_balance,
+	},
+	{
+		.id = 10,
+		.name = "busy-avg",
+		.desc = "Average engine busyness based balancing.",
+		.init = busy_init,
+		.get_qd = get_engine_busy,
+		.balance = busy_avg_balance,
+	},
 };
 
 static unsigned int
@@ -2226,6 +2355,17 @@ int main(int argc, char **argv)
 				    (verbose > 0 && master_workload == i);
 
 		prepare_workload(i, w[i], flags_);
+
+		if (balancer && balancer->init) {
+			int ret = balancer->init(balancer, w[i]);
+			if (ret) {
+				if (verbose)
+					fprintf(stderr,
+						"Failed to initialize balancing! (%u=%d)\n",
+						i, ret);
+				return 1;
+			}
+		}
 	}
 
 	gem_quiescent_gpu(fd);
-- 
2.9.5

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH i-g-t 7/7] media-bench.pl: Add busy balancers to the list
  2017-09-25 15:14 [PATCH i-g-t v2 0/7] IGT PMU support Tvrtko Ursulin
                   ` (5 preceding siblings ...)
  2017-09-25 15:15 ` [PATCH i-g-t 6/7] gem_wsim: Busy stats balancers Tvrtko Ursulin
@ 2017-09-25 15:15 ` Tvrtko Ursulin
  2017-09-25 16:06 ` ✓ Fi.CI.BAT: success for IGT PMU support (rev4) Patchwork
  2017-09-25 22:16 ` ✓ Fi.CI.IGT: " Patchwork
  8 siblings, 0 replies; 25+ messages in thread
From: Tvrtko Ursulin @ 2017-09-25 15:15 UTC (permalink / raw)
  To: Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 scripts/media-bench.pl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/scripts/media-bench.pl b/scripts/media-bench.pl
index 0956ef0a0621..78f45199e95d 100755
--- a/scripts/media-bench.pl
+++ b/scripts/media-bench.pl
@@ -47,8 +47,9 @@ my $nop;
 my %opts;
 
 my @balancers = ( 'rr', 'rand', 'qd', 'qdr', 'qdavg', 'rt', 'rtr', 'rtavg',
-		  'context' );
-my %bal_skip_H = ( 'rr' => 1, 'rand' => 1, 'context' => 1 );
+		  'context', 'busy', 'busy-avg' );
+my %bal_skip_H = ( 'rr' => 1, 'rand' => 1, 'context' => 1, , 'busy' => 1,
+		   'busy-avg' => 1 );
 my %bal_skip_R = ( 'context' => 1 );
 
 my @workloads = (
-- 
2.9.5

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH i-g-t 1/7] intel-gpu-overlay: Move local perf implementation to a library
  2017-09-25 15:14 ` [PATCH i-g-t 1/7] intel-gpu-overlay: Move local perf implementation to a library Tvrtko Ursulin
@ 2017-09-25 15:22   ` Chris Wilson
  2017-09-26 10:52     ` Tvrtko Ursulin
  0 siblings, 1 reply; 25+ messages in thread
From: Chris Wilson @ 2017-09-25 15:22 UTC (permalink / raw)
  To: Tvrtko Ursulin, Intel-gfx

Quoting Tvrtko Ursulin (2017-09-25 16:14:56)
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>  lib/Makefile.sources             | 2 ++
>  overlay/perf.c => lib/igt_perf.c | 2 +-
>  overlay/perf.h => lib/igt_perf.h | 2 ++
>  overlay/Makefile.am              | 6 ++----
>  overlay/gem-interrupts.c         | 3 ++-
>  overlay/gpu-freq.c               | 3 ++-
>  overlay/gpu-perf.c               | 3 ++-
>  overlay/gpu-top.c                | 3 ++-
>  overlay/power.c                  | 3 ++-
>  overlay/rc6.c                    | 3 ++-
>  10 files changed, 19 insertions(+), 11 deletions(-)
>  rename overlay/perf.c => lib/igt_perf.c (94%)
>  rename overlay/perf.h => lib/igt_perf.h (99%)
> 
> diff --git a/lib/Makefile.sources b/lib/Makefile.sources
> index 53fdb54cbfa5..c031cb502469 100644
> --- a/lib/Makefile.sources
> +++ b/lib/Makefile.sources
> @@ -16,6 +16,8 @@ lib_source_list =             \
>         igt_gt.h                \
>         igt_gvt.c               \
>         igt_gvt.h               \
> +       igt_perf.c              \
> +       igt_perf.h              \
>         igt_primes.c            \
>         igt_primes.h            \
>         igt_rand.c              \
> diff --git a/overlay/perf.c b/lib/igt_perf.c
> similarity index 94%
> rename from overlay/perf.c
> rename to lib/igt_perf.c
> index b8fdc675c587..45cccff0ae53 100644
> --- a/overlay/perf.c
> +++ b/lib/igt_perf.c
> @@ -3,7 +3,7 @@
>  #include <unistd.h>
>  #include <stdlib.h>
>  
> -#include "perf.h"
> +#include "igt_perf.h"
>  
>  uint64_t i915_type_id(void)
>  {
> diff --git a/overlay/perf.h b/lib/igt_perf.h
> similarity index 99%
> rename from overlay/perf.h
> rename to lib/igt_perf.h
> index c44e65f9734c..a80b311cd1d1 100644
> --- a/overlay/perf.h
> +++ b/lib/igt_perf.h
> @@ -1,6 +1,8 @@
>  #ifndef I915_PERF_H
>  #define I915_PERF_H
>  
> +#include <stdint.h>
> +
>  #include <linux/perf_event.h>
>  
>  #define I915_SAMPLE_BUSY       0
> diff --git a/overlay/Makefile.am b/overlay/Makefile.am
> index 5472514efc16..c66a80f4e571 100644
> --- a/overlay/Makefile.am
> +++ b/overlay/Makefile.am
> @@ -4,8 +4,8 @@ endif
>  
>  AM_CPPFLAGS = -I.
>  AM_CFLAGS = $(DRM_CFLAGS) $(PCIACCESS_CFLAGS) $(CWARNFLAGS) \
> -       $(CAIRO_CFLAGS) $(OVERLAY_CFLAGS) $(WERROR_CFLAGS)
> -LDADD = $(DRM_LIBS) $(PCIACCESS_LIBS) $(CAIRO_LIBS) $(OVERLAY_LIBS)
> +       $(CAIRO_CFLAGS) $(OVERLAY_CFLAGS) $(WERROR_CFLAGS) -I$(srcdir)/../lib
> +LDADD = $(DRM_LIBS) $(PCIACCESS_LIBS) $(CAIRO_LIBS) $(OVERLAY_LIBS) $(top_builddir)/lib/libintel_tools.la

It was a conscious decision that tools wouldn't pull in the test lib. We
are nowhere close to having a stable library api, i.e. a long way from
being installable. An issue if this is turned into a .so, which has
happened in a nearby universe.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH i-g-t 2/7] intel-gpu-overlay: Consolidate perf PMU access to library
  2017-09-25 15:14 ` [PATCH i-g-t 2/7] intel-gpu-overlay: Consolidate perf PMU access to library Tvrtko Ursulin
@ 2017-09-25 15:25   ` Chris Wilson
  0 siblings, 0 replies; 25+ messages in thread
From: Chris Wilson @ 2017-09-25 15:25 UTC (permalink / raw)
  To: Tvrtko Ursulin, Intel-gfx

Quoting Tvrtko Ursulin (2017-09-25 16:14:57)
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>  lib/igt_perf.c           | 33 +++++++++++++++++++++++++++++++++
>  lib/igt_perf.h           |  2 ++
>  overlay/gem-interrupts.c | 16 +---------------
>  overlay/gpu-freq.c       | 22 ++--------------------
>  overlay/gpu-top.c        | 32 ++++++++------------------------
>  overlay/power.c          | 17 +----------------
>  overlay/rc6.c            | 24 +++---------------------
>  7 files changed, 50 insertions(+), 96 deletions(-)
> 
> diff --git a/lib/igt_perf.c b/lib/igt_perf.c
> index 45cccff0ae53..0fa5ae3acb66 100644
> --- a/lib/igt_perf.c
> +++ b/lib/igt_perf.c
> @@ -2,6 +2,8 @@
>  #include <fcntl.h>
>  #include <unistd.h>
>  #include <stdlib.h>
> +#include <string.h>
> +#include <errno.h>
>  
>  #include "igt_perf.h"
>  
> @@ -24,3 +26,34 @@ uint64_t i915_type_id(void)
>         return strtoull(buf, 0, 0);
>  }
>  
> +static int _perf_open(int config, int group, int format)
> +{
> +       struct perf_event_attr attr;
> +
> +       memset(&attr, 0, sizeof (attr));
> +
> +       attr.type = i915_type_id();
> +       if (attr.type == 0)
> +               return -ENOENT;
> +
> +       attr.config = config;
> +
> +       if (group >= 0)
> +               format &= ~PERF_FORMAT_GROUP;
> +
> +       attr.read_format = format;
> +
> +       return perf_event_open(&attr, -1, 0, group, 0);
> +

Stray \n.

> +}
> +
> +int perf_i915_open(int config)
> +{
> +       return _perf_open(config, -1, PERF_FORMAT_TOTAL_TIME_ENABLED);
> +}
> +
> +int perf_i915_open_group(int config, int group)
> +{
> +       return _perf_open(config, group,
> +                         PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_GROUP);
> +}
> diff --git a/lib/igt_perf.h b/lib/igt_perf.h
> index a80b311cd1d1..8e674c3a3755 100644
> --- a/lib/igt_perf.h
> +++ b/lib/igt_perf.h
> @@ -62,5 +62,7 @@ perf_event_open(struct perf_event_attr *attr,
>  }
>  
>  uint64_t i915_type_id(void);
> +int perf_i915_open(int config);
> +int perf_i915_open_group(int config, int group);
>  
>  #endif /* I915_PERF_H */
> diff --git a/overlay/gem-interrupts.c b/overlay/gem-interrupts.c
> index 7ba54fcd487d..a84aef0398a7 100644
> --- a/overlay/gem-interrupts.c
> +++ b/overlay/gem-interrupts.c
> @@ -36,20 +36,6 @@
>  #include "gem-interrupts.h"
>  #include "debugfs.h"
>  
> -static int perf_open(void)
> -{
> -       struct perf_event_attr attr;
> -
> -       memset(&attr, 0, sizeof (attr));
> -
> -       attr.type = i915_type_id();
> -       if (attr.type == 0)
> -               return -ENOENT;
> -       attr.config = I915_PERF_INTERRUPTS;
> -
> -       return perf_event_open(&attr, -1, 0, -1, 0);
> -}
> -
>  static long long debugfs_read(void)
>  {
>         char buf[8192], *b;
> @@ -127,7 +113,7 @@ int gem_interrupts_init(struct gem_interrupts *irqs)
>  {
>         memset(irqs, 0, sizeof(*irqs));
>  
> -       irqs->fd = perf_open();
> +       irqs->fd = perf_i915_open(I915_PERF_INTERRUPTS);
>         if (irqs->fd < 0 && interrupts_read() < 0)
>                 irqs->error = ENODEV;
>  
> diff --git a/overlay/gpu-freq.c b/overlay/gpu-freq.c
> index 7f29b1aa986e..76c5ed9acfd1 100644
> --- a/overlay/gpu-freq.c
> +++ b/overlay/gpu-freq.c
> @@ -33,30 +33,12 @@
>  #include "gpu-freq.h"
>  #include "debugfs.h"
>  
> -static int perf_i915_open(int config, int group)
> -{
> -       struct perf_event_attr attr;
> -
> -       memset(&attr, 0, sizeof (attr));
> -
> -       attr.type = i915_type_id();
> -       if (attr.type == 0)
> -               return -ENOENT;
> -       attr.config = config;
> -
> -       attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED;
> -       if (group == -1)
> -               attr.read_format |= PERF_FORMAT_GROUP;
> -
> -       return perf_event_open(&attr, -1, 0, group, 0);
> -}
> -
>  static int perf_open(void)
>  {
>         int fd;
>  
> -       fd = perf_i915_open(I915_PERF_ACTUAL_FREQUENCY, -1);
> -       if (perf_i915_open(I915_PERF_REQUESTED_FREQUENCY, fd) < 0) {
> +       fd = perf_i915_open_group(I915_PERF_ACTUAL_FREQUENCY, -1);
> +       if (perf_i915_open_group(I915_PERF_REQUESTED_FREQUENCY, fd) < 0) {
>                 close(fd);
>                 fd = -1;
>         }
> diff --git a/overlay/gpu-top.c b/overlay/gpu-top.c
> index 06f489dfdc83..812f47d5aced 100644
> --- a/overlay/gpu-top.c
> +++ b/overlay/gpu-top.c
> @@ -48,24 +48,6 @@
>  #define I915_PERF_RING_WAIT(n) (__I915_PERF_RING(n) + 1)
>  #define I915_PERF_RING_SEMA(n) (__I915_PERF_RING(n) + 2)
>  
> -static int perf_i915_open(int config, int group)
> -{
> -       struct perf_event_attr attr;
> -
> -       memset(&attr, 0, sizeof (attr));
> -
> -       attr.type = i915_type_id();
> -       if (attr.type == 0)
> -               return -ENOENT;
> -       attr.config = config;
> -
> -       attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED;
> -       if (group == -1)
> -               attr.read_format |= PERF_FORMAT_GROUP;
> -
> -       return perf_event_open(&attr, -1, 0, group, 0);
> -}
> -
>  static int perf_init(struct gpu_top *gt)
>  {
>         const char *names[] = {
> @@ -77,27 +59,29 @@ static int perf_init(struct gpu_top *gt)
>         };
>         int n;
>  
> -       gt->fd = perf_i915_open(I915_PERF_RING_BUSY(0), -1);
> +       gt->fd = perf_i915_open_group(I915_PERF_RING_BUSY(0), -1);
>         if (gt->fd < 0)
>                 return -1;
>  
> -       if (perf_i915_open(I915_PERF_RING_WAIT(0), gt->fd) >= 0)
> +       if (perf_i915_open_group(I915_PERF_RING_WAIT(0), gt->fd) >= 0)
>                 gt->have_wait = 1;
>  
> -       if (perf_i915_open(I915_PERF_RING_SEMA(0), gt->fd) >= 0)
> +       if (perf_i915_open_group(I915_PERF_RING_SEMA(0), gt->fd) >= 0)
>                 gt->have_sema = 1;
>  
>         gt->ring[0].name = names[0];
>         gt->num_rings = 1;
>  
>         for (n = 1; names[n]; n++) {
> -               if (perf_i915_open(I915_PERF_RING_BUSY(n), gt->fd) >= 0) {
> +               if (perf_i915_open_group(I915_PERF_RING_BUSY(n), gt->fd) >= 0) {
>                         if (gt->have_wait &&
> -                           perf_i915_open(I915_PERF_RING_WAIT(n), gt->fd) < 0)
> +                           perf_i915_open_group(I915_PERF_RING_WAIT(n),
> +                                                gt->fd) < 0)
>                                 return -1;
>  
>                         if (gt->have_sema &&
> -                           perf_i915_open(I915_PERF_RING_SEMA(n), gt->fd) < 0)
> +                           perf_i915_open_group(I915_PERF_RING_SEMA(n),
> +                                                gt->fd) < 0)
>                                 return -1;
>  
>                         gt->ring[gt->num_rings++].name = names[n];
> diff --git a/overlay/power.c b/overlay/power.c
> index 84d860cae40c..dd4aec6bffd9 100644
> --- a/overlay/power.c
> +++ b/overlay/power.c
> @@ -38,21 +38,6 @@
>  
>  /* XXX Is this exposed through RAPL? */
>  
> -static int perf_open(void)
> -{
> -       struct perf_event_attr attr;
> -
> -       memset(&attr, 0, sizeof (attr));
> -
> -       attr.type = i915_type_id();
> -       if (attr.type == 0)
> -               return -1;
> -       attr.config = I915_PERF_ENERGY;
> -
> -       attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED;
> -       return perf_event_open(&attr, -1, 0, -1, 0);
> -}
> -
>  int power_init(struct power *power)
>  {
>         char buf[4096];
> @@ -60,7 +45,7 @@ int power_init(struct power *power)
>  
>         memset(power, 0, sizeof(*power));
>  
> -       power->fd = perf_open();
> +       power->fd = perf_i915_open(I915_PERF_ENERGY);
>         if (power->fd != -1)
>                 return 0;
>  
> diff --git a/overlay/rc6.c b/overlay/rc6.c
> index 3175bb22308f..46c975a557ff 100644
> --- a/overlay/rc6.c
> +++ b/overlay/rc6.c
> @@ -35,24 +35,6 @@
>  
>  #include "rc6.h"
>  
> -static int perf_i915_open(int config, int group)
> -{
> -       struct perf_event_attr attr;
> -
> -       memset(&attr, 0, sizeof (attr));
> -
> -       attr.type = i915_type_id();
> -       if (attr.type == 0)
> -               return -ENOENT;
> -       attr.config = config;
> -
> -       attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED;
> -       if (group == -1)
> -               attr.read_format |= PERF_FORMAT_GROUP;
> -
> -       return perf_event_open(&attr, -1, 0, group, 0);
> -}
> -
>  #define RC6    (1<<0)
>  #define RC6p   (1<<1)
>  #define RC6pp  (1<<2)
> @@ -61,15 +43,15 @@ static int perf_open(unsigned *flags)
>  {
>         int fd;
>  
> -       fd = perf_i915_open(I915_PERF_RC6_RESIDENCY, -1);
> +       fd = perf_i915_open_group(I915_PERF_RC6_RESIDENCY, -1);
>         if (fd < 0)
>                 return -1;
>  
>         *flags |= RC6;
> -       if (perf_i915_open(I915_PERF_RC6p_RESIDENCY, fd) >= 0)
> +       if (perf_i915_open_group(I915_PERF_RC6p_RESIDENCY, fd) >= 0)
>                 *flags |= RC6p;
>  
> -       if (perf_i915_open(I915_PERF_RC6pp_RESIDENCY, fd) >= 0)
> +       if (perf_i915_open_group(I915_PERF_RC6pp_RESIDENCY, fd) >= 0)
>                 *flags |= RC6pp;
>  

Ok, sensible consolidation.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH i-g-t 4/7] intel-gpu-overlay: Catch-up to new i915 PMU
  2017-09-25 15:14 ` [PATCH i-g-t 4/7] intel-gpu-overlay: Catch-up to new i915 PMU Tvrtko Ursulin
@ 2017-09-25 15:31   ` Chris Wilson
  2017-09-26 10:56     ` Tvrtko Ursulin
  0 siblings, 1 reply; 25+ messages in thread
From: Chris Wilson @ 2017-09-25 15:31 UTC (permalink / raw)
  To: Tvrtko Ursulin, Intel-gfx

Quoting Tvrtko Ursulin (2017-09-25 16:14:59)
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>  lib/igt_perf.h           | 93 ++++++++++++++++++++++++++++++++++--------------
>  overlay/gem-interrupts.c |  2 +-
>  overlay/gpu-freq.c       |  4 +--
>  overlay/gpu-top.c        | 68 +++++++++++++++++++----------------
>  overlay/power.c          |  4 +--
>  overlay/rc6.c            |  6 ++--
>  6 files changed, 111 insertions(+), 66 deletions(-)
> 
> diff --git a/lib/igt_perf.h b/lib/igt_perf.h
> index 8e674c3a3755..e29216f0500a 100644
> --- a/lib/igt_perf.h
> +++ b/lib/igt_perf.h
> @@ -1,3 +1,27 @@
> +/*
> + * Copyright © 2017 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + *
> + */
> +
>  #ifndef I915_PERF_H
>  #define I915_PERF_H
>  
> @@ -5,41 +29,56 @@
>  
>  #include <linux/perf_event.h>
>  
> -#define I915_SAMPLE_BUSY       0
> -#define I915_SAMPLE_WAIT       1
> -#define I915_SAMPLE_SEMA       2
> +enum drm_i915_gem_engine_class {
> +       I915_ENGINE_CLASS_OTHER = 0,
> +       I915_ENGINE_CLASS_RENDER = 1,
> +       I915_ENGINE_CLASS_COPY = 2,
> +       I915_ENGINE_CLASS_VIDEO = 3,
> +       I915_ENGINE_CLASS_VIDEO_ENHANCE = 4,
> +       I915_ENGINE_CLASS_MAX /* non-ABI */
> +};
> +
> +enum drm_i915_pmu_engine_sample {
> +       I915_SAMPLE_QUEUED = 0,
> +       I915_SAMPLE_BUSY = 1,
> +       I915_SAMPLE_WAIT = 2,
> +       I915_SAMPLE_SEMA = 3,
> +       I915_ENGINE_SAMPLE_MAX /* non-ABI */
> +};
> +
> +#define I915_PMU_SAMPLE_BITS (4)
> +#define I915_PMU_SAMPLE_MASK (0xf)
> +#define I915_PMU_SAMPLE_INSTANCE_BITS (8)
> +#define I915_PMU_CLASS_SHIFT \
> +       (I915_PMU_SAMPLE_BITS + I915_PMU_SAMPLE_INSTANCE_BITS)
>  
> -#define I915_SAMPLE_RCS                0
> -#define I915_SAMPLE_VCS                1
> -#define I915_SAMPLE_BCS                2
> -#define I915_SAMPLE_VECS       3
> +#define __I915_PMU_ENGINE(class, instance, sample) \
> +       ((class) << I915_PMU_CLASS_SHIFT | \
> +       (instance) << I915_PMU_SAMPLE_BITS | \
> +       (sample))
>  
> -#define __I915_PERF_COUNT(ring, id) ((ring) << 4 | (id))
> +#define I915_PMU_ENGINE_QUEUED(class, instance) \
> +       __I915_PMU_ENGINE(class, instance, I915_SAMPLE_QUEUED)
>  
> -#define I915_PERF_COUNT_RCS_BUSY __I915_PERF_COUNT(I915_SAMPLE_RCS, I915_SAMPLE_BUSY)
> -#define I915_PERF_COUNT_RCS_WAIT __I915_PERF_COUNT(I915_SAMPLE_RCS, I915_SAMPLE_WAIT)
> -#define I915_PERF_COUNT_RCS_SEMA __I915_PERF_COUNT(I915_SAMPLE_RCS, I915_SAMPLE_SEMA)
> +#define I915_PMU_ENGINE_BUSY(class, instance) \
> +       __I915_PMU_ENGINE(class, instance, I915_SAMPLE_BUSY)
>  
> -#define I915_PERF_COUNT_VCS_BUSY __I915_PERF_COUNT(I915_SAMPLE_VCS, I915_SAMPLE_BUSY)
> -#define I915_PERF_COUNT_VCS_WAIT __I915_PERF_COUNT(I915_SAMPLE_VCS, I915_SAMPLE_WAIT)
> -#define I915_PERF_COUNT_VCS_SEMA __I915_PERF_COUNT(I915_SAMPLE_VCS, I915_SAMPLE_SEMA)
> +#define I915_PMU_ENGINE_WAIT(class, instance) \
> +       __I915_PMU_ENGINE(class, instance, I915_SAMPLE_WAIT)
>  
> -#define I915_PERF_COUNT_BCS_BUSY __I915_PERF_COUNT(I915_SAMPLE_BCS, I915_SAMPLE_BUSY)
> -#define I915_PERF_COUNT_BCS_WAIT __I915_PERF_COUNT(I915_SAMPLE_BCS, I915_SAMPLE_WAIT)
> -#define I915_PERF_COUNT_BCS_SEMA __I915_PERF_COUNT(I915_SAMPLE_BCS, I915_SAMPLE_SEMA)
> +#define I915_PMU_ENGINE_SEMA(class, instance) \
> +       __I915_PMU_ENGINE(class, instance, I915_SAMPLE_SEMA)
>  
> -#define I915_PERF_COUNT_VECS_BUSY __I915_PERF_COUNT(I915_SAMPLE_VECS, I915_SAMPLE_BUSY)
> -#define I915_PERF_COUNT_VECS_WAIT __I915_PERF_COUNT(I915_SAMPLE_VECS, I915_SAMPLE_WAIT)
> -#define I915_PERF_COUNT_VECS_SEMA __I915_PERF_COUNT(I915_SAMPLE_VECS, I915_SAMPLE_SEMA)
> +#define __I915_PMU_OTHER(x) (__I915_PMU_ENGINE(0xff, 0xff, 0xf) + 1 + (x))
>  
> -#define I915_PERF_ACTUAL_FREQUENCY 32
> -#define I915_PERF_REQUESTED_FREQUENCY 33
> -#define I915_PERF_ENERGY 34
> -#define I915_PERF_INTERRUPTS 35
> +#define I915_PMU_ACTUAL_FREQUENCY      __I915_PMU_OTHER(0)
> +#define I915_PMU_REQUESTED_FREQUENCY   __I915_PMU_OTHER(1)
> +#define I915_PMU_INTERRUPTS            __I915_PMU_OTHER(2)
> +#define I915_PMU_RC6_RESIDENCY         __I915_PMU_OTHER(3)
> +#define I915_PMU_RC6p_RESIDENCY                __I915_PMU_OTHER(4)
> +#define I915_PMU_RC6pp_RESIDENCY       __I915_PMU_OTHER(5)
>  
> -#define I915_PERF_RC6_RESIDENCY                40
> -#define I915_PERF_RC6p_RESIDENCY       41
> -#define I915_PERF_RC6pp_RESIDENCY      42
> +#define I915_PMU_LAST I915_PMU_RC6pp_RESIDENCY
>  
>  static inline int
>  perf_event_open(struct perf_event_attr *attr,
> diff --git a/overlay/gem-interrupts.c b/overlay/gem-interrupts.c
> index 3eda24f4d7eb..add4a9dfd725 100644
> --- a/overlay/gem-interrupts.c
> +++ b/overlay/gem-interrupts.c
> @@ -113,7 +113,7 @@ int gem_interrupts_init(struct gem_interrupts *irqs)
>  {
>         memset(irqs, 0, sizeof(*irqs));
>  
> -       irqs->fd = perf_i915_open(I915_PERF_INTERRUPTS);
> +       irqs->fd = perf_i915_open(I915_PMU_INTERRUPTS);
>         if (irqs->fd < 0 && interrupts_read() < 0)
>                 irqs->error = ENODEV;
>  
> diff --git a/overlay/gpu-freq.c b/overlay/gpu-freq.c
> index 76c5ed9acfd1..c4619b87242a 100644
> --- a/overlay/gpu-freq.c
> +++ b/overlay/gpu-freq.c
> @@ -37,8 +37,8 @@ static int perf_open(void)
>  {
>         int fd;
>  
> -       fd = perf_i915_open_group(I915_PERF_ACTUAL_FREQUENCY, -1);
> -       if (perf_i915_open_group(I915_PERF_REQUESTED_FREQUENCY, fd) < 0) {
> +       fd = perf_i915_open_group(I915_PMU_ACTUAL_FREQUENCY, -1);
> +       if (perf_i915_open_group(I915_PMU_REQUESTED_FREQUENCY, fd) < 0) {
>                 close(fd);
>                 fd = -1;
>         }
> diff --git a/overlay/gpu-top.c b/overlay/gpu-top.c
> index 812f47d5aced..61b8f62fd78c 100644
> --- a/overlay/gpu-top.c
> +++ b/overlay/gpu-top.c
> @@ -43,49 +43,57 @@
>  #define   RING_WAIT            (1<<11)
>  #define   RING_WAIT_SEMAPHORE  (1<<10)
>  
> -#define __I915_PERF_RING(n) (4*n)
> -#define I915_PERF_RING_BUSY(n) (__I915_PERF_RING(n) + 0)
> -#define I915_PERF_RING_WAIT(n) (__I915_PERF_RING(n) + 1)
> -#define I915_PERF_RING_SEMA(n) (__I915_PERF_RING(n) + 2)
> -
>  static int perf_init(struct gpu_top *gt)
>  {
> -       const char *names[] = {
> -               "RCS",
> -               "BCS",
> -               "VCS0",
> -               "VCS1",
> -               NULL,
> +       struct engine_desc {
> +               unsigned class, inst;
> +               const char *name;
> +       } *d, engines[] = {
> +               { I915_ENGINE_CLASS_RENDER, 0, "rcs0" },
> +               { I915_ENGINE_CLASS_COPY, 0, "bcs0" },
> +               { I915_ENGINE_CLASS_VIDEO, 0, "vcs0" },
> +               { I915_ENGINE_CLASS_VIDEO, 1, "vcs1" },
> +               { I915_ENGINE_CLASS_VIDEO_ENHANCE, 0, "vecs0" },

Hmm, there is some hidden coupling with colours atm, but other than that
the order is flexible, iirc.

> +               { 0, 0, NULL }
>         };
> -       int n;
>  
> -       gt->fd = perf_i915_open_group(I915_PERF_RING_BUSY(0), -1);
> +       d = &engines[0];
> +
> +       gt->fd = perf_i915_open_group(I915_PMU_ENGINE_BUSY(d->class, d->inst),
> +                                     -1);
>         if (gt->fd < 0)
>                 return -1;
>  
> -       if (perf_i915_open_group(I915_PERF_RING_WAIT(0), gt->fd) >= 0)
> +       if (perf_i915_open_group(I915_PMU_ENGINE_WAIT(d->class, d->inst),
> +                                gt->fd) >= 0)
>                 gt->have_wait = 1;
>  
> -       if (perf_i915_open_group(I915_PERF_RING_SEMA(0), gt->fd) >= 0)
> +       if (perf_i915_open_group(I915_PMU_ENGINE_SEMA(d->class, d->inst),
> +                                gt->fd) >= 0)
>                 gt->have_sema = 1;
>  
> -       gt->ring[0].name = names[0];
> +       gt->ring[0].name = d->name;
>         gt->num_rings = 1;
>  
> -       for (n = 1; names[n]; n++) {
> -               if (perf_i915_open_group(I915_PERF_RING_BUSY(n), gt->fd) >= 0) {
> -                       if (gt->have_wait &&
> -                           perf_i915_open_group(I915_PERF_RING_WAIT(n),
> -                                                gt->fd) < 0)
> -                               return -1;
> -
> -                       if (gt->have_sema &&
> -                           perf_i915_open_group(I915_PERF_RING_SEMA(n),
> -                                                gt->fd) < 0)
> -                               return -1;
> -
> -                       gt->ring[gt->num_rings++].name = names[n];
> -               }
> +       for (d++; d->name; d++) {
> +               if (perf_i915_open_group(I915_PMU_ENGINE_BUSY(d->class,
> +                                                             d->inst),
> +                                       gt->fd) < 0)
> +                       continue;
> +
> +               if (gt->have_wait &&
> +                   perf_i915_open_group(I915_PMU_ENGINE_WAIT(d->class,
> +                                                             d->inst),
> +                                        gt->fd) < 0)
> +                       return -1;
> +
> +               if (gt->have_sema &&
> +                   perf_i915_open_group(I915_PMU_ENGINE_SEMA(d->class,
> +                                                             d->inst),
> +                                  gt->fd) < 0)
> +                       return -1;
> +
> +               gt->ring[gt->num_rings++].name = d->name;
>         }
>  
>         return 0;
> diff --git a/overlay/power.c b/overlay/power.c
> index dd4aec6bffd9..805f4ca7805c 100644
> --- a/overlay/power.c
> +++ b/overlay/power.c
> @@ -45,9 +45,7 @@ int power_init(struct power *power)
>  
>         memset(power, 0, sizeof(*power));
>  
> -       power->fd = perf_i915_open(I915_PERF_ENERGY);
> -       if (power->fd != -1)
> -               return 0;
> +       power->fd = -1;

Hmm, didn't you say that the rapl values were exposed via perf as well?
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* ✓ Fi.CI.BAT: success for IGT PMU support (rev4)
  2017-09-25 15:14 [PATCH i-g-t v2 0/7] IGT PMU support Tvrtko Ursulin
                   ` (6 preceding siblings ...)
  2017-09-25 15:15 ` [PATCH i-g-t 7/7] media-bench.pl: Add busy balancers to the list Tvrtko Ursulin
@ 2017-09-25 16:06 ` Patchwork
  2017-09-25 22:16 ` ✓ Fi.CI.IGT: " Patchwork
  8 siblings, 0 replies; 25+ messages in thread
From: Patchwork @ 2017-09-25 16:06 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

== Series Details ==

Series: IGT PMU support (rev4)
URL   : https://patchwork.freedesktop.org/series/28253/
State : success

== Summary ==

IGT patchset tested on top of latest successful build
c117213c06d0f47937c1f225ebead5e1fe8c7a0e igt/gem_exec_whisper: Smoketest context priorities

with latest DRM-Tip kernel build CI_DRM_3130
0b6507738260 drm-tip: 2017y-09m-25d-13h-55m-18s UTC integration manifest

Test kms_pipe_crc_basic:
        Subgroup suspend-read-crc-pipe-b:
                dmesg-warn -> PASS       (fi-skl-6770hq)
Test pm_rpm:
        Subgroup basic-rte:
                dmesg-warn -> PASS       (fi-cfl-s) fdo#102294

fdo#102294 https://bugs.freedesktop.org/show_bug.cgi?id=102294

fi-bdw-5557u     total:289  pass:268  dwarn:0   dfail:0   fail:0   skip:21  time:453s
fi-bdw-gvtdvm    total:289  pass:265  dwarn:0   dfail:0   fail:0   skip:24  time:471s
fi-blb-e6850     total:289  pass:224  dwarn:1   dfail:0   fail:0   skip:64  time:420s
fi-bsw-n3050     total:289  pass:243  dwarn:0   dfail:0   fail:0   skip:46  time:517s
fi-bwr-2160      total:289  pass:184  dwarn:0   dfail:0   fail:0   skip:105 time:280s
fi-bxt-j4205     total:289  pass:260  dwarn:0   dfail:0   fail:0   skip:29  time:500s
fi-byt-j1900     total:289  pass:254  dwarn:1   dfail:0   fail:0   skip:34  time:495s
fi-byt-n2820     total:289  pass:250  dwarn:1   dfail:0   fail:0   skip:38  time:501s
fi-cfl-s         total:289  pass:223  dwarn:34  dfail:0   fail:0   skip:32  time:543s
fi-cnl-y         total:289  pass:256  dwarn:0   dfail:0   fail:6   skip:27  time:658s
fi-elk-e7500     total:289  pass:230  dwarn:0   dfail:0   fail:0   skip:59  time:418s
fi-glk-1         total:289  pass:260  dwarn:0   dfail:0   fail:0   skip:29  time:566s
fi-hsw-4770      total:289  pass:263  dwarn:0   dfail:0   fail:0   skip:26  time:425s
fi-hsw-4770r     total:289  pass:263  dwarn:0   dfail:0   fail:0   skip:26  time:405s
fi-ilk-650       total:289  pass:229  dwarn:0   dfail:0   fail:0   skip:60  time:435s
fi-ivb-3520m     total:289  pass:261  dwarn:0   dfail:0   fail:0   skip:28  time:485s
fi-ivb-3770      total:289  pass:261  dwarn:0   dfail:0   fail:0   skip:28  time:469s
fi-kbl-7500u     total:289  pass:264  dwarn:1   dfail:0   fail:0   skip:24  time:474s
fi-kbl-7560u     total:289  pass:270  dwarn:0   dfail:0   fail:0   skip:19  time:575s
fi-kbl-r         total:289  pass:262  dwarn:0   dfail:0   fail:0   skip:27  time:593s
fi-pnv-d510      total:289  pass:223  dwarn:1   dfail:0   fail:0   skip:65  time:542s
fi-skl-6260u     total:289  pass:269  dwarn:0   dfail:0   fail:0   skip:20  time:452s
fi-skl-6700k     total:289  pass:265  dwarn:0   dfail:0   fail:0   skip:24  time:751s
fi-skl-6770hq    total:289  pass:269  dwarn:0   dfail:0   fail:0   skip:20  time:492s
fi-skl-gvtdvm    total:289  pass:266  dwarn:0   dfail:0   fail:0   skip:23  time:474s
fi-snb-2520m     total:289  pass:251  dwarn:0   dfail:0   fail:0   skip:38  time:570s
fi-snb-2600      total:289  pass:250  dwarn:0   dfail:0   fail:0   skip:39  time:419s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_248/
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH i-g-t 5/7] tests/perf_pmu: Tests for i915 PMU API
  2017-09-25 15:15 ` [PATCH i-g-t 5/7] tests/perf_pmu: Tests for i915 PMU API Tvrtko Ursulin
@ 2017-09-25 16:21   ` Chris Wilson
  2017-09-26 11:19     ` Tvrtko Ursulin
  0 siblings, 1 reply; 25+ messages in thread
From: Chris Wilson @ 2017-09-25 16:21 UTC (permalink / raw)
  To: Tvrtko Ursulin, Intel-gfx

Quoting Tvrtko Ursulin (2017-09-25 16:15:00)
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> A bunch of tests for the new i915 PMU feature.
> 
> Parts of the code were initialy sketched by Dmitry Rogozhkin.
> 
> v2: (Most suggestions by Chris Wilson)
>  * Add new class/instance based engine list.
>  * Add gem_has_engine/gem_require_engine to work with class/instance.
>  * Use the above two throughout the test.
>  * Shorten tests to 100ms busy batches, seems enough.
>  * Add queued counter sanity checks.
>  * Use igt_nsec_elapsed.
>  * Skip on perf -ENODEV in some tests instead of embedding knowledge locally.
>  * Fix multi ordering for busy accounting.
>  * Use new guranteed_usleep when sleep time is asserted on.
>  * Check for no queued when idle/busy.
>  * Add queued counter init test.
>  * Add queued tests.
>  * Consolidate and increase multiple busy engines tests to most-busy and
>    all-busy tests.
>  * Guarantte interrupts by using fences.
>  * Test RC6 via forcewake.
> 
> v3:
>  * Tweak assert in interrupts subtest.
>  * Sprinkle of comments.
>  * Fix multi-client test which got broken in v2.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
> ---
>  lib/igt_gt.c           |  50 +++
>  lib/igt_gt.h           |  38 +++
>  lib/igt_perf.h         |   9 +-
>  tests/Makefile.sources |   1 +
>  tests/perf_pmu.c       | 869 +++++++++++++++++++++++++++++++++++++++++++++++++
>  5 files changed, 959 insertions(+), 8 deletions(-)
>  create mode 100644 tests/perf_pmu.c
> 
> diff --git a/lib/igt_gt.c b/lib/igt_gt.c
> index b3f3b3809eee..4c75811fb1b3 100644
> --- a/lib/igt_gt.c
> +++ b/lib/igt_gt.c
> @@ -568,3 +568,53 @@ bool gem_can_store_dword(int fd, unsigned int engine)
>  
>         return true;
>  }
> +

> +const struct intel_execution_engine2 intel_execution_engines2[] = {

If you want to do a quick
	s/intel_execution_engine/intel_execution_legacy_ring/
or wait until class/inst is the natural execbuf interface?

> +const double tolerance = 0.02f;
> +const unsigned long batch_duration_ns = 100 * 1000 * 1000;
> +
> +static int open_pmu(uint64_t config)
> +{
> +       int fd;
> +
> +       fd = perf_i915_open(config);
> +       igt_require(fd >= 0 || (fd < 0 && errno != ENODEV));
> +       igt_assert(fd >= 0);

So why are we not allowed to disable perf counters?

> +       return fd;
> +}
> +
> +static int open_group(uint64_t config, int group)
> +{
> +       int fd;
> +
> +       fd = perf_i915_open_group(config, group);
> +       igt_require(fd >= 0 || (fd < 0 && errno != ENODEV));
> +       igt_assert(fd >= 0);
> +
> +       return fd;
> +}
> +
> +static void
> +init(int gem_fd, const struct intel_execution_engine2 *e, uint8_t sample)
> +{
> +       int fd;
> +
> +       fd = open_pmu(__I915_PMU_ENGINE(e->class, e->instance, sample));
> +
> +       close(fd);
> +}
> +
> +static uint64_t pmu_read_single(int fd)
> +{
> +       uint64_t data[2];
> +       ssize_t len;
> +
> +       len = read(fd, data, sizeof(data));
> +       igt_assert_eq(len, sizeof(data));

read() isn't a complicated macro, so won't this have a reasonable
string as igt_assert_eq(read(fd, data, sizeof(data)), sizeof(data)) ?

> +       return data[0];
> +}
> +
> +static void pmu_read_multi(int fd, unsigned int num, uint64_t *val)
> +{
> +       uint64_t buf[2 + num];
> +       unsigned int i;
> +       ssize_t len;
> +
> +       len = read(fd, buf, sizeof(buf));
> +       igt_assert_eq(len, sizeof(buf));
> +       for (i = 0; i < num; i++)
> +               val[i] = buf[2 + i];
> +}
> +
> +#define assert_within_epsilon(x, ref, tolerance) \
> +       igt_assert_f((double)(x) <= (1.0 + tolerance) * (double)ref && \
> +                    (double)(x) >= (1.0 - tolerance) * (double)ref, \
> +                    "'%s' != '%s' (%f not within %f%% tolerance of %f)\n",\
> +                    #x, #ref, (double)x, tolerance * 100.0, (double)ref)
> +
> +/*
> + * Helper for cases where we assert on time spent sleeping (directly or
> + * indirectly), so make it more robust by ensuring the system sleep time
> + * is within test tolerance to start with.
> + */

Looking at the callers, I don't see where we need a guaranteed sleep vs
a known sleep. We are comparing elapsed/wall time with a perf counter, so
we only care that they match. By asserting we just cause random fails if
the system decides not to wake up for 1s (outside the purview of the
test).

> +static void guaranteed_usleep(unsigned int usec)
> +{
> +       uint64_t slept = 0, to_sleep = usec;
> +
> +       while (usec > 0) {
> +               struct timespec start = { };
> +               uint64_t this_sleep;
> +
> +               igt_nsec_elapsed(&start);
> +               usleep(usec);
> +               this_sleep = igt_nsec_elapsed(&start) / 1000;
> +               slept += this_sleep;
> +               if (this_sleep > usec)
> +                       break;
> +               usec -= this_sleep;
> +       }
> +
> +       assert_within_epsilon(slept, to_sleep, tolerance);
> +}
> +
> +static unsigned int e2ring(int gem_fd, const struct intel_execution_engine2 *e)
> +{
> +       return gem_class_instance_to_eb_flags(gem_fd, e->class, e->instance);
> +}
> +
> +static void
> +single(int gem_fd, const struct intel_execution_engine2 *e, uint8_t sample,
> +       bool busy)
> +{
> +       double ref = busy && sample == I915_SAMPLE_BUSY ?
> +                    batch_duration_ns : 0.0f;
> +       igt_spin_t *spin;
> +       uint64_t val;
> +       int fd;
> +
> +       fd = open_pmu(__I915_PMU_ENGINE(e->class, e->instance, sample));
> +
> +       if (busy) {
> +               spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
> +               igt_spin_batch_set_timeout(spin, batch_duration_ns);
> +       } else {
> +               guaranteed_usleep(batch_duration_ns / 1000);
> +       }
> +
> +       if (busy)
> +               gem_sync(gem_fd, spin->handle);
> +
> +       val = pmu_read_single(fd);
> +
> +       assert_within_epsilon(val, ref, tolerance);
> +
> +       if (busy)
> +               igt_spin_batch_free(gem_fd, spin);
> +       close(fd);
> +}
> +
> +static void
> +queued(int gem_fd, const struct intel_execution_engine2 *e)
> +{
> +       igt_spin_t *spin[2];
> +       uint64_t val;
> +       int fd;
> +
> +       fd = open_pmu(I915_PMU_ENGINE_QUEUED(e->class, e->instance));
> +
> +       /*
> +        * First spin batch will be immediately executing.
> +        */
> +       spin[0] = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
> +       igt_spin_batch_set_timeout(spin[0], batch_duration_ns);
> +
> +       /*
> +        * Second spin batch will sit in the execution queue behind the
> +        * first one so must cause the PMU to correctly report the queued
> +        * counter.
> +        */

Hmm, the second to the same ctx should be executed. We will see that
port[1] is empty and so ask if we can fit the batch into port[1]. In
fact, it turns out that we can coalesce the batch with port[0].
Once coalesced, we can't distinguish the two requests since they are a
single context in the ELSP.

All this is presuming execlists...

iirc execbuf-ioctl() -> [untracked]
  \- submit_request -> QUEUED
      \- execlists_dequeue -> BUSY


> +       spin[1] = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
> +       igt_spin_batch_set_timeout(spin[1], batch_duration_ns);
> +
> +       gem_sync(gem_fd, spin[0]->handle);
> +
> +       val = pmu_read_single(fd);
> +       assert_within_epsilon(val, batch_duration_ns, tolerance);
> +
> +       gem_sync(gem_fd, spin[1]->handle);
> +
> +       igt_spin_batch_free(gem_fd, spin[0]);
> +       igt_spin_batch_free(gem_fd, spin[1]);
> +       close(fd);
> +}
> +
> +static void
> +busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
> +              const unsigned int num_engines)
> +{
> +       const struct intel_execution_engine2 *e_;
> +       uint64_t val[num_engines];
> +       int fd[num_engines];
> +       igt_spin_t *spin;
> +       unsigned int busy_idx, i;
> +
> +       i = 0;
> +       fd[0] = -1;
> +       for_each_engine_class_instance(fd, e_) {
> +               if (!gem_has_engine(gem_fd, e_->class, e_->instance))
> +                       continue;
> +               else if (e == e_)
> +                       busy_idx = i;
> +
> +               fd[i++] = open_group(I915_PMU_ENGINE_BUSY(e_->class,
> +                                                         e_->instance),
> +                                    fd[0]);
> +       }
> +
> +       spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
> +       igt_spin_batch_set_timeout(spin, batch_duration_ns);
> +
> +       gem_sync(gem_fd, spin->handle);
> +
> +       pmu_read_multi(fd[0], num_engines, val);
> +
> +       assert_within_epsilon(val[busy_idx], batch_duration_ns, tolerance);
> +       for (i = 0; i < num_engines; i++) {

double expected;

if (i == busy_idx)
	expected = batch_duration_ns;
else
	expected = 0;
assert_within_epsilon(val[i], expected, tolerance);

Probably worth a preceding

for (i = 0; i < num_engines; i++)
	len += sprintf(buf + len, "%s%s=%f, ", i == busy_idx ? "*" : "", name[i], val[i]);
buf[len-2] = '\0';
igt_info("%s\n", buf);

How about batches executing from a different context, or submitted from
a different fd?

> +               if (i == busy_idx)
> +                       continue;
> +               assert_within_epsilon(val[i], 0.0f, tolerance);
> +       }
> +
> +       igt_spin_batch_free(gem_fd, spin);
> +       close(fd[0]);
> +}
> +
> +static void
> +most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
> +                   const unsigned int num_engines)
> +{
> +       const struct intel_execution_engine2 *e_;
> +       uint64_t val[num_engines];
> +       int fd[num_engines];
> +       igt_spin_t *spin[num_engines];
> +       unsigned int idle_idx, i;
> +
> +       gem_require_engine(gem_fd, e->class, e->instance);
> +
> +       i = 0;
> +       fd[0] = -1;
> +       for_each_engine_class_instance(fd, e_) {
> +               if (!gem_has_engine(gem_fd, e_->class, e_->instance))
> +                       continue;
> +
> +               fd[i] = open_group(I915_PMU_ENGINE_BUSY(e_->class,
> +                                                       e_->instance),
> +                                  fd[0]);
> +
> +               if (e == e_) {
> +                       idle_idx = i;
> +               } else {
> +                       spin[i] = igt_spin_batch_new(gem_fd, 0,
> +                                                    e2ring(gem_fd, e_), 0);
> +                       igt_spin_batch_set_timeout(spin[i], batch_duration_ns);
> +               }
> +
> +               i++;
> +       }
> +
> +       for (i = 0; i < num_engines; i++) {
> +               if (i != idle_idx)
> +                       gem_sync(gem_fd, spin[i]->handle);
> +       }
> +
> +       pmu_read_multi(fd[0], num_engines, val);
> +
> +       for (i = 0; i < num_engines; i++) {
> +               if (i == idle_idx)
> +                       assert_within_epsilon(val[i], 0.0f, tolerance);
> +               else
> +                       assert_within_epsilon(val[i], batch_duration_ns,
> +                                             tolerance);
> +       }
> +
> +       for (i = 0; i < num_engines; i++) {
> +               if (i != idle_idx)
> +                       igt_spin_batch_free(gem_fd, spin[i]);
> +       }
> +       close(fd[0]);
> +}
> +
> +static void
> +all_busy_check_all(int gem_fd, const unsigned int num_engines)
> +{
> +       const struct intel_execution_engine2 *e;
> +       uint64_t val[num_engines];
> +       int fd[num_engines];
> +       igt_spin_t *spin[num_engines];
> +       unsigned int i;
> +
> +       i = 0;
> +       fd[0] = -1;
> +       for_each_engine_class_instance(fd, e) {
> +               if (!gem_has_engine(gem_fd, e->class, e->instance))
> +                       continue;
> +
> +               fd[i] = open_group(I915_PMU_ENGINE_BUSY(e->class, e->instance),
> +                                  fd[0]);
> +
> +               spin[i] = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
> +               igt_spin_batch_set_timeout(spin[i], batch_duration_ns);
> +
> +               i++;
> +       }
> +
> +       for (i = 0; i < num_engines; i++)
> +               gem_sync(gem_fd, spin[i]->handle);
> +
> +       pmu_read_multi(fd[0], num_engines, val);
> +
> +       for (i = 0; i < num_engines; i++)
> +               assert_within_epsilon(val[i], batch_duration_ns, tolerance);
> +
> +       for (i = 0; i < num_engines; i++)
> +               igt_spin_batch_free(gem_fd, spin[i]);
> +       close(fd[0]);
> +}
> +
> +static void
> +no_sema(int gem_fd, const struct intel_execution_engine2 *e, bool busy)
> +{
> +       igt_spin_t *spin;
> +       uint64_t val[2];
> +       int fd;
> +
> +       fd = open_group(I915_PMU_ENGINE_SEMA(e->class, e->instance), -1);
> +       open_group(I915_PMU_ENGINE_WAIT(e->class, e->instance), fd);
> +
> +       if (busy) {
> +               spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
> +               igt_spin_batch_set_timeout(spin, batch_duration_ns);

These busy paths are not serialised to the batch; it is quite possible
for the pmu_read to happen before the spin is even scheduled for
execution on the hw.

> +       } else {
> +               usleep(batch_duration_ns / 1000);
> +       }
> +
> +       pmu_read_multi(fd, 2, val);
> +
> +       assert_within_epsilon(val[0], 0.0f, tolerance);
> +       assert_within_epsilon(val[1], 0.0f, tolerance);
> +
> +       if (busy)
> +               igt_spin_batch_free(gem_fd, spin);
> +       close(fd);

I'm still at a loss as to why this test is interesting. You want to say
that given an idle engine with a single batch there is no time spent
waiting for an MI event, nor waiting on a semaphore. Where is that
guaranteed? What happens if we do need to do such a pre-batch + sync in
the future. If you wanted to say that whilst the batch is executing that
no time is spent processing requests the batch isn't making, then maybe.
But without a test to demonstrate the opposite, we still have no idea if
the counters work.

> +}
> +
> +static void
> +multi_client(int gem_fd, const struct intel_execution_engine2 *e)
> +{
> +       uint64_t config = I915_PMU_ENGINE_BUSY(e->class, e->instance);
> +       igt_spin_t *spin;
> +       uint64_t val[2];
> +       int fd[2];
> +
> +       fd[0] = open_pmu(config);
> +
> +       spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
> +       igt_spin_batch_set_timeout(spin, batch_duration_ns);
> +
> +       guaranteed_usleep(batch_duration_ns / 3000);
> +
> +       /*
> +        * Second PMU client which is initialized after the first one,
> +        * and exists before it, should not affect accounting as reported
> +        * in the first client.
> +        */
> +       fd[1] = open_pmu(config);
> +       guaranteed_usleep(batch_duration_ns / 3000);
> +       val[1] = pmu_read_single(fd[1]);
> +       close(fd[1]);
> +
> +       gem_sync(gem_fd, spin->handle);
> +
> +       val[0] = pmu_read_single(fd[0]);
> +
> +       assert_within_epsilon(val[0], batch_duration_ns, tolerance);
> +       assert_within_epsilon(val[1], batch_duration_ns / 3, tolerance);
> +
> +       igt_spin_batch_free(gem_fd, spin);
> +       close(fd[0]);
> +}
> +
> +/**
> + * Tests that i915 PMU corectly errors out in invalid initialization.
> + * i915 PMU is uncore PMU, thus:
> + *  - sampling period is not supported
> + *  - pid > 0 is not supported since we can't count per-process (we count
> + *    per whole system)
> + *  - cpu != 0 is not supported since i915 PMU exposes cpumask for CPU0
> + */
> +static void invalid_init(void)
> +{
> +       struct perf_event_attr attr;
> +       int pid, cpu;
> +
> +#define ATTR_INIT() \
> +do { \
> +       memset(&attr, 0, sizeof (attr)); \
> +       attr.config = I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_RENDER, 0); \
> +       attr.type = i915_type_id(); \
> +       igt_assert(attr.type != 0); \
> +} while(0)
> +
> +       ATTR_INIT();
> +       attr.sample_period = 100;
> +       pid = -1;
> +       cpu = 0;
> +       igt_assert_eq(perf_event_open(&attr, pid, cpu, -1, 0), -1);
> +       igt_assert_eq(errno, EINVAL);
> +
> +       ATTR_INIT();
> +       pid = 0;
> +       cpu = 0;
> +       igt_assert_eq(perf_event_open(&attr, pid, cpu, -1, 0), -1);
> +       igt_assert_eq(errno, EINVAL);
> +
> +       ATTR_INIT();
> +       pid = -1;
> +       cpu = 1;
> +       igt_assert_eq(perf_event_open(&attr, pid, cpu, -1, 0), -1);
> +       igt_assert_eq(errno, ENODEV);
> +}
> +
> +static void init_other(unsigned int i, bool valid)
> +{
> +       int fd;
> +
> +       fd = perf_i915_open(__I915_PMU_OTHER(i));
> +       igt_require(!(fd < 0 && errno == ENODEV));

Make perf_i915_open return -errno. It simplifies checks so much.

> +       if (valid) {
> +               igt_assert(fd >= 0);
> +       } else {
> +               igt_assert(fd < 0);
> +               return;
> +       }
> +
> +       close(fd);
> +}
> +
> +static void read_other(unsigned int i, bool valid)
> +{
> +       int fd;
> +
> +       fd = perf_i915_open(__I915_PMU_OTHER(i));
> +       igt_require(!(fd < 0 && errno == ENODEV));
> +       if (valid) {
> +               igt_assert(fd >= 0);
> +       } else {
> +               igt_assert(fd < 0);
> +               return;
> +       }
> +
> +       (void)pmu_read_single(fd);
> +
> +       close(fd);
> +}
> +
> +static bool cpu0_hotplug_support(void)
> +{
> +       int fd = open("/sys/devices/system/cpu/cpu0/online", O_WRONLY);

access("..", W_OK);

> +
> +       close(fd);
> +
> +       return fd > 0;
> +}
> +
> +static void cpu_hotplug(int gem_fd)
> +{
> +       struct timespec start = { };
> +       igt_spin_t *spin;
> +       uint64_t val, ref;
> +       int fd;
> +
> +       igt_require(cpu0_hotplug_support());
> +
> +       spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
> +       fd = perf_i915_open(I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_RENDER, 0));
> +       igt_assert(fd >= 0);
> +
> +       igt_nsec_elapsed(&start);
> +
> +       /*
> +        * Toggle online status of all the CPUs in a child process and ensure
> +        * this has not affected busyness stats in the parent.
> +        */
> +       igt_fork(child, 1) {
> +               int cpu = 0;
> +
> +               for (;;) {
> +                       char name[128];
> +                       int cpufd;
> +
> +                       sprintf(name, "/sys/devices/system/cpu/cpu%d/online",
> +                               cpu);
> +                       cpufd = open(name, O_WRONLY);
> +                       if (cpufd == -1) {
> +                               igt_assert(cpu > 0);
> +                               break;
> +                       }
> +                       igt_assert_eq(write(cpufd, "0", 2), 2);
> +
> +                       usleep(1000 * 1000);
> +
> +                       igt_assert_eq(write(cpufd, "1", 2), 2);
> +
> +                       close(cpufd);
> +                       cpu++;
> +               }
> +       }
> +
> +       igt_waitchildren();
> +
> +       igt_spin_batch_end(spin);
> +       gem_sync(gem_fd, spin->handle);
> +
> +       ref = igt_nsec_elapsed(&start);
> +       val = pmu_read_single(fd);
> +
> +       assert_within_epsilon(val, ref, tolerance);
> +
> +       igt_spin_batch_free(gem_fd, spin);
> +       close(fd);
> +}
> +
> +static int chain_nop(int gem_fd, int in_fence, bool sync)
> +{
> +       struct drm_i915_gem_exec_object2 obj = {};
> +       struct drm_i915_gem_execbuffer2 eb =
> +               { .buffer_count = 1, .buffers_ptr = (uintptr_t)&obj};
> +       const uint32_t bbe = 0xa << 23;
> +
> +       obj.handle = gem_create(gem_fd, sizeof(bbe));
> +       gem_write(gem_fd, obj.handle, 0, &bbe, sizeof(bbe));
> +
> +       eb.flags = I915_EXEC_RENDER | I915_EXEC_FENCE_OUT;
> +
> +       if (in_fence >= 0) {
> +               eb.flags |= I915_EXEC_FENCE_IN;
> +               eb.rsvd2 = in_fence;
> +       }
> +
> +       gem_execbuf_wr(gem_fd, &eb);
> +
> +       if (sync)
> +               gem_sync(gem_fd, obj.handle);
> +
> +       gem_close(gem_fd, obj.handle);
> +       if (in_fence >= 0)
> +               close(in_fence);
> +
> +       return eb.rsvd2 >> 32;
> +}
> +
> +static void
> +test_interrupts(int gem_fd)
> +{
> +       uint64_t idle, busy, prev;
> +       int fd, fence = -1;
> +       const unsigned int count = 1000;
> +       unsigned int i;
> +
> +       fd = open_pmu(I915_PMU_INTERRUPTS);
> +
> +       gem_quiescent_gpu(gem_fd);
> +
> +       /* Wait for idle state. */
> +       prev = pmu_read_single(fd);
> +       idle = prev + 1;
> +       while (idle != prev) {
> +               usleep(batch_duration_ns / 1000);
> +               prev = idle;
> +               idle = pmu_read_single(fd);
> +       }
> +
> +       igt_assert_eq(idle - prev, 0);
> +
> +       /* Send some no-op batches with chained fences to ensure interrupts. */

Not all kernels would emit interrupts for this. Remember we used to spin
on the in-fence to avoid the interrupt setup overhead. And there's no
reason to assume we wouldn't again if there was a demonstrable
advantage.

You've assumed execlists elsewhere, so why not generate context-switch
interrupts instead? I presume you did try MI_USER_INTERRUPT from a user
batch? iirc it should still work.

> +       for (i = 1; i <= count; i++)
> +               fence = chain_nop(gem_fd, fence, i < count ? false : true);
> +       close(fence);
> +
> +       /* Check at least as many interrupts has been generated. */
> +       busy = pmu_read_single(fd);
> +       igt_assert(busy > count / 20);
> +
> +       close(fd);
> +}
> +
> +static void
> +test_frequency(int gem_fd)
> +{
> +       igt_spin_t *spin;
> +       uint64_t idle[2], busy[2];
> +       int fd;
> +
> +       fd = open_group(I915_PMU_REQUESTED_FREQUENCY, -1);
> +       open_group(I915_PMU_ACTUAL_FREQUENCY, fd);
> +
> +       gem_quiescent_gpu(gem_fd);
> +       usleep(batch_duration_ns / 1000);
> +
> +       pmu_read_multi(fd, 2, idle);
> +
> +       /*
> +        * Put some load on the render engine and check that the requenst
> +        * and actual frequencies have gone up.
> +        *
> +        * FIXME This might not be guaranteed to work on all platforms.
> +        * How to detect those?
> +        */

I don't think we can make any guarantees about freq, except the
softlimits imposed via sysfs.

So use that.

Create a spin batch, set min/max to low. Measure. Set min/max to high, sample
again. The results should follow the values set via sysfs.

> +       spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
> +       igt_spin_batch_set_timeout(spin, batch_duration_ns);
> +       gem_sync(gem_fd, spin->handle);
> +
> +       pmu_read_multi(fd, 2, busy);
> +
> +       igt_assert(busy[0] > idle[0]);
> +       igt_assert(busy[1] > idle[1]);
> +
> +       igt_spin_batch_free(gem_fd, spin);
> +       close(fd);
> +}
> +
> +static void
> +test_rc6(int gem_fd)
> +{
> +       int64_t duration_ns = 2 * 1000 * 1000 * 1000;
> +       uint64_t idle, busy, prev;
> +       int fd, fw;
> +
> +       fd = open_pmu(I915_PMU_RC6_RESIDENCY);
> +
> +       gem_quiescent_gpu(gem_fd);
> +
> +       /* Go idle and check full RC6. */
> +       prev = pmu_read_single(fd);
> +       guaranteed_usleep(duration_ns / 1000);
> +       idle = pmu_read_single(fd);
> +
> +       assert_within_epsilon(idle - prev, duration_ns, tolerance);
> +
> +       /* Wake up device and check no RC6. */
> +       fw = igt_open_forcewake_handle(gem_fd);
> +       igt_assert(fw >= 0);
> +
> +       prev = pmu_read_single(fd);
> +       guaranteed_usleep(duration_ns / 1000);
> +       busy = pmu_read_single(fd);
> +
> +       assert_within_epsilon(busy - prev, 0.0, tolerance);
> +
> +       close(fw);
> +       close(fd);

Ok (other than the guaranteed vs known sleep).

> +}
> +
> +static void
> +test_rc6p(int gem_fd)
> +{
> +       int64_t duration_ns = 2 * 1000 * 1000 * 1000;
> +       unsigned int num_pmu = 1;
> +       uint64_t idle[3], busy[3], prev[3];
> +       unsigned int i;
> +       int fd, ret, fw;
> +
> +       fd = open_group(I915_PMU_RC6_RESIDENCY, -1);
> +       ret = perf_i915_open_group(I915_PMU_RC6p_RESIDENCY, fd);
> +       if (ret > 0) {
> +               num_pmu++;
> +               ret = perf_i915_open_group(I915_PMU_RC6pp_RESIDENCY, fd);
> +               if (ret > 0)
> +                       num_pmu++;
> +       }
> +
> +       igt_require(num_pmu == 3);

Do we expose all RC6 counters, even on hw that doesn't support them all?
We pretty much only expect to have an rc6 counter (from snb+)

> +
> +       gem_quiescent_gpu(gem_fd);
> +
> +       /* Go idle and check full RC6. */
> +       pmu_read_multi(fd, num_pmu, prev);
> +       guaranteed_usleep(duration_ns / 1000);
> +       pmu_read_multi(fd, num_pmu, idle);
> +
> +       for (i = 0; i < num_pmu; i++)
> +               assert_within_epsilon(idle[i] - prev[i], duration_ns,
> +                                     tolerance);
> +
> +       /* Wake up device and check no RC6. */
> +       fw = igt_open_forcewake_handle(gem_fd);
> +       igt_assert(fw >= 0);
> +
> +       pmu_read_multi(fd, num_pmu, prev);
> +       guaranteed_usleep(duration_ns / 1000);
> +       pmu_read_multi(fd, num_pmu, busy);
> +
> +       for (i = 0; i < num_pmu; i++)
> +               assert_within_epsilon(busy[i] - prev[i], 0.0, tolerance);
> +
> +       close(fw);
> +       close(fd);
> +}
> +
> +igt_main
> +{
> +       const unsigned int num_other_metrics =
> +                               I915_PMU_LAST - __I915_PMU_OTHER(0) + 1;
> +       unsigned int num_engines = 0;
> +       int fd = -1;
> +       const struct intel_execution_engine2 *e;
> +       unsigned int i;
> +
> +       igt_fixture {
> +               fd = drm_open_driver_master(DRIVER_INTEL);

master? For what? I didn't see any display interactions, or secure
dispatch. Does this mean that we only have perf counters for the master?
Shouldn't we also be checking that we can capture rendernodes as well as
authed fd?
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH i-g-t 6/7] gem_wsim: Busy stats balancers
  2017-09-25 15:15 ` [PATCH i-g-t 6/7] gem_wsim: Busy stats balancers Tvrtko Ursulin
@ 2017-09-25 16:44   ` Chris Wilson
  2017-09-26 11:27     ` Tvrtko Ursulin
  0 siblings, 1 reply; 25+ messages in thread
From: Chris Wilson @ 2017-09-25 16:44 UTC (permalink / raw)
  To: Tvrtko Ursulin, Intel-gfx

Quoting Tvrtko Ursulin (2017-09-25 16:15:01)
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> Add busy and busy-avg balancers which make balancing
> decisions by looking at engine busyness via the i915 PMU.

"And thus are able to make decisions on the actual instantaneous load of
the system, and not use metrics that lag behind by a batch or two. In
doing so, each client should be able to greedily maximise their own
usage of the system, leading to improved load balancing even in the face
of other uncooperative clients. On the other hand, we are only using the
instantaneous load without coupling in the predictive factor for
dispatch and execution length."

Hmm, do you not want to sum busy + queued? Or at least compare! :)

> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>  benchmarks/gem_wsim.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 140 insertions(+)
> 
> diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c
> index 82fe6ba9ec5f..9ee91fabb220 100644
> --- a/benchmarks/gem_wsim.c
> +++ b/benchmarks/gem_wsim.c
> @@ -50,6 +50,7 @@
>  #include "intel_io.h"
>  #include "igt_aux.h"
>  #include "igt_rand.h"
> +#include "igt_perf.h"
>  #include "sw_sync.h"
>  
>  #include "ewma.h"
> @@ -188,6 +189,16 @@ struct workload
>                         uint32_t last[NUM_ENGINES];
>                 } rt;
>         };
> +
> +       struct busy_balancer {
> +               int fd;
> +               bool first;
> +               unsigned int num_engines;
> +               unsigned int engine_map[5];
> +               uint64_t t_prev;
> +               uint64_t prev[5];
> +               double busy[5];
> +       } busy_balancer;
>  };
>  
>  static const unsigned int nop_calibration_us = 1000;
> @@ -993,6 +1004,8 @@ struct workload_balancer {
>         unsigned int flags;
>         unsigned int min_gen;
>  
> +       int (*init)(const struct workload_balancer *balancer,
> +                   struct workload *wrk);
>         unsigned int (*get_qd)(const struct workload_balancer *balancer,
>                                struct workload *wrk,
>                                enum intel_engine_id engine);
> @@ -1242,6 +1255,106 @@ context_balance(const struct workload_balancer *balancer,
>         return get_vcs_engine(wrk->ctx_list[w->context].static_vcs);
>  }
>  
> +static unsigned int
> +get_engine_busy(const struct workload_balancer *balancer,
> +               struct workload *wrk, enum intel_engine_id engine)
> +{
> +       struct busy_balancer *bb = &wrk->busy_balancer;
> +
> +       if (engine == VCS2 && (wrk->flags & VCS2REMAP))
> +               engine = BCS;
> +
> +       return bb->busy[bb->engine_map[engine]];
> +}
> +
> +static void get_stats(const struct workload_balancer *b, struct workload *wrk)

s/get_stats/get_pmu_stats/ ?

> +{
> +       struct busy_balancer *bb = &wrk->busy_balancer;
> +       uint64_t val[7];
> +       unsigned int i;
> +
> +       igt_assert_eq(read(bb->fd, val, sizeof(val)), sizeof(val));
> +
> +       if (!bb->first) {
> +               for (i = 0; i < bb->num_engines; i++) {
> +                       double d;
> +
> +                       d = (val[2 + i] - bb->prev[i]) * 100;
> +                       d /= val[1] - bb->t_prev;
> +                       bb->busy[i] = d;
> +               }
> +       }
> +
> +       for (i = 0; i < bb->num_engines; i++)
> +               bb->prev[i] = val[2 + i];
> +
> +       bb->t_prev = val[1];
> +       bb->first = false;

Ok.

> +}
> +
> +static enum intel_engine_id
> +busy_avg_balance(const struct workload_balancer *balancer,
> +                struct workload *wrk, struct w_step *w)
> +{
> +       get_stats(balancer, wrk);
> +
> +       return qdavg_balance(balancer, wrk, w);
> +}
> +
> +static enum intel_engine_id
> +busy_balance(const struct workload_balancer *balancer,
> +            struct workload *wrk, struct w_step *w)
> +{
> +       get_stats(balancer, wrk);
> +
> +       return qd_balance(balancer, wrk, w);
> +}
> +
> +static int
> +busy_init(const struct workload_balancer *balancer, struct workload *wrk)
> +{
> +       struct busy_balancer *bb = &wrk->busy_balancer;
> +       struct engine_desc {
> +               unsigned class, inst;
> +               enum intel_engine_id id;
> +       } *d, engines[] = {
> +               { I915_ENGINE_CLASS_RENDER, 0, RCS },
> +               { I915_ENGINE_CLASS_COPY, 0, BCS },
> +               { I915_ENGINE_CLASS_VIDEO, 0, VCS1 },
> +               { I915_ENGINE_CLASS_VIDEO, 1, VCS2 },
> +               { I915_ENGINE_CLASS_VIDEO_ENHANCE, 0, VECS },
> +               { 0, 0, VCS }
> +       };
> +
> +       bb->num_engines = 0;
> +       bb->first = true;
> +       bb->fd = -1;
> +
> +       for (d = &engines[0]; d->id != VCS; d++) {
> +               int pfd;
> +
> +               pfd = perf_i915_open_group(I915_PMU_ENGINE_BUSY(d->class,
> +                                                               d->inst),
> +                                          bb->fd);
> +               if (pfd < 0) {
> +                       if (d->id != VCS2)
> +                               return -(10 + bb->num_engines);
> +                       else
> +                               continue;
> +               }
> +
> +               if (bb->num_engines == 0)
> +                       bb->fd = pfd;
> +
> +               bb->engine_map[d->id] = bb->num_engines++;
> +       }
> +
> +       if (bb->num_engines < 5 && !(wrk->flags & VCS2REMAP))
> +               return -1;

Hmm, feels a little sloppy. Would be more concrete if we have a list of
engines we're going to use, and then we either created a perf event for
each or died.

Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* ✓ Fi.CI.IGT: success for IGT PMU support (rev4)
  2017-09-25 15:14 [PATCH i-g-t v2 0/7] IGT PMU support Tvrtko Ursulin
                   ` (7 preceding siblings ...)
  2017-09-25 16:06 ` ✓ Fi.CI.BAT: success for IGT PMU support (rev4) Patchwork
@ 2017-09-25 22:16 ` Patchwork
  8 siblings, 0 replies; 25+ messages in thread
From: Patchwork @ 2017-09-25 22:16 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

== Series Details ==

Series: IGT PMU support (rev4)
URL   : https://patchwork.freedesktop.org/series/28253/
State : success

== Summary ==

Test gem_eio:
        Subgroup in-flight:
                pass       -> DMESG-WARN (shard-hsw) fdo#102886 +3
Test kms_busy:
        Subgroup extended-modeset-hang-oldfb-with-reset-render-B:
                pass       -> DMESG-WARN (shard-hsw) fdo#102249
Test prime_self_import:
        Subgroup reimport-vs-gem_close-race:
                pass       -> FAIL       (shard-hsw) fdo#102655
Test perf:
        Subgroup blocking:
                fail       -> PASS       (shard-hsw) fdo#102252
Test prime_mmap:
        Subgroup test_userptr:
                dmesg-warn -> PASS       (shard-hsw) fdo#102939

fdo#102886 https://bugs.freedesktop.org/show_bug.cgi?id=102886
fdo#102249 https://bugs.freedesktop.org/show_bug.cgi?id=102249
fdo#102655 https://bugs.freedesktop.org/show_bug.cgi?id=102655
fdo#102252 https://bugs.freedesktop.org/show_bug.cgi?id=102252
fdo#102939 https://bugs.freedesktop.org/show_bug.cgi?id=102939

shard-hsw        total:2520 pass:1325 dwarn:5   dfail:0   fail:16  skip:1174 time:9914s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_248/shards.html
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH i-g-t 1/7] intel-gpu-overlay: Move local perf implementation to a library
  2017-09-25 15:22   ` Chris Wilson
@ 2017-09-26 10:52     ` Tvrtko Ursulin
  2017-09-26 11:07       ` Chris Wilson
  0 siblings, 1 reply; 25+ messages in thread
From: Tvrtko Ursulin @ 2017-09-26 10:52 UTC (permalink / raw)
  To: Chris Wilson, Tvrtko Ursulin, Intel-gfx


On 25/09/2017 16:22, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2017-09-25 16:14:56)
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> ---
>>   lib/Makefile.sources             | 2 ++
>>   overlay/perf.c => lib/igt_perf.c | 2 +-
>>   overlay/perf.h => lib/igt_perf.h | 2 ++
>>   overlay/Makefile.am              | 6 ++----
>>   overlay/gem-interrupts.c         | 3 ++-
>>   overlay/gpu-freq.c               | 3 ++-
>>   overlay/gpu-perf.c               | 3 ++-
>>   overlay/gpu-top.c                | 3 ++-
>>   overlay/power.c                  | 3 ++-
>>   overlay/rc6.c                    | 3 ++-
>>   10 files changed, 19 insertions(+), 11 deletions(-)
>>   rename overlay/perf.c => lib/igt_perf.c (94%)
>>   rename overlay/perf.h => lib/igt_perf.h (99%)
>>
>> diff --git a/lib/Makefile.sources b/lib/Makefile.sources
>> index 53fdb54cbfa5..c031cb502469 100644
>> --- a/lib/Makefile.sources
>> +++ b/lib/Makefile.sources
>> @@ -16,6 +16,8 @@ lib_source_list =             \
>>          igt_gt.h                \
>>          igt_gvt.c               \
>>          igt_gvt.h               \
>> +       igt_perf.c              \
>> +       igt_perf.h              \
>>          igt_primes.c            \
>>          igt_primes.h            \
>>          igt_rand.c              \
>> diff --git a/overlay/perf.c b/lib/igt_perf.c
>> similarity index 94%
>> rename from overlay/perf.c
>> rename to lib/igt_perf.c
>> index b8fdc675c587..45cccff0ae53 100644
>> --- a/overlay/perf.c
>> +++ b/lib/igt_perf.c
>> @@ -3,7 +3,7 @@
>>   #include <unistd.h>
>>   #include <stdlib.h>
>>   
>> -#include "perf.h"
>> +#include "igt_perf.h"
>>   
>>   uint64_t i915_type_id(void)
>>   {
>> diff --git a/overlay/perf.h b/lib/igt_perf.h
>> similarity index 99%
>> rename from overlay/perf.h
>> rename to lib/igt_perf.h
>> index c44e65f9734c..a80b311cd1d1 100644
>> --- a/overlay/perf.h
>> +++ b/lib/igt_perf.h
>> @@ -1,6 +1,8 @@
>>   #ifndef I915_PERF_H
>>   #define I915_PERF_H
>>   
>> +#include <stdint.h>
>> +
>>   #include <linux/perf_event.h>
>>   
>>   #define I915_SAMPLE_BUSY       0
>> diff --git a/overlay/Makefile.am b/overlay/Makefile.am
>> index 5472514efc16..c66a80f4e571 100644
>> --- a/overlay/Makefile.am
>> +++ b/overlay/Makefile.am
>> @@ -4,8 +4,8 @@ endif
>>   
>>   AM_CPPFLAGS = -I.
>>   AM_CFLAGS = $(DRM_CFLAGS) $(PCIACCESS_CFLAGS) $(CWARNFLAGS) \
>> -       $(CAIRO_CFLAGS) $(OVERLAY_CFLAGS) $(WERROR_CFLAGS)
>> -LDADD = $(DRM_LIBS) $(PCIACCESS_LIBS) $(CAIRO_LIBS) $(OVERLAY_LIBS)
>> +       $(CAIRO_CFLAGS) $(OVERLAY_CFLAGS) $(WERROR_CFLAGS) -I$(srcdir)/../lib
>> +LDADD = $(DRM_LIBS) $(PCIACCESS_LIBS) $(CAIRO_LIBS) $(OVERLAY_LIBS) $(top_builddir)/lib/libintel_tools.la
> 
> It was a conscious decision that tools wouldn't pull in the test lib. We
> are nowhere close to having a stable library api, i.e. a long way from
> being installable. An issue if this is turned into a .so, which has
> happened in a nearby universe.

I don't exactly follow why we would care about stable library API since 
no outside projects will be linking to our libraries. But anyway, I'll 
try to make overlay not link with libintel_tools but just statically 
embed the perf file. Ok with that?

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH i-g-t 4/7] intel-gpu-overlay: Catch-up to new i915 PMU
  2017-09-25 15:31   ` Chris Wilson
@ 2017-09-26 10:56     ` Tvrtko Ursulin
  2017-09-26 11:13       ` Chris Wilson
  0 siblings, 1 reply; 25+ messages in thread
From: Tvrtko Ursulin @ 2017-09-26 10:56 UTC (permalink / raw)
  To: Chris Wilson, Tvrtko Ursulin, Intel-gfx


On 25/09/2017 16:31, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2017-09-25 16:14:59)
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> ---
>>   lib/igt_perf.h           | 93 ++++++++++++++++++++++++++++++++++--------------
>>   overlay/gem-interrupts.c |  2 +-
>>   overlay/gpu-freq.c       |  4 +--
>>   overlay/gpu-top.c        | 68 +++++++++++++++++++----------------
>>   overlay/power.c          |  4 +--
>>   overlay/rc6.c            |  6 ++--
>>   6 files changed, 111 insertions(+), 66 deletions(-)
>>
>> diff --git a/lib/igt_perf.h b/lib/igt_perf.h
>> index 8e674c3a3755..e29216f0500a 100644
>> --- a/lib/igt_perf.h
>> +++ b/lib/igt_perf.h
>> @@ -1,3 +1,27 @@
>> +/*
>> + * Copyright © 2017 Intel Corporation
>> + *
>> + * Permission is hereby granted, free of charge, to any person obtaining a
>> + * copy of this software and associated documentation files (the "Software"),
>> + * to deal in the Software without restriction, including without limitation
>> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
>> + * and/or sell copies of the Software, and to permit persons to whom the
>> + * Software is furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice (including the next
>> + * paragraph) shall be included in all copies or substantial portions of the
>> + * Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
>> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
>> + * IN THE SOFTWARE.
>> + *
>> + */
>> +
>>   #ifndef I915_PERF_H
>>   #define I915_PERF_H
>>   
>> @@ -5,41 +29,56 @@
>>   
>>   #include <linux/perf_event.h>
>>   
>> -#define I915_SAMPLE_BUSY       0
>> -#define I915_SAMPLE_WAIT       1
>> -#define I915_SAMPLE_SEMA       2
>> +enum drm_i915_gem_engine_class {
>> +       I915_ENGINE_CLASS_OTHER = 0,
>> +       I915_ENGINE_CLASS_RENDER = 1,
>> +       I915_ENGINE_CLASS_COPY = 2,
>> +       I915_ENGINE_CLASS_VIDEO = 3,
>> +       I915_ENGINE_CLASS_VIDEO_ENHANCE = 4,
>> +       I915_ENGINE_CLASS_MAX /* non-ABI */
>> +};
>> +
>> +enum drm_i915_pmu_engine_sample {
>> +       I915_SAMPLE_QUEUED = 0,
>> +       I915_SAMPLE_BUSY = 1,
>> +       I915_SAMPLE_WAIT = 2,
>> +       I915_SAMPLE_SEMA = 3,
>> +       I915_ENGINE_SAMPLE_MAX /* non-ABI */
>> +};
>> +
>> +#define I915_PMU_SAMPLE_BITS (4)
>> +#define I915_PMU_SAMPLE_MASK (0xf)
>> +#define I915_PMU_SAMPLE_INSTANCE_BITS (8)
>> +#define I915_PMU_CLASS_SHIFT \
>> +       (I915_PMU_SAMPLE_BITS + I915_PMU_SAMPLE_INSTANCE_BITS)
>>   
>> -#define I915_SAMPLE_RCS                0
>> -#define I915_SAMPLE_VCS                1
>> -#define I915_SAMPLE_BCS                2
>> -#define I915_SAMPLE_VECS       3
>> +#define __I915_PMU_ENGINE(class, instance, sample) \
>> +       ((class) << I915_PMU_CLASS_SHIFT | \
>> +       (instance) << I915_PMU_SAMPLE_BITS | \
>> +       (sample))
>>   
>> -#define __I915_PERF_COUNT(ring, id) ((ring) << 4 | (id))
>> +#define I915_PMU_ENGINE_QUEUED(class, instance) \
>> +       __I915_PMU_ENGINE(class, instance, I915_SAMPLE_QUEUED)
>>   
>> -#define I915_PERF_COUNT_RCS_BUSY __I915_PERF_COUNT(I915_SAMPLE_RCS, I915_SAMPLE_BUSY)
>> -#define I915_PERF_COUNT_RCS_WAIT __I915_PERF_COUNT(I915_SAMPLE_RCS, I915_SAMPLE_WAIT)
>> -#define I915_PERF_COUNT_RCS_SEMA __I915_PERF_COUNT(I915_SAMPLE_RCS, I915_SAMPLE_SEMA)
>> +#define I915_PMU_ENGINE_BUSY(class, instance) \
>> +       __I915_PMU_ENGINE(class, instance, I915_SAMPLE_BUSY)
>>   
>> -#define I915_PERF_COUNT_VCS_BUSY __I915_PERF_COUNT(I915_SAMPLE_VCS, I915_SAMPLE_BUSY)
>> -#define I915_PERF_COUNT_VCS_WAIT __I915_PERF_COUNT(I915_SAMPLE_VCS, I915_SAMPLE_WAIT)
>> -#define I915_PERF_COUNT_VCS_SEMA __I915_PERF_COUNT(I915_SAMPLE_VCS, I915_SAMPLE_SEMA)
>> +#define I915_PMU_ENGINE_WAIT(class, instance) \
>> +       __I915_PMU_ENGINE(class, instance, I915_SAMPLE_WAIT)
>>   
>> -#define I915_PERF_COUNT_BCS_BUSY __I915_PERF_COUNT(I915_SAMPLE_BCS, I915_SAMPLE_BUSY)
>> -#define I915_PERF_COUNT_BCS_WAIT __I915_PERF_COUNT(I915_SAMPLE_BCS, I915_SAMPLE_WAIT)
>> -#define I915_PERF_COUNT_BCS_SEMA __I915_PERF_COUNT(I915_SAMPLE_BCS, I915_SAMPLE_SEMA)
>> +#define I915_PMU_ENGINE_SEMA(class, instance) \
>> +       __I915_PMU_ENGINE(class, instance, I915_SAMPLE_SEMA)
>>   
>> -#define I915_PERF_COUNT_VECS_BUSY __I915_PERF_COUNT(I915_SAMPLE_VECS, I915_SAMPLE_BUSY)
>> -#define I915_PERF_COUNT_VECS_WAIT __I915_PERF_COUNT(I915_SAMPLE_VECS, I915_SAMPLE_WAIT)
>> -#define I915_PERF_COUNT_VECS_SEMA __I915_PERF_COUNT(I915_SAMPLE_VECS, I915_SAMPLE_SEMA)
>> +#define __I915_PMU_OTHER(x) (__I915_PMU_ENGINE(0xff, 0xff, 0xf) + 1 + (x))
>>   
>> -#define I915_PERF_ACTUAL_FREQUENCY 32
>> -#define I915_PERF_REQUESTED_FREQUENCY 33
>> -#define I915_PERF_ENERGY 34
>> -#define I915_PERF_INTERRUPTS 35
>> +#define I915_PMU_ACTUAL_FREQUENCY      __I915_PMU_OTHER(0)
>> +#define I915_PMU_REQUESTED_FREQUENCY   __I915_PMU_OTHER(1)
>> +#define I915_PMU_INTERRUPTS            __I915_PMU_OTHER(2)
>> +#define I915_PMU_RC6_RESIDENCY         __I915_PMU_OTHER(3)
>> +#define I915_PMU_RC6p_RESIDENCY                __I915_PMU_OTHER(4)
>> +#define I915_PMU_RC6pp_RESIDENCY       __I915_PMU_OTHER(5)
>>   
>> -#define I915_PERF_RC6_RESIDENCY                40
>> -#define I915_PERF_RC6p_RESIDENCY       41
>> -#define I915_PERF_RC6pp_RESIDENCY      42
>> +#define I915_PMU_LAST I915_PMU_RC6pp_RESIDENCY
>>   
>>   static inline int
>>   perf_event_open(struct perf_event_attr *attr,
>> diff --git a/overlay/gem-interrupts.c b/overlay/gem-interrupts.c
>> index 3eda24f4d7eb..add4a9dfd725 100644
>> --- a/overlay/gem-interrupts.c
>> +++ b/overlay/gem-interrupts.c
>> @@ -113,7 +113,7 @@ int gem_interrupts_init(struct gem_interrupts *irqs)
>>   {
>>          memset(irqs, 0, sizeof(*irqs));
>>   
>> -       irqs->fd = perf_i915_open(I915_PERF_INTERRUPTS);
>> +       irqs->fd = perf_i915_open(I915_PMU_INTERRUPTS);
>>          if (irqs->fd < 0 && interrupts_read() < 0)
>>                  irqs->error = ENODEV;
>>   
>> diff --git a/overlay/gpu-freq.c b/overlay/gpu-freq.c
>> index 76c5ed9acfd1..c4619b87242a 100644
>> --- a/overlay/gpu-freq.c
>> +++ b/overlay/gpu-freq.c
>> @@ -37,8 +37,8 @@ static int perf_open(void)
>>   {
>>          int fd;
>>   
>> -       fd = perf_i915_open_group(I915_PERF_ACTUAL_FREQUENCY, -1);
>> -       if (perf_i915_open_group(I915_PERF_REQUESTED_FREQUENCY, fd) < 0) {
>> +       fd = perf_i915_open_group(I915_PMU_ACTUAL_FREQUENCY, -1);
>> +       if (perf_i915_open_group(I915_PMU_REQUESTED_FREQUENCY, fd) < 0) {
>>                  close(fd);
>>                  fd = -1;
>>          }
>> diff --git a/overlay/gpu-top.c b/overlay/gpu-top.c
>> index 812f47d5aced..61b8f62fd78c 100644
>> --- a/overlay/gpu-top.c
>> +++ b/overlay/gpu-top.c
>> @@ -43,49 +43,57 @@
>>   #define   RING_WAIT            (1<<11)
>>   #define   RING_WAIT_SEMAPHORE  (1<<10)
>>   
>> -#define __I915_PERF_RING(n) (4*n)
>> -#define I915_PERF_RING_BUSY(n) (__I915_PERF_RING(n) + 0)
>> -#define I915_PERF_RING_WAIT(n) (__I915_PERF_RING(n) + 1)
>> -#define I915_PERF_RING_SEMA(n) (__I915_PERF_RING(n) + 2)
>> -
>>   static int perf_init(struct gpu_top *gt)
>>   {
>> -       const char *names[] = {
>> -               "RCS",
>> -               "BCS",
>> -               "VCS0",
>> -               "VCS1",
>> -               NULL,
>> +       struct engine_desc {
>> +               unsigned class, inst;
>> +               const char *name;
>> +       } *d, engines[] = {
>> +               { I915_ENGINE_CLASS_RENDER, 0, "rcs0" },
>> +               { I915_ENGINE_CLASS_COPY, 0, "bcs0" },
>> +               { I915_ENGINE_CLASS_VIDEO, 0, "vcs0" },
>> +               { I915_ENGINE_CLASS_VIDEO, 1, "vcs1" },
>> +               { I915_ENGINE_CLASS_VIDEO_ENHANCE, 0, "vecs0" },
> 
> Hmm, there is some hidden coupling with colours atm, but other than that
> the order is flexible, iirc.

What do you mean? First you thought there's some issue but there isn't 
after all?

Oh right, VECS wasn't on the list before.. it seems to work anyway. Just 
haven't tried with five engines.

> 
>> +               { 0, 0, NULL }
>>          };
>> -       int n;
>>   
>> -       gt->fd = perf_i915_open_group(I915_PERF_RING_BUSY(0), -1);
>> +       d = &engines[0];
>> +
>> +       gt->fd = perf_i915_open_group(I915_PMU_ENGINE_BUSY(d->class, d->inst),
>> +                                     -1);
>>          if (gt->fd < 0)
>>                  return -1;
>>   
>> -       if (perf_i915_open_group(I915_PERF_RING_WAIT(0), gt->fd) >= 0)
>> +       if (perf_i915_open_group(I915_PMU_ENGINE_WAIT(d->class, d->inst),
>> +                                gt->fd) >= 0)
>>                  gt->have_wait = 1;
>>   
>> -       if (perf_i915_open_group(I915_PERF_RING_SEMA(0), gt->fd) >= 0)
>> +       if (perf_i915_open_group(I915_PMU_ENGINE_SEMA(d->class, d->inst),
>> +                                gt->fd) >= 0)
>>                  gt->have_sema = 1;
>>   
>> -       gt->ring[0].name = names[0];
>> +       gt->ring[0].name = d->name;
>>          gt->num_rings = 1;
>>   
>> -       for (n = 1; names[n]; n++) {
>> -               if (perf_i915_open_group(I915_PERF_RING_BUSY(n), gt->fd) >= 0) {
>> -                       if (gt->have_wait &&
>> -                           perf_i915_open_group(I915_PERF_RING_WAIT(n),
>> -                                                gt->fd) < 0)
>> -                               return -1;
>> -
>> -                       if (gt->have_sema &&
>> -                           perf_i915_open_group(I915_PERF_RING_SEMA(n),
>> -                                                gt->fd) < 0)
>> -                               return -1;
>> -
>> -                       gt->ring[gt->num_rings++].name = names[n];
>> -               }
>> +       for (d++; d->name; d++) {
>> +               if (perf_i915_open_group(I915_PMU_ENGINE_BUSY(d->class,
>> +                                                             d->inst),
>> +                                       gt->fd) < 0)
>> +                       continue;
>> +
>> +               if (gt->have_wait &&
>> +                   perf_i915_open_group(I915_PMU_ENGINE_WAIT(d->class,
>> +                                                             d->inst),
>> +                                        gt->fd) < 0)
>> +                       return -1;
>> +
>> +               if (gt->have_sema &&
>> +                   perf_i915_open_group(I915_PMU_ENGINE_SEMA(d->class,
>> +                                                             d->inst),
>> +                                  gt->fd) < 0)
>> +                       return -1;
>> +
>> +               gt->ring[gt->num_rings++].name = d->name;
>>          }
>>   
>>          return 0;
>> diff --git a/overlay/power.c b/overlay/power.c
>> index dd4aec6bffd9..805f4ca7805c 100644
>> --- a/overlay/power.c
>> +++ b/overlay/power.c
>> @@ -45,9 +45,7 @@ int power_init(struct power *power)
>>   
>>          memset(power, 0, sizeof(*power));
>>   
>> -       power->fd = perf_i915_open(I915_PERF_ENERGY);
>> -       if (power->fd != -1)
>> -               return 0;
>> +       power->fd = -1;
> 
> Hmm, didn't you say that the rapl values were exposed via perf as well?

Yes, I planned to add this back afterwards but can have it as part of 
this series as well.

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH i-g-t 1/7] intel-gpu-overlay: Move local perf implementation to a library
  2017-09-26 10:52     ` Tvrtko Ursulin
@ 2017-09-26 11:07       ` Chris Wilson
  0 siblings, 0 replies; 25+ messages in thread
From: Chris Wilson @ 2017-09-26 11:07 UTC (permalink / raw)
  To: Tvrtko Ursulin, Tvrtko Ursulin, Intel-gfx

Quoting Tvrtko Ursulin (2017-09-26 11:52:28)
> 
> On 25/09/2017 16:22, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2017-09-25 16:14:56)
> >> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>
> >> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >> ---
> >>   lib/Makefile.sources             | 2 ++
> >>   overlay/perf.c => lib/igt_perf.c | 2 +-
> >>   overlay/perf.h => lib/igt_perf.h | 2 ++
> >>   overlay/Makefile.am              | 6 ++----
> >>   overlay/gem-interrupts.c         | 3 ++-
> >>   overlay/gpu-freq.c               | 3 ++-
> >>   overlay/gpu-perf.c               | 3 ++-
> >>   overlay/gpu-top.c                | 3 ++-
> >>   overlay/power.c                  | 3 ++-
> >>   overlay/rc6.c                    | 3 ++-
> >>   10 files changed, 19 insertions(+), 11 deletions(-)
> >>   rename overlay/perf.c => lib/igt_perf.c (94%)
> >>   rename overlay/perf.h => lib/igt_perf.h (99%)
> >>
> >> diff --git a/lib/Makefile.sources b/lib/Makefile.sources
> >> index 53fdb54cbfa5..c031cb502469 100644
> >> --- a/lib/Makefile.sources
> >> +++ b/lib/Makefile.sources
> >> @@ -16,6 +16,8 @@ lib_source_list =             \
> >>          igt_gt.h                \
> >>          igt_gvt.c               \
> >>          igt_gvt.h               \
> >> +       igt_perf.c              \
> >> +       igt_perf.h              \
> >>          igt_primes.c            \
> >>          igt_primes.h            \
> >>          igt_rand.c              \
> >> diff --git a/overlay/perf.c b/lib/igt_perf.c
> >> similarity index 94%
> >> rename from overlay/perf.c
> >> rename to lib/igt_perf.c
> >> index b8fdc675c587..45cccff0ae53 100644
> >> --- a/overlay/perf.c
> >> +++ b/lib/igt_perf.c
> >> @@ -3,7 +3,7 @@
> >>   #include <unistd.h>
> >>   #include <stdlib.h>
> >>   
> >> -#include "perf.h"
> >> +#include "igt_perf.h"
> >>   
> >>   uint64_t i915_type_id(void)
> >>   {
> >> diff --git a/overlay/perf.h b/lib/igt_perf.h
> >> similarity index 99%
> >> rename from overlay/perf.h
> >> rename to lib/igt_perf.h
> >> index c44e65f9734c..a80b311cd1d1 100644
> >> --- a/overlay/perf.h
> >> +++ b/lib/igt_perf.h
> >> @@ -1,6 +1,8 @@
> >>   #ifndef I915_PERF_H
> >>   #define I915_PERF_H
> >>   
> >> +#include <stdint.h>
> >> +
> >>   #include <linux/perf_event.h>
> >>   
> >>   #define I915_SAMPLE_BUSY       0
> >> diff --git a/overlay/Makefile.am b/overlay/Makefile.am
> >> index 5472514efc16..c66a80f4e571 100644
> >> --- a/overlay/Makefile.am
> >> +++ b/overlay/Makefile.am
> >> @@ -4,8 +4,8 @@ endif
> >>   
> >>   AM_CPPFLAGS = -I.
> >>   AM_CFLAGS = $(DRM_CFLAGS) $(PCIACCESS_CFLAGS) $(CWARNFLAGS) \
> >> -       $(CAIRO_CFLAGS) $(OVERLAY_CFLAGS) $(WERROR_CFLAGS)
> >> -LDADD = $(DRM_LIBS) $(PCIACCESS_LIBS) $(CAIRO_LIBS) $(OVERLAY_LIBS)
> >> +       $(CAIRO_CFLAGS) $(OVERLAY_CFLAGS) $(WERROR_CFLAGS) -I$(srcdir)/../lib
> >> +LDADD = $(DRM_LIBS) $(PCIACCESS_LIBS) $(CAIRO_LIBS) $(OVERLAY_LIBS) $(top_builddir)/lib/libintel_tools.la
> > 
> > It was a conscious decision that tools wouldn't pull in the test lib. We
> > are nowhere close to having a stable library api, i.e. a long way from
> > being installable. An issue if this is turned into a .so, which has
> > happened in a nearby universe.
> 
> I don't exactly follow why we would care about stable library API since 
> no outside projects will be linking to our libraries.

Hah. Just wait, if we install a .so, expect the complaints to start
rolling in. :-p

> But anyway, I'll 
> try to make overlay not link with libintel_tools but just statically 
> embed the perf file. Ok with that?

I don't mind if we do start using a .so, I just advise that if we do, we
don't just package up the current libigt.so as it is a mess of an API
(not a library, more a collection of hacks).

For this case, I would create a noinst_LTLIBRARY = libigt_perf.la
Might as well use automake/libtool for one good thing :)
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH i-g-t 4/7] intel-gpu-overlay: Catch-up to new i915 PMU
  2017-09-26 10:56     ` Tvrtko Ursulin
@ 2017-09-26 11:13       ` Chris Wilson
  0 siblings, 0 replies; 25+ messages in thread
From: Chris Wilson @ 2017-09-26 11:13 UTC (permalink / raw)
  To: Tvrtko Ursulin, Tvrtko Ursulin, Intel-gfx

Quoting Tvrtko Ursulin (2017-09-26 11:56:24)
> 
> On 25/09/2017 16:31, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2017-09-25 16:14:59)
> >> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>
> >> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >> ---
> >> diff --git a/overlay/gpu-top.c b/overlay/gpu-top.c
> >> index 812f47d5aced..61b8f62fd78c 100644
> >> --- a/overlay/gpu-top.c
> >> +++ b/overlay/gpu-top.c
> >> @@ -43,49 +43,57 @@
> >>   #define   RING_WAIT            (1<<11)
> >>   #define   RING_WAIT_SEMAPHORE  (1<<10)
> >>   
> >> -#define __I915_PERF_RING(n) (4*n)
> >> -#define I915_PERF_RING_BUSY(n) (__I915_PERF_RING(n) + 0)
> >> -#define I915_PERF_RING_WAIT(n) (__I915_PERF_RING(n) + 1)
> >> -#define I915_PERF_RING_SEMA(n) (__I915_PERF_RING(n) + 2)
> >> -
> >>   static int perf_init(struct gpu_top *gt)
> >>   {
> >> -       const char *names[] = {
> >> -               "RCS",
> >> -               "BCS",
> >> -               "VCS0",
> >> -               "VCS1",
> >> -               NULL,
> >> +       struct engine_desc {
> >> +               unsigned class, inst;
> >> +               const char *name;
> >> +       } *d, engines[] = {
> >> +               { I915_ENGINE_CLASS_RENDER, 0, "rcs0" },
> >> +               { I915_ENGINE_CLASS_COPY, 0, "bcs0" },
> >> +               { I915_ENGINE_CLASS_VIDEO, 0, "vcs0" },
> >> +               { I915_ENGINE_CLASS_VIDEO, 1, "vcs1" },
> >> +               { I915_ENGINE_CLASS_VIDEO_ENHANCE, 0, "vecs0" },
> > 
> > Hmm, there is some hidden coupling with colours atm, but other than that
> > the order is flexible, iirc.
> 
> What do you mean? First you thought there's some issue but there isn't 
> after all?

Just the way the colours get assigned later makes some assumptions that
no longer hold true. I was thinking it would make more sense to move the
colour field into the ring struct; even more sense to make it all
customisable. (In my defence, it was never meant to be perfect! Just a
snowball.)

> Oh right, VECS wasn't on the list before.. it seems to work anyway. Just 
> haven't tried with five engines.

It should just work, just don't show the result to a UI designer.

> >> diff --git a/overlay/power.c b/overlay/power.c
> >> index dd4aec6bffd9..805f4ca7805c 100644
> >> --- a/overlay/power.c
> >> +++ b/overlay/power.c
> >> @@ -45,9 +45,7 @@ int power_init(struct power *power)
> >>   
> >>          memset(power, 0, sizeof(*power));
> >>   
> >> -       power->fd = perf_i915_open(I915_PERF_ENERGY);
> >> -       if (power->fd != -1)
> >> -               return 0;
> >> +       power->fd = -1;
> > 
> > Hmm, didn't you say that the rapl values were exposed via perf as well?
> 
> Yes, I planned to add this back afterwards but can have it as part of 
> this series as well.

Ok. I was curious to see what it looked like; wondering if it would just
plug right in and if our usage of perf_event looked reasonably similar.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH i-g-t 5/7] tests/perf_pmu: Tests for i915 PMU API
  2017-09-25 16:21   ` Chris Wilson
@ 2017-09-26 11:19     ` Tvrtko Ursulin
  2017-09-26 11:42       ` Chris Wilson
  0 siblings, 1 reply; 25+ messages in thread
From: Tvrtko Ursulin @ 2017-09-26 11:19 UTC (permalink / raw)
  To: Chris Wilson, Tvrtko Ursulin, Intel-gfx


On 25/09/2017 17:21, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2017-09-25 16:15:00)
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> A bunch of tests for the new i915 PMU feature.
>>
>> Parts of the code were initialy sketched by Dmitry Rogozhkin.
>>
>> v2: (Most suggestions by Chris Wilson)
>>   * Add new class/instance based engine list.
>>   * Add gem_has_engine/gem_require_engine to work with class/instance.
>>   * Use the above two throughout the test.
>>   * Shorten tests to 100ms busy batches, seems enough.
>>   * Add queued counter sanity checks.
>>   * Use igt_nsec_elapsed.
>>   * Skip on perf -ENODEV in some tests instead of embedding knowledge locally.
>>   * Fix multi ordering for busy accounting.
>>   * Use new guranteed_usleep when sleep time is asserted on.
>>   * Check for no queued when idle/busy.
>>   * Add queued counter init test.
>>   * Add queued tests.
>>   * Consolidate and increase multiple busy engines tests to most-busy and
>>     all-busy tests.
>>   * Guarantte interrupts by using fences.
>>   * Test RC6 via forcewake.
>>
>> v3:
>>   * Tweak assert in interrupts subtest.
>>   * Sprinkle of comments.
>>   * Fix multi-client test which got broken in v2.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>> Cc: Dmitry Rogozhkin <dmitry.v.rogozhkin@intel.com>
>> ---
>>   lib/igt_gt.c           |  50 +++
>>   lib/igt_gt.h           |  38 +++
>>   lib/igt_perf.h         |   9 +-
>>   tests/Makefile.sources |   1 +
>>   tests/perf_pmu.c       | 869 +++++++++++++++++++++++++++++++++++++++++++++++++
>>   5 files changed, 959 insertions(+), 8 deletions(-)
>>   create mode 100644 tests/perf_pmu.c
>>
>> diff --git a/lib/igt_gt.c b/lib/igt_gt.c
>> index b3f3b3809eee..4c75811fb1b3 100644
>> --- a/lib/igt_gt.c
>> +++ b/lib/igt_gt.c
>> @@ -568,3 +568,53 @@ bool gem_can_store_dword(int fd, unsigned int engine)
>>   
>>          return true;
>>   }
>> +
> 
>> +const struct intel_execution_engine2 intel_execution_engines2[] = {
> 
> If you want to do a quick
> 	s/intel_execution_engine/intel_execution_legacy_ring/
> or wait until class/inst is the natural execbuf interface?

I think it would be premature. Class/instance userspace is nowhere in sight.

>> +const double tolerance = 0.02f;
>> +const unsigned long batch_duration_ns = 100 * 1000 * 1000;
>> +
>> +static int open_pmu(uint64_t config)
>> +{
>> +       int fd;
>> +
>> +       fd = perf_i915_open(config);
>> +       igt_require(fd >= 0 || (fd < 0 && errno != ENODEV));
>> +       igt_assert(fd >= 0);
> 
> So why are we not allowed to disable perf counters?

I suppose you are aiming here at the fact that errno will be different in
that case and we should skip as well. Another TODO to check which one exactly it is.

> 
>> +       return fd;
>> +}
>> +
>> +static int open_group(uint64_t config, int group)
>> +{
>> +       int fd;
>> +
>> +       fd = perf_i915_open_group(config, group);
>> +       igt_require(fd >= 0 || (fd < 0 && errno != ENODEV));
>> +       igt_assert(fd >= 0);
>> +
>> +       return fd;
>> +}
>> +
>> +static void
>> +init(int gem_fd, const struct intel_execution_engine2 *e, uint8_t sample)
>> +{
>> +       int fd;
>> +
>> +       fd = open_pmu(__I915_PMU_ENGINE(e->class, e->instance, sample));
>> +
>> +       close(fd);
>> +}
>> +
>> +static uint64_t pmu_read_single(int fd)
>> +{
>> +       uint64_t data[2];
>> +       ssize_t len;
>> +
>> +       len = read(fd, data, sizeof(data));
>> +       igt_assert_eq(len, sizeof(data));
> 
> read() isn't a complicated macro, so won't this have a reasonable
> string as igt_assert_eq(read(fd, data, sizeof(data)), sizeof(data)) ?

Okay. :)

> 
>> +       return data[0];
>> +}
>> +
>> +static void pmu_read_multi(int fd, unsigned int num, uint64_t *val)
>> +{
>> +       uint64_t buf[2 + num];
>> +       unsigned int i;
>> +       ssize_t len;
>> +
>> +       len = read(fd, buf, sizeof(buf));
>> +       igt_assert_eq(len, sizeof(buf));
>> +       for (i = 0; i < num; i++)
>> +               val[i] = buf[2 + i];
>> +}
>> +
>> +#define assert_within_epsilon(x, ref, tolerance) \
>> +       igt_assert_f((double)(x) <= (1.0 + tolerance) * (double)ref && \
>> +                    (double)(x) >= (1.0 - tolerance) * (double)ref, \
>> +                    "'%s' != '%s' (%f not within %f%% tolerance of %f)\n",\
>> +                    #x, #ref, (double)x, tolerance * 100.0, (double)ref)
>> +
>> +/*
>> + * Helper for cases where we assert on time spent sleeping (directly or
>> + * indirectly), so make it more robust by ensuring the system sleep time
>> + * is within test tolerance to start with.
>> + */
> 
> Looking at the callers, I don't see where we need a guaranteed sleep vs
> a known sleep. We are comparing elapsed/wall time with a perf counter, so
> we only care that they match. By asserting we just cause random fails if
> the system decides not to wake up for a 1s (outside the purvey of the
> test).

Okay so you suggest measured_usleep instead of guaranteed_usleep. And 
then the measured time used in the asserts. Makes sense, just please 
confirm that there hasn't been a misunderstanding.

> 
>> +static void guaranteed_usleep(unsigned int usec)
>> +{
>> +       uint64_t slept = 0, to_sleep = usec;
>> +
>> +       while (usec > 0) {
>> +               struct timespec start = { };
>> +               uint64_t this_sleep;
>> +
>> +               igt_nsec_elapsed(&start);
>> +               usleep(usec);
>> +               this_sleep = igt_nsec_elapsed(&start) / 1000;
>> +               slept += this_sleep;
>> +               if (this_sleep > usec)
>> +                       break;
>> +               usec -= this_sleep;
>> +       }
>> +
>> +       assert_within_epsilon(slept, to_sleep, tolerance);
>> +}
>> +
>> +static unsigned int e2ring(int gem_fd, const struct intel_execution_engine2 *e)
>> +{
>> +       return gem_class_instance_to_eb_flags(gem_fd, e->class, e->instance);
>> +}
>> +
>> +static void
>> +single(int gem_fd, const struct intel_execution_engine2 *e, uint8_t sample,
>> +       bool busy)
>> +{
>> +       double ref = busy && sample == I915_SAMPLE_BUSY ?
>> +                    batch_duration_ns : 0.0f;
>> +       igt_spin_t *spin;
>> +       uint64_t val;
>> +       int fd;
>> +
>> +       fd = open_pmu(__I915_PMU_ENGINE(e->class, e->instance, sample));
>> +
>> +       if (busy) {
>> +               spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
>> +               igt_spin_batch_set_timeout(spin, batch_duration_ns);
>> +       } else {
>> +               guaranteed_usleep(batch_duration_ns / 1000);
>> +       }
>> +
>> +       if (busy)
>> +               gem_sync(gem_fd, spin->handle);
>> +
>> +       val = pmu_read_single(fd);
>> +
>> +       assert_within_epsilon(val, ref, tolerance);
>> +
>> +       if (busy)
>> +               igt_spin_batch_free(gem_fd, spin);
>> +       close(fd);
>> +}
>> +
>> +static void
>> +queued(int gem_fd, const struct intel_execution_engine2 *e)
>> +{
>> +       igt_spin_t *spin[2];
>> +       uint64_t val;
>> +       int fd;
>> +
>> +       fd = open_pmu(I915_PMU_ENGINE_QUEUED(e->class, e->instance));
>> +
>> +       /*
>> +        * First spin batch will be immediately executing.
>> +        */
>> +       spin[0] = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
>> +       igt_spin_batch_set_timeout(spin[0], batch_duration_ns);
>> +
>> +       /*
>> +        * Second spin batch will sit in the execution queue behind the
>> +        * first one so must cause the PMU to correctly report the queued
>> +        * counter.
>> +        */
> 
> Hmm, the second to the same ctx should be executed. We will see that
> port[1] is empty and so ask if we can fit the batch into port[1]. In
> fact, it turns out that we can coalesce the batch with port[0].
> Once coalesced, we can't distinguish the two requests since they are a
> single context in the ELSP.
> 
> All this is presuming execlists...
> 
> iirc execbuf-ioctl() -> [untracked]
>    \- submit_request -> QUEUED
>        \- execlists_dequeue -> BUSY

Lets discuss this one in the i915 patch thread. It is about defining 
semantics of the queued counter.

> 
> 
>> +       spin[1] = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
>> +       igt_spin_batch_set_timeout(spin[1], batch_duration_ns);
>> +
>> +       gem_sync(gem_fd, spin[0]->handle);
>> +
>> +       val = pmu_read_single(fd);
>> +       assert_within_epsilon(val, batch_duration_ns, tolerance);
>> +
>> +       gem_sync(gem_fd, spin[1]->handle);
>> +
>> +       igt_spin_batch_free(gem_fd, spin[0]);
>> +       igt_spin_batch_free(gem_fd, spin[1]);
>> +       close(fd);
>> +}
>> +
>> +static void
>> +busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
>> +              const unsigned int num_engines)
>> +{
>> +       const struct intel_execution_engine2 *e_;
>> +       uint64_t val[num_engines];
>> +       int fd[num_engines];
>> +       igt_spin_t *spin;
>> +       unsigned int busy_idx, i;
>> +
>> +       i = 0;
>> +       fd[0] = -1;
>> +       for_each_engine_class_instance(fd, e_) {
>> +               if (!gem_has_engine(gem_fd, e_->class, e_->instance))
>> +                       continue;
>> +               else if (e == e_)
>> +                       busy_idx = i;
>> +
>> +               fd[i++] = open_group(I915_PMU_ENGINE_BUSY(e_->class,
>> +                                                         e_->instance),
>> +                                    fd[0]);
>> +       }
>> +
>> +       spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
>> +       igt_spin_batch_set_timeout(spin, batch_duration_ns);
>> +
>> +       gem_sync(gem_fd, spin->handle);
>> +
>> +       pmu_read_multi(fd[0], num_engines, val);
>> +
>> +       assert_within_epsilon(val[busy_idx], batch_duration_ns, tolerance);
>> +       for (i = 0; i < num_engines; i++) {
> 
> double expected;
> 
> if (i == busy_idx)
> 	expected = batch_duration_ns;
> else
> 	expected = 0;
> assert_within_epsilon(val[i], expected, tolerance);

I don't see any advantage but can change it sure.

> 
> Probably worth a preceding
> 
> for (i = 0; i < num_engines; i++)
> 	len += sprintf(buf + len, "%s%s=%f, ", i == busy_idx ? "*" : "", name[i], val[i]);
> buf[len-2] = '\0;'
> igt_info("%s\n", buf);

Can do.

> 
> How about batches executing from a different context, or submitted from
> a different fd?

I can't see that adding much value? Am I thinking too much white box? 
The perf API has nothing to do with the drm fd so I just see it not 
relevant.

>> +               if (i == busy_idx)
>> +                       continue;
>> +               assert_within_epsilon(val[i], 0.0f, tolerance);
>> +       }
>> +
>> +       igt_spin_batch_free(gem_fd, spin);
>> +       close(fd[0]);
>> +}
>> +
>> +static void
>> +most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
>> +                   const unsigned int num_engines)
>> +{
>> +       const struct intel_execution_engine2 *e_;
>> +       uint64_t val[num_engines];
>> +       int fd[num_engines];
>> +       igt_spin_t *spin[num_engines];
>> +       unsigned int idle_idx, i;
>> +
>> +       gem_require_engine(gem_fd, e->class, e->instance);
>> +
>> +       i = 0;
>> +       fd[0] = -1;
>> +       for_each_engine_class_instance(fd, e_) {
>> +               if (!gem_has_engine(gem_fd, e_->class, e_->instance))
>> +                       continue;
>> +
>> +               fd[i] = open_group(I915_PMU_ENGINE_BUSY(e_->class,
>> +                                                       e_->instance),
>> +                                  fd[0]);
>> +
>> +               if (e == e_) {
>> +                       idle_idx = i;
>> +               } else {
>> +                       spin[i] = igt_spin_batch_new(gem_fd, 0,
>> +                                                    e2ring(gem_fd, e_), 0);
>> +                       igt_spin_batch_set_timeout(spin[i], batch_duration_ns);
>> +               }
>> +
>> +               i++;
>> +       }
>> +
>> +       for (i = 0; i < num_engines; i++) {
>> +               if (i != idle_idx)
>> +                       gem_sync(gem_fd, spin[i]->handle);
>> +       }
>> +
>> +       pmu_read_multi(fd[0], num_engines, val);
>> +
>> +       for (i = 0; i < num_engines; i++) {
>> +               if (i == idle_idx)
>> +                       assert_within_epsilon(val[i], 0.0f, tolerance);
>> +               else
>> +                       assert_within_epsilon(val[i], batch_duration_ns,
>> +                                             tolerance);
>> +       }
>> +
>> +       for (i = 0; i < num_engines; i++) {
>> +               if (i != idle_idx)
>> +                       igt_spin_batch_free(gem_fd, spin[i]);
>> +       }
>> +       close(fd[0]);
>> +}
>> +
>> +static void
>> +all_busy_check_all(int gem_fd, const unsigned int num_engines)
>> +{
>> +       const struct intel_execution_engine2 *e;
>> +       uint64_t val[num_engines];
>> +       int fd[num_engines];
>> +       igt_spin_t *spin[num_engines];
>> +       unsigned int i;
>> +
>> +       i = 0;
>> +       fd[0] = -1;
>> +       for_each_engine_class_instance(fd, e) {
>> +               if (!gem_has_engine(gem_fd, e->class, e->instance))
>> +                       continue;
>> +
>> +               fd[i] = open_group(I915_PMU_ENGINE_BUSY(e->class, e->instance),
>> +                                  fd[0]);
>> +
>> +               spin[i] = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
>> +               igt_spin_batch_set_timeout(spin[i], batch_duration_ns);
>> +
>> +               i++;
>> +       }
>> +
>> +       for (i = 0; i < num_engines; i++)
>> +               gem_sync(gem_fd, spin[i]->handle);
>> +
>> +       pmu_read_multi(fd[0], num_engines, val);
>> +
>> +       for (i = 0; i < num_engines; i++)
>> +               assert_within_epsilon(val[i], batch_duration_ns, tolerance);
>> +
>> +       for (i = 0; i < num_engines; i++)
>> +               igt_spin_batch_free(gem_fd, spin[i]);
>> +       close(fd[0]);
>> +}
>> +
>> +static void
>> +no_sema(int gem_fd, const struct intel_execution_engine2 *e, bool busy)
>> +{
>> +       igt_spin_t *spin;
>> +       uint64_t val[2];
>> +       int fd;
>> +
>> +       fd = open_group(I915_PMU_ENGINE_SEMA(e->class, e->instance), -1);
>> +       open_group(I915_PMU_ENGINE_WAIT(e->class, e->instance), fd);
>> +
>> +       if (busy) {
>> +               spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
>> +               igt_spin_batch_set_timeout(spin, batch_duration_ns);
> 
> These busy paths are not serialised to the batch, it is quite possible
> for the pmu_read to happen before the spin is even scheduled for
> execution on the hw.

Mr Refactoring stole my gem_sync call. :)

> 
>> +       } else {
>> +               usleep(batch_duration_ns / 1000);
>> +       }
>> +
>> +       pmu_read_multi(fd, 2, val);
>> +
>> +       assert_within_epsilon(val[0], 0.0f, tolerance);
>> +       assert_within_epsilon(val[1], 0.0f, tolerance);
>> +
>> +       if (busy)
>> +               igt_spin_batch_free(gem_fd, spin);
>> +       close(fd);
> 
> I'm still at a loss as to why this test is interesting. You want to say
> that given an idle engine with a single batch there is no time spent
> waiting for an MI event, nor waiting on a semaphore. Where is that
> guaranteed? What happens if we do need to do such a pre-batch + sync in
> the future. If you wanted to say that whilst the batch is executing that
> no time is spent processing requests the batch isn't making, then maybe.
> But without a test to demonstrate the opposite, we still have no idea if
> the counters work.

Yeah, I am still at a stage of punting off the semaphore work to after 
the rest is polished. Need to nose around some tests and see if I can 
lift some code involving semaphores.

> 
>> +}
>> +
>> +static void
>> +multi_client(int gem_fd, const struct intel_execution_engine2 *e)
>> +{
>> +       uint64_t config = I915_PMU_ENGINE_BUSY(e->class, e->instance);
>> +       igt_spin_t *spin;
>> +       uint64_t val[2];
>> +       int fd[2];
>> +
>> +       fd[0] = open_pmu(config);
>> +
>> +       spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
>> +       igt_spin_batch_set_timeout(spin, batch_duration_ns);
>> +
>> +       guaranteed_usleep(batch_duration_ns / 3000);
>> +
>> +       /*
>> +        * Second PMU client which is initialized after the first one,
>> +        * and exists before it, should not affect accounting as reported
>> +        * in the first client.
>> +        */
>> +       fd[1] = open_pmu(config);
>> +       guaranteed_usleep(batch_duration_ns / 3000);
>> +       val[1] = pmu_read_single(fd[1]);
>> +       close(fd[1]);
>> +
>> +       gem_sync(gem_fd, spin->handle);
>> +
>> +       val[0] = pmu_read_single(fd[0]);
>> +
>> +       assert_within_epsilon(val[0], batch_duration_ns, tolerance);
>> +       assert_within_epsilon(val[1], batch_duration_ns / 3, tolerance);
>> +
>> +       igt_spin_batch_free(gem_fd, spin);
>> +       close(fd[0]);
>> +}
>> +
>> +/**
>> + * Tests that i915 PMU corectly errors out in invalid initialization.
>> + * i915 PMU is uncore PMU, thus:
>> + *  - sampling period is not supported
>> + *  - pid > 0 is not supported since we can't count per-process (we count
>> + *    per whole system)
>> + *  - cpu != 0 is not supported since i915 PMU exposes cpumask for CPU0
>> + */
>> +static void invalid_init(void)
>> +{
>> +       struct perf_event_attr attr;
>> +       int pid, cpu;
>> +
>> +#define ATTR_INIT() \
>> +do { \
>> +       memset(&attr, 0, sizeof (attr)); \
>> +       attr.config = I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_RENDER, 0); \
>> +       attr.type = i915_type_id(); \
>> +       igt_assert(attr.type != 0); \
>> +} while(0)
>> +
>> +       ATTR_INIT();
>> +       attr.sample_period = 100;
>> +       pid = -1;
>> +       cpu = 0;
>> +       igt_assert_eq(perf_event_open(&attr, pid, cpu, -1, 0), -1);
>> +       igt_assert_eq(errno, EINVAL);
>> +
>> +       ATTR_INIT();
>> +       pid = 0;
>> +       cpu = 0;
>> +       igt_assert_eq(perf_event_open(&attr, pid, cpu, -1, 0), -1);
>> +       igt_assert_eq(errno, EINVAL);
>> +
>> +       ATTR_INIT();
>> +       pid = -1;
>> +       cpu = 1;
>> +       igt_assert_eq(perf_event_open(&attr, pid, cpu, -1, 0), -1);
>> +       igt_assert_eq(errno, ENODEV);
>> +}
>> +
>> +static void init_other(unsigned int i, bool valid)
>> +{
>> +       int fd;
>> +
>> +       fd = perf_i915_open(__I915_PMU_OTHER(i));
>> +       igt_require(!(fd < 0 && errno == ENODEV));
> 
> Make perf_i915_open return -errno. It simplifies checks so much.

Hm ok.

> 
>> +       if (valid) {
>> +               igt_assert(fd >= 0);
>> +       } else {
>> +               igt_assert(fd < 0);
>> +               return;
>> +       }
>> +
>> +       close(fd);
>> +}
>> +
>> +static void read_other(unsigned int i, bool valid)
>> +{
>> +       int fd;
>> +
>> +       fd = perf_i915_open(__I915_PMU_OTHER(i));
>> +       igt_require(!(fd < 0 && errno == ENODEV));
>> +       if (valid) {
>> +               igt_assert(fd >= 0);
>> +       } else {
>> +               igt_assert(fd < 0);
>> +               return;
>> +       }
>> +
>> +       (void)pmu_read_single(fd);
>> +
>> +       close(fd);
>> +}
>> +
>> +static bool cpu0_hotplug_support(void)
>> +{
>> +       int fd = open("/sys/devices/system/cpu/cpu0/online", O_WRONLY);
> 
> access("..", W_OK);

Yep, thanks.

> 
>> +
>> +       close(fd);
>> +
>> +       return fd > 0;
>> +}
>> +
>> +static void cpu_hotplug(int gem_fd)
>> +{
>> +       struct timespec start = { };
>> +       igt_spin_t *spin;
>> +       uint64_t val, ref;
>> +       int fd;
>> +
>> +       igt_require(cpu0_hotplug_support());
>> +
>> +       spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
>> +       fd = perf_i915_open(I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_RENDER, 0));
>> +       igt_assert(fd >= 0);
>> +
>> +       igt_nsec_elapsed(&start);
>> +
>> +       /*
>> +        * Toggle online status of all the CPUs in a child process and ensure
>> +        * this has not affected busyness stats in the parent.
>> +        */
>> +       igt_fork(child, 1) {
>> +               int cpu = 0;
>> +
>> +               for (;;) {
>> +                       char name[128];
>> +                       int cpufd;
>> +
>> +                       sprintf(name, "/sys/devices/system/cpu/cpu%d/online",
>> +                               cpu);
>> +                       cpufd = open(name, O_WRONLY);
>> +                       if (cpufd == -1) {
>> +                               igt_assert(cpu > 0);
>> +                               break;
>> +                       }
>> +                       igt_assert_eq(write(cpufd, "0", 2), 2);
>> +
>> +                       usleep(1000 * 1000);
>> +
>> +                       igt_assert_eq(write(cpufd, "1", 2), 2);
>> +
>> +                       close(cpufd);
>> +                       cpu++;
>> +               }
>> +       }
>> +
>> +       igt_waitchildren();
>> +
>> +       igt_spin_batch_end(spin);
>> +       gem_sync(gem_fd, spin->handle);
>> +
>> +       ref = igt_nsec_elapsed(&start);
>> +       val = pmu_read_single(fd);
>> +
>> +       assert_within_epsilon(val, ref, tolerance);
>> +
>> +       igt_spin_batch_free(gem_fd, spin);
>> +       close(fd);
>> +}
>> +
>> +static int chain_nop(int gem_fd, int in_fence, bool sync)
>> +{
>> +       struct drm_i915_gem_exec_object2 obj = {};
>> +       struct drm_i915_gem_execbuffer2 eb =
>> +               { .buffer_count = 1, .buffers_ptr = (uintptr_t)&obj};
>> +       const uint32_t bbe = 0xa << 23;
>> +
>> +       obj.handle = gem_create(gem_fd, sizeof(bbe));
>> +       gem_write(gem_fd, obj.handle, 0, &bbe, sizeof(bbe));
>> +
>> +       eb.flags = I915_EXEC_RENDER | I915_EXEC_FENCE_OUT;
>> +
>> +       if (in_fence >= 0) {
>> +               eb.flags |= I915_EXEC_FENCE_IN;
>> +               eb.rsvd2 = in_fence;
>> +       }
>> +
>> +       gem_execbuf_wr(gem_fd, &eb);
>> +
>> +       if (sync)
>> +               gem_sync(gem_fd, obj.handle);
>> +
>> +       gem_close(gem_fd, obj.handle);
>> +       if (in_fence >= 0)
>> +               close(in_fence);
>> +
>> +       return eb.rsvd2 >> 32;
>> +}
>> +
>> +static void
>> +test_interrupts(int gem_fd)
>> +{
>> +       uint64_t idle, busy, prev;
>> +       int fd, fence = -1;
>> +       const unsigned int count = 1000;
>> +       unsigned int i;
>> +
>> +       fd = open_pmu(I915_PMU_INTERRUPTS);
>> +
>> +       gem_quiescent_gpu(gem_fd);
>> +
>> +       /* Wait for idle state. */
>> +       prev = pmu_read_single(fd);
>> +       idle = prev + 1;
>> +       while (idle != prev) {
>> +               usleep(batch_duration_ns / 1000);
>> +               prev = idle;
>> +               idle = pmu_read_single(fd);
>> +       }
>> +
>> +       igt_assert_eq(idle - prev, 0);
>> +
>> +       /* Send some no-op batches with chained fences to ensure interrupts. */
> 
> Not all kernels would emit interrupts for this. Remember we used to spin
> on the in-fence to avoid the interrupt setup overhead. And there's no
> reason to assume we wouldn't again if there was a demonstrable
> advantage.
> 
> You've assumed execlists elsewhere, so why not generate context-switch
> interrupts instead? I presume you did try MI_USER_INTERRUPT from a user
> batch? iirc it should still work.

Even if I emit it myself who guarantees i915 is listening to them? So I 
thought it is easier to have i915 emit them for me, and by using fences 
ensuring it is listening.

Could move towards longer batches to defeat any future busyspinning we 
might add?

> 
>> +       for (i = 1; i <= count; i++)
>> +               fence = chain_nop(gem_fd, fence, i < count ? false : true);
>> +       close(fence);
>> +
>> +       /* Check at least as many interrupts has been generated. */
>> +       busy = pmu_read_single(fd);
>> +       igt_assert(busy > count / 20);
>> +
>> +       close(fd);
>> +}
>> +
>> +static void
>> +test_frequency(int gem_fd)
>> +{
>> +       igt_spin_t *spin;
>> +       uint64_t idle[2], busy[2];
>> +       int fd;
>> +
>> +       fd = open_group(I915_PMU_REQUESTED_FREQUENCY, -1);
>> +       open_group(I915_PMU_ACTUAL_FREQUENCY, fd);
>> +
>> +       gem_quiescent_gpu(gem_fd);
>> +       usleep(batch_duration_ns / 1000);
>> +
>> +       pmu_read_multi(fd, 2, idle);
>> +
>> +       /*
>> +        * Put some load on the render engine and check that the requenst
>> +        * and actual frequencies have gone up.
>> +        *
>> +        * FIXME This might not be guaranteed to work on all platforms.
>> +        * How to detect those?
>> +        */
> 
> I don't think we can make any guarantees about freq, except the
> softlimits imposed via sysfs.
> 
> So use that.
> 
> Create a spin batch, set min/max to low. Measure. Set min/max to high, sample
> again. The results should follow the values set via sysfs.

Aha, thanks for the idea!

> 
>> +       spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
>> +       igt_spin_batch_set_timeout(spin, batch_duration_ns);
>> +       gem_sync(gem_fd, spin->handle);
>> +
>> +       pmu_read_multi(fd, 2, busy);
>> +
>> +       igt_assert(busy[0] > idle[0]);
>> +       igt_assert(busy[1] > idle[1]);
>> +
>> +       igt_spin_batch_free(gem_fd, spin);
>> +       close(fd);
>> +}
>> +
>> +static void
>> +test_rc6(int gem_fd)
>> +{
>> +       int64_t duration_ns = 2 * 1000 * 1000 * 1000;
>> +       uint64_t idle, busy, prev;
>> +       int fd, fw;
>> +
>> +       fd = open_pmu(I915_PMU_RC6_RESIDENCY);
>> +
>> +       gem_quiescent_gpu(gem_fd);
>> +
>> +       /* Go idle and check full RC6. */
>> +       prev = pmu_read_single(fd);
>> +       guaranteed_usleep(duration_ns / 1000);
>> +       idle = pmu_read_single(fd);
>> +
>> +       assert_within_epsilon(idle - prev, duration_ns, tolerance);
>> +
>> +       /* Wake up device and check no RC6. */
>> +       fw = igt_open_forcewake_handle(gem_fd);
>> +       igt_assert(fw >= 0);
>> +
>> +       prev = pmu_read_single(fd);
>> +       guaranteed_usleep(duration_ns / 1000);
>> +       busy = pmu_read_single(fd);
>> +
>> +       assert_within_epsilon(busy - prev, 0.0, tolerance);
>> +
>> +       close(fw);
>> +       close(fd);
> 
> Ok (other than the guaranteed vs known sleep).
> 
>> +}
>> +
>> +static void
>> +test_rc6p(int gem_fd)
>> +{
>> +       int64_t duration_ns = 2 * 1000 * 1000 * 1000;
>> +       unsigned int num_pmu = 1;
>> +       uint64_t idle[3], busy[3], prev[3];
>> +       unsigned int i;
>> +       int fd, ret, fw;
>> +
>> +       fd = open_group(I915_PMU_RC6_RESIDENCY, -1);
>> +       ret = perf_i915_open_group(I915_PMU_RC6p_RESIDENCY, fd);
>> +       if (ret > 0) {
>> +               num_pmu++;
>> +               ret = perf_i915_open_group(I915_PMU_RC6pp_RESIDENCY, fd);
>> +               if (ret > 0)
>> +                       num_pmu++;
>> +       }
>> +
>> +       igt_require(num_pmu == 3);
> 
> Do we expose all RC6 counters, even on hw that doesn't support them all?
> We pretty much only expect to have an rc6 counter (from snb+)

At the moment yes. They just report -ENODEV on unsupported platforms. 
Another mental TODO I have is to see how to register counters 
dynamically. But even that would only help with users who bother doing 
the sysfs discovery.

> 
>> +
>> +       gem_quiescent_gpu(gem_fd);
>> +
>> +       /* Go idle and check full RC6. */
>> +       pmu_read_multi(fd, num_pmu, prev);
>> +       guaranteed_usleep(duration_ns / 1000);
>> +       pmu_read_multi(fd, num_pmu, idle);
>> +
>> +       for (i = 0; i < num_pmu; i++)
>> +               assert_within_epsilon(idle[i] - prev[i], duration_ns,
>> +                                     tolerance);
>> +
>> +       /* Wake up device and check no RC6. */
>> +       fw = igt_open_forcewake_handle(gem_fd);
>> +       igt_assert(fw >= 0);
>> +
>> +       pmu_read_multi(fd, num_pmu, prev);
>> +       guaranteed_usleep(duration_ns / 1000);
>> +       pmu_read_multi(fd, num_pmu, busy);
>> +
>> +       for (i = 0; i < num_pmu; i++)
>> +               assert_within_epsilon(busy[i] - prev[i], 0.0, tolerance);
>> +
>> +       close(fw);
>> +       close(fd);
>> +}
>> +
>> +igt_main
>> +{
>> +       const unsigned int num_other_metrics =
>> +                               I915_PMU_LAST - __I915_PMU_OTHER(0) + 1;
>> +       unsigned int num_engines = 0;
>> +       int fd = -1;
>> +       const struct intel_execution_engine2 *e;
>> +       unsigned int i;
>> +
>> +       igt_fixture {
>> +               fd = drm_open_driver_master(DRIVER_INTEL);
> 
> master? For what? I didn't see any display interactions, or secure
> dispatch. Does this mean that we only have perf counters for the master?

To ensure no one else is fiddling with the GPU while the test is running 
since we want complete control here.

> Shouldn't we also be checking that we can capture rendernodes as well as
> authed fd?

Again, due to the drm fd and perf having no direct connection I don't see 
this as interesting.

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH i-g-t 6/7] gem_wsim: Busy stats balancers
  2017-09-25 16:44   ` Chris Wilson
@ 2017-09-26 11:27     ` Tvrtko Ursulin
  2017-09-26 11:48       ` Chris Wilson
  0 siblings, 1 reply; 25+ messages in thread
From: Tvrtko Ursulin @ 2017-09-26 11:27 UTC (permalink / raw)
  To: Chris Wilson, Tvrtko Ursulin, Intel-gfx


On 25/09/2017 17:44, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2017-09-25 16:15:01)
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> Add busy and busy-avg balancers which make balancing
>> decisions by looking at engine busyness via the i915 PMU.
> 
> "And thus are able to make decisions on the actual instantaneous load of
> the system, and not use metrics that lag behind by a batch or two. In
> doing so, each client should be able to greedily maximise their own
> usage of the system, leading to improved load balancing even in the face
> of other uncooperative clients. On the other hand, we are only using the
> instantaneous load without coupling in the predictive factor for
> dispatch and execution length."

Ok, thanks for the text.

> Hmm, do you not want to sum busy + queued? Or at least compare! :)

How to add apples and oranges? :) Queued * busy [0.0 - 1.0] ?

>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> ---
>>   benchmarks/gem_wsim.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 140 insertions(+)
>>
>> diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c
>> index 82fe6ba9ec5f..9ee91fabb220 100644
>> --- a/benchmarks/gem_wsim.c
>> +++ b/benchmarks/gem_wsim.c
>> @@ -50,6 +50,7 @@
>>   #include "intel_io.h"
>>   #include "igt_aux.h"
>>   #include "igt_rand.h"
>> +#include "igt_perf.h"
>>   #include "sw_sync.h"
>>   
>>   #include "ewma.h"
>> @@ -188,6 +189,16 @@ struct workload
>>                          uint32_t last[NUM_ENGINES];
>>                  } rt;
>>          };
>> +
>> +       struct busy_balancer {
>> +               int fd;
>> +               bool first;
>> +               unsigned int num_engines;
>> +               unsigned int engine_map[5];
>> +               uint64_t t_prev;
>> +               uint64_t prev[5];
>> +               double busy[5];
>> +       } busy_balancer;
>>   };
>>   
>>   static const unsigned int nop_calibration_us = 1000;
>> @@ -993,6 +1004,8 @@ struct workload_balancer {
>>          unsigned int flags;
>>          unsigned int min_gen;
>>   
>> +       int (*init)(const struct workload_balancer *balancer,
>> +                   struct workload *wrk);
>>          unsigned int (*get_qd)(const struct workload_balancer *balancer,
>>                                 struct workload *wrk,
>>                                 enum intel_engine_id engine);
>> @@ -1242,6 +1255,106 @@ context_balance(const struct workload_balancer *balancer,
>>          return get_vcs_engine(wrk->ctx_list[w->context].static_vcs);
>>   }
>>   
>> +static unsigned int
>> +get_engine_busy(const struct workload_balancer *balancer,
>> +               struct workload *wrk, enum intel_engine_id engine)
>> +{
>> +       struct busy_balancer *bb = &wrk->busy_balancer;
>> +
>> +       if (engine == VCS2 && (wrk->flags & VCS2REMAP))
>> +               engine = BCS;
>> +
>> +       return bb->busy[bb->engine_map[engine]];
>> +}
>> +
>> +static void get_stats(const struct workload_balancer *b, struct workload *wrk)
> 
> s/get_stats/get_pmu_stats/ ?

Ok.

> 
>> +{
>> +       struct busy_balancer *bb = &wrk->busy_balancer;
>> +       uint64_t val[7];
>> +       unsigned int i;
>> +
>> +       igt_assert_eq(read(bb->fd, val, sizeof(val)), sizeof(val));
>> +
>> +       if (!bb->first) {
>> +               for (i = 0; i < bb->num_engines; i++) {
>> +                       double d;
>> +
>> +                       d = (val[2 + i] - bb->prev[i]) * 100;
>> +                       d /= val[1] - bb->t_prev;
>> +                       bb->busy[i] = d;
>> +               }
>> +       }
>> +
>> +       for (i = 0; i < bb->num_engines; i++)
>> +               bb->prev[i] = val[2 + i];
>> +
>> +       bb->t_prev = val[1];
>> +       bb->first = false;
> 
> Ok.

Although normalizing to [0.0 - 100.0] is only helpful if one wants to 
look at the numbers.

> 
>> +}
>> +
>> +static enum intel_engine_id
>> +busy_avg_balance(const struct workload_balancer *balancer,
>> +                struct workload *wrk, struct w_step *w)
>> +{
>> +       get_stats(balancer, wrk);
>> +
>> +       return qdavg_balance(balancer, wrk, w);
>> +}
>> +
>> +static enum intel_engine_id
>> +busy_balance(const struct workload_balancer *balancer,
>> +            struct workload *wrk, struct w_step *w)
>> +{
>> +       get_stats(balancer, wrk);
>> +
>> +       return qd_balance(balancer, wrk, w);
>> +}
>> +
>> +static int
>> +busy_init(const struct workload_balancer *balancer, struct workload *wrk)
>> +{
>> +       struct busy_balancer *bb = &wrk->busy_balancer;
>> +       struct engine_desc {
>> +               unsigned class, inst;
>> +               enum intel_engine_id id;
>> +       } *d, engines[] = {
>> +               { I915_ENGINE_CLASS_RENDER, 0, RCS },
>> +               { I915_ENGINE_CLASS_COPY, 0, BCS },
>> +               { I915_ENGINE_CLASS_VIDEO, 0, VCS1 },
>> +               { I915_ENGINE_CLASS_VIDEO, 1, VCS2 },
>> +               { I915_ENGINE_CLASS_VIDEO_ENHANCE, 0, VECS },
>> +               { 0, 0, VCS }
>> +       };
>> +
>> +       bb->num_engines = 0;
>> +       bb->first = true;
>> +       bb->fd = -1;
>> +
>> +       for (d = &engines[0]; d->id != VCS; d++) {
>> +               int pfd;
>> +
>> +               pfd = perf_i915_open_group(I915_PMU_ENGINE_BUSY(d->class,
>> +                                                               d->inst),
>> +                                          bb->fd);
>> +               if (pfd < 0) {
>> +                       if (d->id != VCS2)
>> +                               return -(10 + bb->num_engines);
>> +                       else
>> +                               continue;
>> +               }
>> +
>> +               if (bb->num_engines == 0)
>> +                       bb->fd = pfd;
>> +
>> +               bb->engine_map[d->id] = bb->num_engines++;
>> +       }
>> +
>> +       if (bb->num_engines < 5 && !(wrk->flags & VCS2REMAP))
>> +               return -1;
> 
> Hmm, feels a little sloppy. Would be more concrete if we have a list of
> engines were going to use, and then we either created a perf event for
> each or died.

It is only a theoretical flaw: even though balancer init without VCS2 would 
succeed and would report invalid data for it, the tool would fail 
when trying to submit to VCS2, so no harm done.

> 
> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>

Thanks, I'll see if I can find the time to beautify it w.r.t. the above 
at some point.

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH i-g-t 5/7] tests/perf_pmu: Tests for i915 PMU API
  2017-09-26 11:19     ` Tvrtko Ursulin
@ 2017-09-26 11:42       ` Chris Wilson
  2017-10-09 10:32         ` Tvrtko Ursulin
  0 siblings, 1 reply; 25+ messages in thread
From: Chris Wilson @ 2017-09-26 11:42 UTC (permalink / raw)
  To: Tvrtko Ursulin, Tvrtko Ursulin, Intel-gfx

Quoting Tvrtko Ursulin (2017-09-26 12:19:35)
> 
> On 25/09/2017 17:21, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2017-09-25 16:15:00)
> >> +#define assert_within_epsilon(x, ref, tolerance) \
> >> +       igt_assert_f((double)(x) <= (1.0 + tolerance) * (double)ref && \
> >> +                    (double)(x) >= (1.0 - tolerance) * (double)ref, \
> >> +                    "'%s' != '%s' (%f not within %f%% tolerance of %f)\n",\
> >> +                    #x, #ref, (double)x, tolerance * 100.0, (double)ref)
> >> +
> >> +/*
> >> + * Helper for cases where we assert on time spent sleeping (directly or
> >> + * indirectly), so make it more robust by ensuring the system sleep time
> >> + * is within test tolerance to start with.
> >> + */
> > 
> > Looking at the callers, I don't see where we need a guaranteed sleep vs
> > a known sleep. We are comparing elapsed/wall time with a perf counter, so
> > we only care that they match. By asserting we just cause random fails if
> > the system decides not to wake up for a 1s (outside the purvey of the
> > test).
> 
> Okay so you suggest measured_usleep instead of guaranteed_usleep. And 
> then the measured time used in the asserts. Makes sense, just please 
> confirm that there hasn't been a misunderstanding.

Yes, measure the sleep then we don't have any ambiguity over the length.
If it ends up being too long for the test to behave, we will know.

> >> +       assert_within_epsilon(val[busy_idx], batch_duration_ns, tolerance);
> >> +       for (i = 0; i < num_engines; i++) {
> > 
> > double expected;
> > 
> > if (i == busy_idx)
> >       expected = batch_duration_ns;
> > else
> >       expected = 0;
> > assert_within_epsilon(val[i], expected, tolerance);
> 
> I don't see any advantage but can change it sure.

I just found it a little easier to parse.

> > How about batches executing from a different context, or submitted from
> > a different fd?
> 
> I can't see that adding much value? Am I thinking too much white box? 
> The perf API has nothing to do with the drm fd so I just see it not 
> relevant.

Exactly, why I think we should establish that in the test. In part, this
is because we have the other i915 perf API that is context/fd/etc bound.

The challenge is not to go overboard, but atm we always use the gem_fd
that was opened before the perf, so I worry if we may allow some
inadvertent coupling there. Whereas we want to assert that this is a
system profiler, capable of capturing everything (and unfortunately not
yet capable of much filtering). :(

> >> +       } else {
> >> +               usleep(batch_duration_ns / 1000);
> >> +       }
> >> +
> >> +       pmu_read_multi(fd, 2, val);
> >> +
> >> +       assert_within_epsilon(val[0], 0.0f, tolerance);
> >> +       assert_within_epsilon(val[1], 0.0f, tolerance);
> >> +
> >> +       if (busy)
> >> +               igt_spin_batch_free(gem_fd, spin);
> >> +       close(fd);
> > 
> > I'm still at a loss as to why this test is interesting. You want to say
> > that given an idle engine with a single batch there is no time spent
> > waiting for an MI event, nor waiting on a semaphore. Where is that
> > guaranteed? What happens if we do need to do such a pre-batch + sync in
> > the future. If you wanted to say that whilst the batch is executing that
> > no time is spent processing requests the batch isn't making, then maybe.
> > But without a test to demonstrate the opposite, we still have no idea if
> > the counters work.
> 
> Yeah, I am still at a stage of punting off the semaphore work to after 
> the rest is polished. Need to nose around some tests and see if I can 
> lift some code involving semaphores.

Bug 54226, it's a classic :) Not a test, but the overlay showed the
breakage quite clearly.

Hmm, we should be able to do WAIT on any platform, since it's just MI
instructions - but the setup may differ between platform. I would use
wait-for-event, rather than scanline.

Hmm, I wonder if MI_SEMAPHORE_WAIT | POLL asserts the RING_CTL
semaphore bit. I hope it does, then not only do we have a simple test,
but the counter is useful for VkEvent.

> >> +static void
> >> +test_interrupts(int gem_fd)
> >> +{
> >> +       uint64_t idle, busy, prev;
> >> +       int fd, fence = -1;
> >> +       const unsigned int count = 1000;
> >> +       unsigned int i;
> >> +
> >> +       fd = open_pmu(I915_PMU_INTERRUPTS);
> >> +
> >> +       gem_quiescent_gpu(gem_fd);
> >> +
> >> +       /* Wait for idle state. */
> >> +       prev = pmu_read_single(fd);
> >> +       idle = prev + 1;
> >> +       while (idle != prev) {
> >> +               usleep(batch_duration_ns / 1000);
> >> +               prev = idle;
> >> +               idle = pmu_read_single(fd);
> >> +       }
> >> +
> >> +       igt_assert_eq(idle - prev, 0);
> >> +
> >> +       /* Send some no-op batches with chained fences to ensure interrupts. */
> > 
> > Not all kernels would emit interrupts for this. Remember we used to spin
> > on the in-fence to avoid the interrupt setup overhead. And there's no
> > reason to assume we wouldn't again if there was a demonstrable
> > advantage.
> > 
> > You've assumed execlists elsewhere, so why not generate context-switch
> > interrupts instead? I presume you did try MI_USER_INTERRUPT from a user
> > batch? iirc it should still work.
> 
> Even if I emit it myself who guarantees i915 is listening to them? So I 
> thought it is easier to have i915 emit them for me, and by using fences 
> ensuring it is listening.

Hmm. True.
> 
> Could move towards longer batches to defeat any future busyspinning we 
> might add?

They wanted 500us! Next, they'll probably ask for 500ms!

I wonder if we could inject an interrupt from debugfs. Ok, it doesn't
have the nicety that we know we are reporting hw interrupts, but it will
confirm that we are reporting interrupts.

~o~

(Injecting interrupts is something I want to do for selftests, and I
think I read that it is possible...)

> >> +igt_main
> >> +{
> >> +       const unsigned int num_other_metrics =
> >> +                               I915_PMU_LAST - __I915_PMU_OTHER(0) + 1;
> >> +       unsigned int num_engines = 0;
> >> +       int fd = -1;
> >> +       const struct intel_execution_engine2 *e;
> >> +       unsigned int i;
> >> +
> >> +       igt_fixture {
> >> +               fd = drm_open_driver_master(DRIVER_INTEL);
> > 
> > master? For what? I didn't see any display interactions, or secure
> > dispatch. Does this mean that we only have perf counters for the master?
> 
> To ensure no one else is fiddling with the GPU while the test is running 
> since we want complete control here.

But that's not master, that's exclusive. Huge difference in semantics
:-p

> > Shouldn't we also be checking that we can capture rendernodes as well as
> > authed fd?
> 
> Again, due drm fd and perf having no direct connection I don't see this 
> as interesting.

Until we establish that there is no connection, I feel it is a vital
property of the profiler that it is able to work with any client.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH i-g-t 6/7] gem_wsim: Busy stats balancers
  2017-09-26 11:27     ` Tvrtko Ursulin
@ 2017-09-26 11:48       ` Chris Wilson
  0 siblings, 0 replies; 25+ messages in thread
From: Chris Wilson @ 2017-09-26 11:48 UTC (permalink / raw)
  To: Tvrtko Ursulin, Tvrtko Ursulin, Intel-gfx

Quoting Tvrtko Ursulin (2017-09-26 12:27:33)
> 
> On 25/09/2017 17:44, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2017-09-25 16:15:01)
> >> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>
> >> Add busy and busy-avg balancers which make balancing
> >> decisions by looking at engine busyness via the i915 PMU.
> > 
> > "And thus are able to make decisions on the actual instantaneous load of
> > the system, and not use metrics that lag behind by a batch or two. In
> > doing so, each client should be able to greedily maximise their own
> > usage of the system, leading to improved load balancing even in the face
> > of other uncooperative clients. On the other hand, we are only using the
> > instantaneous load without coupling in the predictive factor for
> > dispatch and execution length."
> 
> Ok, thanks for the text.
> 
> > Hmm, do you not want to sum busy + queued? Or at least compare! :)
> 
> How to add apples and oranges? :) Queued * busy [0.0 - 1.0] ?

Just switch bases to bananas.

Hmm, I keep going back to thinking that what we want is a loadavg type of stat.
Something that tells us the current (and historical) run length. Kind of
ties into what we want to measure through QUEUED.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH i-g-t 5/7] tests/perf_pmu: Tests for i915 PMU API
  2017-09-26 11:42       ` Chris Wilson
@ 2017-10-09 10:32         ` Tvrtko Ursulin
  2017-10-09 10:55           ` Chris Wilson
  0 siblings, 1 reply; 25+ messages in thread
From: Tvrtko Ursulin @ 2017-10-09 10:32 UTC (permalink / raw)
  To: Chris Wilson, Tvrtko Ursulin, Intel-gfx


On 26/09/2017 12:42, Chris Wilson wrote:

[snip]

>>> I'm still at a loss as to why this test is interesting. You want to say
>>> that given an idle engine with a single batch there is no time spent
>>> waiting for an MI event, nor waiting on a semaphore. Where is that
>>> guaranteed? What happens if we do need to do such a pre-batch + sync in
>>> the future. If you wanted to say that whilst the batch is executing that
>>> no time is spent processing requests the batch isn't making, then maybe.
>>> But without a test to demonstrate the opposite, we still have no idea if
>>> the counters work.
>>
>> Yeah, I am still at a stage of punting off the semaphore work to after
>> the rest is polished. Need to nose around some tests and see if I can
>> lift some code involving semaphores.
> 
> Bug 54226, it's a classic :) Not a test, but the overlay showed the
> breakage quite clearly.
> 
> Hmm, we should be able to do WAIT on any platform, since it's just MI
> instructions - but the setup may differ between platform. I would use
> wait-for-event, rather than scanline.

I cannot get this to work. Looking at the docs the wait for vblank 
looked like a good candidate to me.

So I am creating a batch with MI_WAIT_FOR_EVENT | 
MI_WAIT_FOR_PIPE_A_VBLANK. Pipe A is confirmed active and with a mode. 
But the command seems to hang there. :(

Ironically PMU does report the correct time spent in wait once the GPU 
reset kicks the client off.

Any ideas?

> Hmm, I wonder if MI_SEMAPHORE_WAIT | POLL asserts the RING_CTL
> semaphore bit. I hope it does, then not only do we have a simple test,
> but the counter is useful for VkEvent.

This seems to work. I am waiting on a value in a shared bo, and then 
updating the value via a WC mmap. The engine proceeds and I check the PMU. All 
good.

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH i-g-t 5/7] tests/perf_pmu: Tests for i915 PMU API
  2017-10-09 10:32         ` Tvrtko Ursulin
@ 2017-10-09 10:55           ` Chris Wilson
  0 siblings, 0 replies; 25+ messages in thread
From: Chris Wilson @ 2017-10-09 10:55 UTC (permalink / raw)
  To: Tvrtko Ursulin, Tvrtko Ursulin, Intel-gfx

Quoting Tvrtko Ursulin (2017-10-09 11:32:21)
> 
> On 26/09/2017 12:42, Chris Wilson wrote:
> 
> [snip]
> 
> >>> I'm still at a loss as to why this test is interesting. You want to say
> >>> that given an idle engine with a single batch there is no time spent
> >>> waiting for an MI event, nor waiting on a semaphore. Where is that
> >>> guaranteed? What happens if we do need to do such a pre-batch + sync in
> >>> the future. If you wanted to say that whilst the batch is executing that
> >>> no time is spent processing requests the batch isn't making, then maybe.
> >>> But without a test to demonstrate the opposite, we still have no idea if
> >>> the counters work.
> >>
> >> Yeah, I am still at a stage of punting off the semaphore work to after
> >> the rest is polished. Need to nose around some tests and see if I can
> >> lift some code involving semaphores.
> > 
> > Bug 54226, it's a classic :) Not a test, but the overlay showed the
> > breakage quite clearly.
> > 
> > Hmm, we should be able to do WAIT on any platform, since it's just MI
> > instructions - but the setup may differ between platform. I would use
> > wait-for-event, rather than scanline.
> 
> I cannot get this to work. Looking at the docs the wait for vblank 
> looked like a good candidate to me.
> 
> So I am creating a batch with MI_WAIT_FOR_EVENT | 
> MI_WAIT_FOR_PIPE_A_VBLANK. Pipe A is confirmed active and with a mode. 
> But the command seems to hang there. :(
> 
> Ironically PMU does report the correct time spent in wait once the GPU 
> reset kicks the client off.
> 
> Any ideas?

Did you queue a vblank (drmWaitVblank)? We don't enable the vblank irq
until we need it, either for a modeset or vblank-wait. drmWaitVblank()
is more explicit in that regard.

You also will need to unmask the event in DERRMR and acquire fw.
Something like:

	MI_LOAD_REGISTER_IMM | 3;
	0x44050; /* DERRMR */
	~event;
	0xa188; /* FORCEWAKE_MT */
	2 << 16 | 2; /* BIT(1) == userspace */

and undo afterwards.

> > Hmm, I wonder if MI_SEMAPHORE_WAIT | POLL asserts the RING_CTL
> > semaphore bit. I hope it does, then not only do we have a simple test,
> > but the counter is useful for VkEvent.
> 
> This seems to work. I am waiting on a value in a shared bo, and then 
> update the value via a WC mmap. Engine proceeds and I check the PMU. All 
> good.

That's good news! Since learning about VkEvent using MI_SEMAPHORE_WAIT
form userspace, it's been on my worrylist. Now we should be able to
measure if it becomes a problem.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2017-10-09 10:55 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-09-25 15:14 [PATCH i-g-t v2 0/7] IGT PMU support Tvrtko Ursulin
2017-09-25 15:14 ` [PATCH i-g-t 1/7] intel-gpu-overlay: Move local perf implementation to a library Tvrtko Ursulin
2017-09-25 15:22   ` Chris Wilson
2017-09-26 10:52     ` Tvrtko Ursulin
2017-09-26 11:07       ` Chris Wilson
2017-09-25 15:14 ` [PATCH i-g-t 2/7] intel-gpu-overlay: Consolidate perf PMU access to library Tvrtko Ursulin
2017-09-25 15:25   ` Chris Wilson
2017-09-25 15:14 ` [PATCH i-g-t 3/7] intel-gpu-overlay: Fix interrupts PMU readout Tvrtko Ursulin
2017-09-25 15:14 ` [PATCH i-g-t 4/7] intel-gpu-overlay: Catch-up to new i915 PMU Tvrtko Ursulin
2017-09-25 15:31   ` Chris Wilson
2017-09-26 10:56     ` Tvrtko Ursulin
2017-09-26 11:13       ` Chris Wilson
2017-09-25 15:15 ` [PATCH i-g-t 5/7] tests/perf_pmu: Tests for i915 PMU API Tvrtko Ursulin
2017-09-25 16:21   ` Chris Wilson
2017-09-26 11:19     ` Tvrtko Ursulin
2017-09-26 11:42       ` Chris Wilson
2017-10-09 10:32         ` Tvrtko Ursulin
2017-10-09 10:55           ` Chris Wilson
2017-09-25 15:15 ` [PATCH i-g-t 6/7] gem_wsim: Busy stats balancers Tvrtko Ursulin
2017-09-25 16:44   ` Chris Wilson
2017-09-26 11:27     ` Tvrtko Ursulin
2017-09-26 11:48       ` Chris Wilson
2017-09-25 15:15 ` [PATCH i-g-t 7/7] media-bench.pl: Add busy balancers to the list Tvrtko Ursulin
2017-09-25 16:06 ` ✓ Fi.CI.BAT: success for IGT PMU support (rev4) Patchwork
2017-09-25 22:16 ` ✓ Fi.CI.IGT: " Patchwork

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.