All of lore.kernel.org
 help / color / mirror / Atom feed
* [igt-dev] [PATCH i-g-t v4] i915/i915_pm_rps: Check impact of fence ordering on waitboosting
@ 2022-07-13 10:53 Karolina Drobnik
  2022-07-15 17:30 ` Kamil Konieczny
                   ` (2 more replies)
  0 siblings, 3 replies; 10+ messages in thread
From: Karolina Drobnik @ 2022-07-13 10:53 UTC (permalink / raw)
  To: igt-dev; +Cc: Tvrtko Ursulin, Chris Wilson

From: Chris Wilson <chris@chris-wilson.co.uk>

Currently the wait boost heuristic is evaluated at the start of each
fence wait for a series within dma-resv. There is no strict ordering of
fences defined by dma-resv, and so it turns out that the same operation
under different circumstances can result in different heuristics being
applied, and dramatic performance variations in user applications.

Link: https://gitlab.freedesktop.org/drm/intel/-/issues/6284
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Karolina Drobnik <karolina.drobnik@intel.com>
---
v4:
  - Added test descriptions with igt_describe()
v3:
  - Removed unnecessary calls to gem_write
v2:
  - Introduced batch_create() with arbitration points to prevent hangs
  - Added engine-order subtest
  - Added reporting of GPU frequency
  - Changed runtimes and switched prints from us to ms

 tests/i915/i915_pm_rps.c | 261 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 255 insertions(+), 6 deletions(-)

diff --git a/tests/i915/i915_pm_rps.c b/tests/i915/i915_pm_rps.c
index b37f15c2..6125ea60 100644
--- a/tests/i915/i915_pm_rps.c
+++ b/tests/i915/i915_pm_rps.c
@@ -37,8 +37,10 @@
 #include <sys/wait.h>

 #include "i915/gem.h"
+#include "i915/gem_create.h"
 #include "igt.h"
 #include "igt_dummyload.h"
+#include "igt_perf.h"
 #include "igt_sysfs.h"

 IGT_TEST_DESCRIPTION("Render P-States tests - verify GPU frequency changes");
@@ -614,14 +616,253 @@ static void waitboost(int fd, bool reset)
 	igt_assert_lt(post_freqs[CUR], post_freqs[MAX]);
 }

+static uint32_t batch_create(int i915, uint64_t sz)
+{
+	const uint32_t bbe = MI_BATCH_BUFFER_END;
+	const uint32_t chk = 0x5 << 23;
+	uint32_t handle = gem_create(i915, sz);
+
+	for (uint64_t pg = 4096; pg + 4096 < sz; pg += 4096)
+		gem_write(i915, handle, pg, &chk, sizeof(chk));
+
+	gem_write(i915, handle, sz - sizeof(bbe), &bbe, sizeof(bbe));
+
+	return handle;
+}
+
+static uint64_t __fence_order(int i915,
+			      struct drm_i915_gem_exec_object2 *obj,
+			      struct drm_i915_gem_execbuffer2 *eb,
+			      uint64_t flags0, uint64_t flags1,
+			      double *outf)
+{
+	uint64_t before[2], after[2];
+	struct timespec tv;
+	int fd;
+
+	gem_quiescent_gpu(i915);
+	fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
+
+	igt_gettime(&tv);
+
+	obj->flags = flags0;
+	gem_execbuf(i915, eb);
+
+	obj->flags = flags1;
+	gem_execbuf(i915, eb);
+
+	read(fd, before, sizeof(before));
+	gem_sync(i915, obj->handle);
+	read(fd, after, sizeof(after));
+	close(fd);
+
+	after[0] -= before[0];
+	after[1] -= before[1];
+
+	*outf = 1e9 * after[0] / after[1];
+	return igt_nsec_elapsed(&tv);
+}
+
+static void fence_order(int i915)
+{
+	const uint64_t sz = 512ull << 20;
+	struct drm_i915_gem_exec_object2 obj[2] = {
+		{ .handle = gem_create(i915, 4096) },
+		{ .handle = batch_create(i915, sz) },
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(obj),
+		.buffer_count = ARRAY_SIZE(obj),
+	};
+	uint64_t wr, rw;
+	int min, max;
+	double freq;
+	int sysfs;
+
+	/*
+	 * Check that the order of fences found during GEM_WAIT does not
+	 * affect waitboosting.
+	 *
+	 * Internally, implicit fences are tracked within a dma-resv which
+	 * imposes no order on the individually fences tracked within. Since
+	 * there is no defined order, the sequence of waits (and the associated
+	 * waitboosts) is also undefined, undermining the consistency of the
+	 * waitboost heuristic.
+	 *
+	 * In particular, we can influence the sequence of fence storage
+	 * within dma-resv by mixing read/write semantics for implicit fences.
+	 * We can exploit this property of dma-resv to exercise that no matter
+	 * the stored order, the heuristic is applied consistently for the
+	 * user's GEM_WAIT ioctl.
+	 */
+
+	sysfs = igt_sysfs_open(i915);
+	min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
+	max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
+	igt_require(max > min);
+
+	/* Only allow ourselves to upclock via waitboosting */
+	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
+	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
+	igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
+
+	/* Warm up to bind the vma */
+	__fence_order(i915, &obj[0], &execbuf, 0, 0, &freq);
+
+	wr = __fence_order(i915, &obj[0], &execbuf, EXEC_OBJECT_WRITE, 0, &freq);
+	igt_info("Write-then-read: %.2fms @ %.3fMHz\n", wr * 1e-6, freq);
+
+	rw = __fence_order(i915, &obj[0], &execbuf, 0, EXEC_OBJECT_WRITE, &freq);
+	igt_info("Read-then-write: %.2fms @ %.3fMHz\n", rw * 1e-6, freq);
+
+	gem_close(i915, obj[0].handle);
+	gem_close(i915, obj[1].handle);
+
+	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
+	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
+
+	igt_assert(4 * rw > 3 * wr && 4 * wr > 3 * rw);
+}
+
+static uint64_t __engine_order(int i915,
+			       struct drm_i915_gem_exec_object2 *obj,
+			       struct drm_i915_gem_execbuffer2 *eb,
+			       unsigned int *engines0,
+			       unsigned int *engines1,
+			       unsigned int num_engines,
+			       double *outf)
+{
+	uint64_t before[2], after[2];
+	struct timespec tv;
+	int fd;
+
+	gem_quiescent_gpu(i915);
+	fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
+
+	igt_gettime(&tv);
+
+	obj->flags = EXEC_OBJECT_WRITE;
+	for (unsigned int n = 0; n < num_engines; n++) {
+		eb->flags &= ~63ull;
+		eb->flags |= engines0[n];
+		gem_execbuf_wr(i915, eb);
+	}
+
+	obj->flags = 0;
+	for (unsigned int n = 0; n < num_engines; n++) {
+		eb->flags &= ~63ull;
+		eb->flags |= engines1[n];
+		gem_execbuf(i915, eb);
+	}
+
+	read(fd, before, sizeof(before));
+	gem_sync(i915, obj->handle);
+	read(fd, after, sizeof(after));
+	close(fd);
+
+	after[0] -= before[0];
+	after[1] -= before[1];
+
+	*outf = 1e9 * after[0] / after[1];
+	return igt_nsec_elapsed(&tv);
+}
+
+static void engine_order(int i915)
+{
+	const uint64_t sz = 512ull << 20;
+	struct drm_i915_gem_exec_object2 obj[2] = {
+		{ .handle = gem_create(i915, 4096) },
+		{ .handle = batch_create(i915, sz) },
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(obj),
+		.buffer_count = ARRAY_SIZE(obj),
+	};
+	const struct intel_execution_engine2 *e;
+	unsigned int engines[2], reverse[2];
+	uint64_t forward, backward, both;
+	unsigned int num_engines;
+	const intel_ctx_t *ctx;
+	int min, max;
+	double freq;
+	int sysfs;
+
+	/*
+	 * Check that the order of fences found during GEM_WAIT does not
+	 * affect waitboosting. (See fence_order())
+	 *
+	 * Another way we can manipulate the order of fences within the
+	 * dma-resv is through repeated use of the same contexts.
+	 */
+
+	num_engines = 0;
+	ctx = intel_ctx_create_all_physical(i915);
+	for_each_ctx_engine(i915, ctx, e) {
+		/*
+		 * Avoid using the cmdparser as it will try to allocate
+		 * a new shadow batch for each submission -> oom
+		 */
+		if (!gem_engine_has_mutable_submission(i915, e->class))
+			continue;
+
+		engines[num_engines++] = e->flags;
+		if (num_engines == ARRAY_SIZE(engines))
+			break;
+	}
+	igt_require(num_engines > 1);
+	for (unsigned int n = 0; n < num_engines; n++)
+		reverse[n] = engines[num_engines - n - 1];
+	execbuf.rsvd1 = ctx->id;
+
+	sysfs = igt_sysfs_open(i915);
+	min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
+	max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
+	igt_require(max > min);
+
+	/* Only allow ourselves to upclock via waitboosting */
+	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
+	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
+	igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
+
+	/* Warm up to bind the vma */
+	gem_execbuf(i915, &execbuf);
+
+	forward = __engine_order(i915, &obj[0], &execbuf,
+				 engines, engines, num_engines,
+				 &freq);
+	igt_info("Forwards: %.2fms @ %.3fMHz\n", forward * 1e-6, freq);
+
+	backward = __engine_order(i915, &obj[0], &execbuf,
+				  reverse, reverse, num_engines,
+				  &freq);
+	igt_info("Backwards: %.2fms @ %.3fMHz\n", backward * 1e-6, freq);
+
+	both = __engine_order(i915, &obj[0], &execbuf,
+			      engines, reverse, num_engines,
+			      &freq);
+	igt_info("Bidirectional: %.2fms @ %.3fMHz\n", both * 1e-6, freq);
+
+	gem_close(i915, obj[0].handle);
+	gem_close(i915, obj[1].handle);
+	intel_ctx_destroy(i915, ctx);
+
+	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
+	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
+
+	igt_assert(4 * forward > 3 * backward && 4 * backward > 3 * forward);
+	igt_assert(4 * forward > 3 * both && 4 * both > 3 * forward);
+}
+
 static void pm_rps_exit_handler(int sig)
 {
-	if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
-		writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
-		writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
-	} else {
-		writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
-		writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
+	if (sysfs_files[MAX].filp) {
+		if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
+			writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
+			writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
+		} else {
+			writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
+			writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
+		}
 	}

 	if (lh.igt_proc.running)
@@ -683,6 +924,14 @@ igt_main
 	igt_subtest("waitboost")
 		waitboost(drm_fd, false);

+	igt_describe("Check if the order of fences does not affect waitboosting");
+	igt_subtest("fence-order")
+		fence_order(drm_fd);
+
+	igt_describe("Check if context reuse does not affect waitboosting");
+	igt_subtest("engine-order")
+		engine_order(drm_fd);
+
 	/* Test boost frequency after GPU reset */
 	igt_subtest("reset") {
 		igt_hang_t hang = igt_allow_hang(drm_fd, 0, 0);
--
2.25.1

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [igt-dev] [PATCH i-g-t v4] i915/i915_pm_rps: Check impact of fence ordering on waitboosting
  2022-07-13 10:53 [igt-dev] [PATCH i-g-t v4] i915/i915_pm_rps: Check impact of fence ordering on waitboosting Karolina Drobnik
@ 2022-07-15 17:30 ` Kamil Konieczny
  2022-07-18  9:46   ` Karolina Drobnik
  2022-07-26 10:49 ` Kamil Konieczny
  2022-07-26 14:18 ` Kamil Konieczny
  2 siblings, 1 reply; 10+ messages in thread
From: Kamil Konieczny @ 2022-07-15 17:30 UTC (permalink / raw)
  To: igt-dev; +Cc: Chris Wilson

Hi,

On 2022-07-13 at 12:53:42 +0200, Karolina Drobnik wrote:
> From: Chris Wilson <chris@chris-wilson.co.uk>
> 
> Currently the wait boost heuristic is evaluated at the start of each
> fence wait for a series within dma-resv. There is no strict ordering of
> fences defined by dma-resv, and so it turns out that the same operation
> under different circumstances can result in different heuristics being
> applied, and dramatic performance variations in user applications.
> 
> Link: https://gitlab.freedesktop.org/drm/intel/-/issues/6284
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Karolina Drobnik <karolina.drobnik@intel.com>
> ---
> v4:
>   - Added test descriptions with igt_describe()
> v3:
>   - Removed unnecessary calls to gem_write
> v2:
>   - Introduced batch_create() with arbitration points to prevent hangs
>   - Added engine-order subtest
>   - Added reporting of GPU frequency
>   - Changed runtimes and switched prints from us to ms
> 
>  tests/i915/i915_pm_rps.c | 261 ++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 255 insertions(+), 6 deletions(-)
> 
> diff --git a/tests/i915/i915_pm_rps.c b/tests/i915/i915_pm_rps.c
> index b37f15c2..6125ea60 100644
> --- a/tests/i915/i915_pm_rps.c
> +++ b/tests/i915/i915_pm_rps.c
> @@ -37,8 +37,10 @@
>  #include <sys/wait.h>
> 
>  #include "i915/gem.h"
> +#include "i915/gem_create.h"
>  #include "igt.h"
>  #include "igt_dummyload.h"
> +#include "igt_perf.h"
>  #include "igt_sysfs.h"
> 
>  IGT_TEST_DESCRIPTION("Render P-States tests - verify GPU frequency changes");
> @@ -614,14 +616,253 @@ static void waitboost(int fd, bool reset)
>  	igt_assert_lt(post_freqs[CUR], post_freqs[MAX]);
>  }
> 
> +static uint32_t batch_create(int i915, uint64_t sz)
> +{
> +	const uint32_t bbe = MI_BATCH_BUFFER_END;
> +	const uint32_t chk = 0x5 << 23;
> +	uint32_t handle = gem_create(i915, sz);
> +
> +	for (uint64_t pg = 4096; pg + 4096 < sz; pg += 4096)
> +		gem_write(i915, handle, pg, &chk, sizeof(chk));
> +
> +	gem_write(i915, handle, sz - sizeof(bbe), &bbe, sizeof(bbe));
> +
> +	return handle;
> +}
> +
> +static uint64_t __fence_order(int i915,
> +			      struct drm_i915_gem_exec_object2 *obj,
> +			      struct drm_i915_gem_execbuffer2 *eb,
> +			      uint64_t flags0, uint64_t flags1,
> +			      double *outf)
> +{
> +	uint64_t before[2], after[2];
> +	struct timespec tv;
> +	int fd;
> +
> +	gem_quiescent_gpu(i915);
> +	fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
> +
> +	igt_gettime(&tv);
> +
> +	obj->flags = flags0;
> +	gem_execbuf(i915, eb);
> +
> +	obj->flags = flags1;
> +	gem_execbuf(i915, eb);
> +
> +	read(fd, before, sizeof(before));
> +	gem_sync(i915, obj->handle);
> +	read(fd, after, sizeof(after));

imho here is better place to read elapsed time.

> +	close(fd);
> +
> +	after[0] -= before[0];
> +	after[1] -= before[1];
> +
> +	*outf = 1e9 * after[0] / after[1];
> +	return igt_nsec_elapsed(&tv);

See comment above.

> +}
> +
> +static void fence_order(int i915)
> +{
> +	const uint64_t sz = 512ull << 20;
> +	struct drm_i915_gem_exec_object2 obj[2] = {
> +		{ .handle = gem_create(i915, 4096) },
> +		{ .handle = batch_create(i915, sz) },
> +	};
> +	struct drm_i915_gem_execbuffer2 execbuf = {
> +		.buffers_ptr = to_user_pointer(obj),
> +		.buffer_count = ARRAY_SIZE(obj),
> +	};
> +	uint64_t wr, rw;
> +	int min, max;
> +	double freq;
> +	int sysfs;
> +
> +	/*
> +	 * Check the order of fences found during GEM_WAIT does not affect
> +	 * waitboosting.
> +	 *
> +	 * Internally, implicit fences are tracked within a dma-resv which
> +	 * imposes no order on the individually fences tracked within. Since
> +	 * there is no defined order, the sequence of waits (and the associated
> +	 * waitboosts) is also undefined, undermining the consistency of the
> +	 * waitboost heuristic.
> +	 *
> +	 * In particular, we can influence the sequence of fence storage
> +	 * within dma-resv by mixing read/write semantics for implicit fences.
> +	 * We can exploit this property of dma-resv to exercise that no matter
> +	 * the stored order, the heuristic is applied consistently for the
> +	 * user's GEM_WAIT ioctl.
> +	 */
> +
> +	sysfs = igt_sysfs_open(i915);
> +	min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
> +	max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
> +	igt_require(max > min);
> +
> +	/* Only allow ourselves to upclock via waitboosting */
> +	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> +	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
> +	igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
> +
> +	/* Warm up to bind the vma */
> +	__fence_order(i915, &obj[0], &execbuf, 0, 0, &freq);
> +
> +	wr = __fence_order(i915, &obj[0], &execbuf, EXEC_OBJECT_WRITE, 0, &freq);
> +	igt_info("Write-then-read: %.2fms @ %.3fMHz\n", wr * 1e-6, freq);

imho this should be igt_debug (here and below)
s/igt_info/igt_debug/

Regards,
Kamil

> +
> +	rw = __fence_order(i915, &obj[0], &execbuf, 0, EXEC_OBJECT_WRITE, &freq);
> +	igt_info("Read-then-write: %.2fms @ %.3fMHz\n", rw * 1e-6, freq);
> +
> +	gem_close(i915, obj[0].handle);
> +	gem_close(i915, obj[1].handle);
> +
> +	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> +	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
> +
> +	igt_assert(4 * rw > 3 * wr && 4 * wr > 3 * rw);
> +}
> +
> +static uint64_t __engine_order(int i915,
> +			       struct drm_i915_gem_exec_object2 *obj,
> +			       struct drm_i915_gem_execbuffer2 *eb,
> +			       unsigned int *engines0,
> +			       unsigned int *engines1,
> +			       unsigned int num_engines,
> +			       double *outf)
> +{
> +	uint64_t before[2], after[2];
> +	struct timespec tv;
> +	int fd;
> +
> +	gem_quiescent_gpu(i915);
> +	fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
> +
> +	igt_gettime(&tv);
> +
> +	obj->flags = EXEC_OBJECT_WRITE;
> +	for (unsigned int n = 0; n < num_engines; n++) {
> +		eb->flags &= ~63ull;
> +		eb->flags |= engines0[n];
> +		gem_execbuf_wr(i915, eb);
> +	}
> +
> +	obj->flags = 0;
> +	for (unsigned int n = 0; n < num_engines; n++) {
> +		eb->flags &= ~63ull;
> +		eb->flags |= engines1[n];
> +		gem_execbuf(i915, eb);
> +	}
> +
> +	read(fd, before, sizeof(before));
> +	gem_sync(i915, obj->handle);
> +	read(fd, after, sizeof(after));
> +	close(fd);
> +
> +	after[0] -= before[0];
> +	after[1] -= before[1];
> +
> +	*outf = 1e9 * after[0] / after[1];
> +	return igt_nsec_elapsed(&tv);
> +}
> +
> +static void engine_order(int i915)
> +{
> +	const uint64_t sz = 512ull << 20;
> +	struct drm_i915_gem_exec_object2 obj[2] = {
> +		{ .handle = gem_create(i915, 4096) },
> +		{ .handle = batch_create(i915, sz) },
> +	};
> +	struct drm_i915_gem_execbuffer2 execbuf = {
> +		.buffers_ptr = to_user_pointer(obj),
> +		.buffer_count = ARRAY_SIZE(obj),
> +	};
> +	const struct intel_execution_engine2 *e;
> +	unsigned int engines[2], reverse[2];
> +	uint64_t forward, backward, both;
> +	unsigned int num_engines;
> +	const intel_ctx_t *ctx;
> +	int min, max;
> +	double freq;
> +	int sysfs;
> +
> +	/*
> +	 * Check the order of fences found during GEM_WAIT does not affect
> +	 * waitboosting. (See fence_order())
> +	 *
> +	 * Another way we can manipulate the order of fences within the
> +	 * dma-resv is through repeated use of the same contexts.
> +	 */
> +
> +	num_engines = 0;
> +	ctx = intel_ctx_create_all_physical(i915);
> +	for_each_ctx_engine(i915, ctx, e) {
> +		/*
> +		 * Avoid using the cmdparser as it will try to allocate
> +		 * a new shadow batch for each submission -> oom
> +		 */
> +		if (!gem_engine_has_mutable_submission(i915, e->class))
> +			continue;
> +
> +		engines[num_engines++] = e->flags;
> +		if (num_engines == ARRAY_SIZE(engines))
> +			break;
> +	}
> +	igt_require(num_engines > 1);
> +	for (unsigned int n = 0; n < num_engines; n++)
> +		reverse[n] = engines[num_engines - n - 1];
> +	execbuf.rsvd1 = ctx->id;
> +
> +	sysfs = igt_sysfs_open(i915);
> +	min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
> +	max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
> +	igt_require(max > min);
> +
> +	/* Only allow ourselves to upclock via waitboosting */
> +	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> +	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
> +	igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
> +
> +	/* Warm up to bind the vma */
> +	gem_execbuf(i915, &execbuf);
> +
> +	forward = __engine_order(i915, &obj[0], &execbuf,
> +				 engines, engines, num_engines,
> +				 &freq);
> +	igt_info("Forwards: %.2fms @ %.3fMhz\n", forward * 1e-6, freq);
> +
> +	backward = __engine_order(i915, &obj[0], &execbuf,
> +				  reverse, reverse, num_engines,
> +				  &freq);
> +	igt_info("Backwards: %.2fms @ %.3fMhz\n", backward * 1e-6, freq);
> +
> +	both = __engine_order(i915, &obj[0], &execbuf,
> +			      engines, reverse, num_engines,
> +			      &freq);
> +	igt_info("Bidirectional: %.2fms @ %.3fMhz\n", both * 1e-6, freq);
> +
> +	gem_close(i915, obj[0].handle);
> +	gem_close(i915, obj[1].handle);
> +	intel_ctx_destroy(i915, ctx);
> +
> +	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> +	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
> +
> +	igt_assert(4 * forward > 3 * backward && 4 * backward > 3 * forward);
> +	igt_assert(4 * forward > 3 * both && 4 * both > 3 * forward);
> +}
> +
>  static void pm_rps_exit_handler(int sig)
>  {
> -	if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
> -		writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
> -		writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
> -	} else {
> -		writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
> -		writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
> +	if (sysfs_files[MAX].filp) {
> +		if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
> +			writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
> +			writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
> +		} else {
> +			writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
> +			writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
> +		}
>  	}
> 
>  	if (lh.igt_proc.running)
> @@ -683,6 +924,14 @@ igt_main
>  	igt_subtest("waitboost")
>  		waitboost(drm_fd, false);
> 
> +	igt_describe("Check if the order of fences does not affect waitboosting");
> +	igt_subtest("fence-order")
> +		fence_order(drm_fd);
> +
> +	igt_describe("Check if context reuse does not affect waitboosting");
> +	igt_subtest("engine-order")
> +		engine_order(drm_fd);
> +
>  	/* Test boost frequency after GPU reset */
>  	igt_subtest("reset") {
>  		igt_hang_t hang = igt_allow_hang(drm_fd, 0, 0);
> --
> 2.25.1

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [igt-dev] [PATCH i-g-t v4] i915/i915_pm_rps: Check impact of fence ordering on waitboosting
  2022-07-15 17:30 ` Kamil Konieczny
@ 2022-07-18  9:46   ` Karolina Drobnik
  2022-07-25  7:41     ` Karolina Drobnik
  2022-07-25 17:06     ` Kamil Konieczny
  0 siblings, 2 replies; 10+ messages in thread
From: Karolina Drobnik @ 2022-07-18  9:46 UTC (permalink / raw)
  To: Kamil Konieczny; +Cc: igt-dev, Chris Wilson

Hi,

On 15.07.2022 19:30, Kamil Konieczny wrote:
> Hi,
> 
> On 2022-07-13 at 12:53:42 +0200, Karolina Drobnik wrote:
>> From: Chris Wilson <chris@chris-wilson.co.uk>
>>
>> Currently the wait boost heuristic is evaluated at the start of each
>> fence wait for a series within dma-resv. There is no strict ordering of
>> fences defined by dma-resv, and so it turns out that the same operation
>> under different circumstances can result in different heuristics being
>> applied, and dramatic performance variations in user applications.
>>
>> Link: https://gitlab.freedesktop.org/drm/intel/-/issues/6284
>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>> Signed-off-by: Karolina Drobnik <karolina.drobnik@intel.com>
>> ---
>> v4:
>>    - Added test descriptions with igt_describe()
>> v3:
>>    - Removed unnecessary calls to gem_write
>> v2:
>>    - Introduced batch_create() with arbitration points to prevent hangs
>>    - Added engine-order subtest
>>    - Added reporting of GPU frequency
>>    - Changed runtimes and switched prints from us to ms
>>
>>   tests/i915/i915_pm_rps.c | 261 ++++++++++++++++++++++++++++++++++++++-
>>   1 file changed, 255 insertions(+), 6 deletions(-)
>>
>> diff --git a/tests/i915/i915_pm_rps.c b/tests/i915/i915_pm_rps.c
>> index b37f15c2..6125ea60 100644
>> --- a/tests/i915/i915_pm_rps.c
>> +++ b/tests/i915/i915_pm_rps.c
>> @@ -37,8 +37,10 @@
>>   #include <sys/wait.h>
>>
>>   #include "i915/gem.h"
>> +#include "i915/gem_create.h"
>>   #include "igt.h"
>>   #include "igt_dummyload.h"
>> +#include "igt_perf.h"
>>   #include "igt_sysfs.h"
>>
>>   IGT_TEST_DESCRIPTION("Render P-States tests - verify GPU frequency changes");
>> @@ -614,14 +616,253 @@ static void waitboost(int fd, bool reset)
>>   	igt_assert_lt(post_freqs[CUR], post_freqs[MAX]);
>>   }
>>
>> +static uint32_t batch_create(int i915, uint64_t sz)
>> +{
>> +	const uint32_t bbe = MI_BATCH_BUFFER_END;
>> +	const uint32_t chk = 0x5 << 23;
>> +	uint32_t handle = gem_create(i915, sz);
>> +
>> +	for (uint64_t pg = 4096; pg + 4096 < sz; pg += 4096)
>> +		gem_write(i915, handle, pg, &chk, sizeof(chk));
>> +
>> +	gem_write(i915, handle, sz - sizeof(bbe), &bbe, sizeof(bbe));
>> +
>> +	return handle;
>> +}
>> +
>> +static uint64_t __fence_order(int i915,
>> +			      struct drm_i915_gem_exec_object2 *obj,
>> +			      struct drm_i915_gem_execbuffer2 *eb,
>> +			      uint64_t flags0, uint64_t flags1,
>> +			      double *outf)
>> +{
>> +	uint64_t before[2], after[2];
>> +	struct timespec tv;
>> +	int fd;
>> +
>> +	gem_quiescent_gpu(i915);
>> +	fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
>> +
>> +	igt_gettime(&tv);
>> +
>> +	obj->flags = flags0;
>> +	gem_execbuf(i915, eb);
>> +
>> +	obj->flags = flags1;
>> +	gem_execbuf(i915, eb);
>> +
>> +	read(fd, before, sizeof(before));
>> +	gem_sync(i915, obj->handle);
>> +	read(fd, after, sizeof(after));
> 
> imho here is better place to read elapsed time.

We would like to measure time of the whole operation, so this is correct 
as it is now

>> +	close(fd);
>> +
>> +	after[0] -= before[0];
>> +	after[1] -= before[1];
>> +
>> +	*outf = 1e9 * after[0] / after[1];
>> +	return igt_nsec_elapsed(&tv);
> 
> See comment above.
> 
>> +}
>> +
>> +static void fence_order(int i915)
>> +{
>> +	const uint64_t sz = 512ull << 20;
>> +	struct drm_i915_gem_exec_object2 obj[2] = {
>> +		{ .handle = gem_create(i915, 4096) },
>> +		{ .handle = batch_create(i915, sz) },
>> +	};
>> +	struct drm_i915_gem_execbuffer2 execbuf = {
>> +		.buffers_ptr = to_user_pointer(obj),
>> +		.buffer_count = ARRAY_SIZE(obj),
>> +	};
>> +	uint64_t wr, rw;
>> +	int min, max;
>> +	double freq;
>> +	int sysfs;
>> +
>> +	/*
>> +	 * Check the order of fences found during GEM_WAIT does not affect
>> +	 * waitboosting.
>> +	 *
>> +	 * Internally, implicit fences are tracked within a dma-resv which
>> +	 * imposes no order on the individually fences tracked within. Since
>> +	 * there is no defined order, the sequence of waits (and the associated
>> +	 * waitboosts) is also undefined, undermining the consistency of the
>> +	 * waitboost heuristic.
>> +	 *
>> +	 * In particular, we can influence the sequence of fence storage
>> +	 * within dma-resv by mixing read/write semantics for implicit fences.
>> +	 * We can exploit this property of dma-resv to exercise that no matter
>> +	 * the stored order, the heuristic is applied consistently for the
>> +	 * user's GEM_WAIT ioctl.
>> +	 */
>> +
>> +	sysfs = igt_sysfs_open(i915);
>> +	min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
>> +	max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
>> +	igt_require(max > min);
>> +
>> +	/* Only allow ourselves to upclock via waitboosting */
>> +	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>> +	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
>> +	igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
>> +
>> +	/* Warm up to bind the vma */
>> +	__fence_order(i915, &obj[0], &execbuf, 0, 0, &freq);
>> +
>> +	wr = __fence_order(i915, &obj[0], &execbuf, EXEC_OBJECT_WRITE, 0, &freq);
>> +	igt_info("Write-then-read: %.2fms @ %.3fMHz\n", wr * 1e-6, freq);
> 
> imho this should be igt_debug (here and below)
> s/igt_print/igt_debug/

This information is essential in verifying the test results, so igt_info 
is required

All the best,
Karolina

> Regards,
> Kamil
> 
>> +
>> +	rw = __fence_order(i915, &obj[0], &execbuf, 0, EXEC_OBJECT_WRITE, &freq);
>> +	igt_info("Read-then-write: %.2fms @ %.3fMHz\n", rw * 1e-6, freq);
>> +
>> +	gem_close(i915, obj[0].handle);
>> +	gem_close(i915, obj[1].handle);
>> +
>> +	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>> +	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
>> +
>> +	igt_assert(4 * rw > 3 * wr && 4 * wr > 3 * rw);
>> +}
>> +
>> +static uint64_t __engine_order(int i915,
>> +			       struct drm_i915_gem_exec_object2 *obj,
>> +			       struct drm_i915_gem_execbuffer2 *eb,
>> +			       unsigned int *engines0,
>> +			       unsigned int *engines1,
>> +			       unsigned int num_engines,
>> +			       double *outf)
>> +{
>> +	uint64_t before[2], after[2];
>> +	struct timespec tv;
>> +	int fd;
>> +
>> +	gem_quiescent_gpu(i915);
>> +	fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
>> +
>> +	igt_gettime(&tv);
>> +
>> +	obj->flags = EXEC_OBJECT_WRITE;
>> +	for (unsigned int n = 0; n < num_engines; n++) {
>> +		eb->flags &= ~63ull;
>> +		eb->flags |= engines0[n];
>> +		gem_execbuf_wr(i915, eb);
>> +	}
>> +
>> +	obj->flags = 0;
>> +	for (unsigned int n = 0; n < num_engines; n++) {
>> +		eb->flags &= ~63ull;
>> +		eb->flags |= engines1[n];
>> +		gem_execbuf(i915, eb);
>> +	}
>> +
>> +	read(fd, before, sizeof(before));
>> +	gem_sync(i915, obj->handle);
>> +	read(fd, after, sizeof(after));
>> +	close(fd);
>> +
>> +	after[0] -= before[0];
>> +	after[1] -= before[1];
>> +
>> +	*outf = 1e9 * after[0] / after[1];
>> +	return igt_nsec_elapsed(&tv);
>> +}
>> +
>> +static void engine_order(int i915)
>> +{
>> +	const uint64_t sz = 512ull << 20;
>> +	struct drm_i915_gem_exec_object2 obj[2] = {
>> +		{ .handle = gem_create(i915, 4096) },
>> +		{ .handle = batch_create(i915, sz) },
>> +	};
>> +	struct drm_i915_gem_execbuffer2 execbuf = {
>> +		.buffers_ptr = to_user_pointer(obj),
>> +		.buffer_count = ARRAY_SIZE(obj),
>> +	};
>> +	const struct intel_execution_engine2 *e;
>> +	unsigned int engines[2], reverse[2];
>> +	uint64_t forward, backward, both;
>> +	unsigned int num_engines;
>> +	const intel_ctx_t *ctx;
>> +	int min, max;
>> +	double freq;
>> +	int sysfs;
>> +
>> +	/*
>> +	 * Check the order of fences found during GEM_WAIT does not affect
>> +	 * waitboosting. (See fence_order())
>> +	 *
>> +	 * Another way we can manipulate the order of fences within the
>> +	 * dma-resv is through repeated use of the same contexts.
>> +	 */
>> +
>> +	num_engines = 0;
>> +	ctx = intel_ctx_create_all_physical(i915);
>> +	for_each_ctx_engine(i915, ctx, e) {
>> +		/*
>> +		 * Avoid using the cmdparser as it will try to allocate
>> +		 * a new shadow batch for each submission -> oom
>> +		 */
>> +		if (!gem_engine_has_mutable_submission(i915, e->class))
>> +			continue;
>> +
>> +		engines[num_engines++] = e->flags;
>> +		if (num_engines == ARRAY_SIZE(engines))
>> +			break;
>> +	}
>> +	igt_require(num_engines > 1);
>> +	for (unsigned int n = 0; n < num_engines; n++)
>> +		reverse[n] = engines[num_engines - n - 1];
>> +	execbuf.rsvd1 = ctx->id;
>> +
>> +	sysfs = igt_sysfs_open(i915);
>> +	min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
>> +	max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
>> +	igt_require(max > min);
>> +
>> +	/* Only allow ourselves to upclock via waitboosting */
>> +	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>> +	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
>> +	igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
>> +
>> +	/* Warm up to bind the vma */
>> +	gem_execbuf(i915, &execbuf);
>> +
>> +	forward = __engine_order(i915, &obj[0], &execbuf,
>> +				 engines, engines, num_engines,
>> +				 &freq);
>> +	igt_info("Forwards: %.2fms @ %.3fMhz\n", forward * 1e-6, freq);
>> +
>> +	backward = __engine_order(i915, &obj[0], &execbuf,
>> +				  reverse, reverse, num_engines,
>> +				  &freq);
>> +	igt_info("Backwards: %.2fms @ %.3fMhz\n", backward * 1e-6, freq);
>> +
>> +	both = __engine_order(i915, &obj[0], &execbuf,
>> +			      engines, reverse, num_engines,
>> +			      &freq);
>> +	igt_info("Bidirectional: %.2fms @ %.3fMhz\n", both * 1e-6, freq);
>> +
>> +	gem_close(i915, obj[0].handle);
>> +	gem_close(i915, obj[1].handle);
>> +	intel_ctx_destroy(i915, ctx);
>> +
>> +	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>> +	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
>> +
>> +	igt_assert(4 * forward > 3 * backward && 4 * backward > 3 * forward);
>> +	igt_assert(4 * forward > 3 * both && 4 * both > 3 * forward);
>> +}
>> +
>>   static void pm_rps_exit_handler(int sig)
>>   {
>> -	if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
>> -		writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>> -		writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>> -	} else {
>> -		writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>> -		writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>> +	if (sysfs_files[MAX].filp) {
>> +		if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
>> +			writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>> +			writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>> +		} else {
>> +			writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>> +			writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>> +		}
>>   	}
>>
>>   	if (lh.igt_proc.running)
>> @@ -683,6 +924,14 @@ igt_main
>>   	igt_subtest("waitboost")
>>   		waitboost(drm_fd, false);
>>
>> +	igt_describe("Check if the order of fences does not affect waitboosting");
>> +	igt_subtest("fence-order")
>> +		fence_order(drm_fd);
>> +
>> +	igt_describe("Check if context reuse does not affect waitboosting");
>> +	igt_subtest("engine-order")
>> +		engine_order(drm_fd);
>> +
>>   	/* Test boost frequency after GPU reset */
>>   	igt_subtest("reset") {
>>   		igt_hang_t hang = igt_allow_hang(drm_fd, 0, 0);
>> --
>> 2.25.1

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [igt-dev] [PATCH i-g-t v4] i915/i915_pm_rps: Check impact of fence ordering on waitboosting
  2022-07-18  9:46   ` Karolina Drobnik
@ 2022-07-25  7:41     ` Karolina Drobnik
  2022-07-25 17:06     ` Kamil Konieczny
  1 sibling, 0 replies; 10+ messages in thread
From: Karolina Drobnik @ 2022-07-25  7:41 UTC (permalink / raw)
  To: Kamil Konieczny, Zbigniew Kempczyński; +Cc: igt-dev, Chris Wilson

Hi Kamil and Zbigniew,

Do you have any comments on the patch or my replies? If not, could 
either of you r-b this patch?

Many thanks,
Karolina

On 18.07.2022 11:46, Karolina Drobnik wrote:
> Hi,
> 
> On 15.07.2022 19:30, Kamil Konieczny wrote:
>> Hi,
>>
>> On 2022-07-13 at 12:53:42 +0200, Karolina Drobnik wrote:
>>> From: Chris Wilson <chris@chris-wilson.co.uk>
>>>
>>> Currently the wait boost heuristic is evaluated at the start of each
>>> fence wait for a series within dma-resv. There is no strict ordering of
>>> fences defined by dma-resv, and so it turns out that the same operation
>>> under different circumstances can result in different heuristics being
>>> applied, and dramatic performance variations in user applications.
>>>
>>> Link: https://gitlab.freedesktop.org/drm/intel/-/issues/6284
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> Signed-off-by: Karolina Drobnik <karolina.drobnik@intel.com>
>>> ---
>>> v4:
>>>    - Added test descriptions with igt_describe()
>>> v3:
>>>    - Removed unnecessary calls to gem_write
>>> v2:
>>>    - Introduced batch_create() with arbitration points to prevent hangs
>>>    - Added engine-order subtest
>>>    - Added reporting of GPU frequency
>>>    - Changed runtimes and switched prints from us to ms
>>>
>>>   tests/i915/i915_pm_rps.c | 261 ++++++++++++++++++++++++++++++++++++++-
>>>   1 file changed, 255 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/tests/i915/i915_pm_rps.c b/tests/i915/i915_pm_rps.c
>>> index b37f15c2..6125ea60 100644
>>> --- a/tests/i915/i915_pm_rps.c
>>> +++ b/tests/i915/i915_pm_rps.c
>>> @@ -37,8 +37,10 @@
>>>   #include <sys/wait.h>
>>>
>>>   #include "i915/gem.h"
>>> +#include "i915/gem_create.h"
>>>   #include "igt.h"
>>>   #include "igt_dummyload.h"
>>> +#include "igt_perf.h"
>>>   #include "igt_sysfs.h"
>>>
>>>   IGT_TEST_DESCRIPTION("Render P-States tests - verify GPU frequency 
>>> changes");
>>> @@ -614,14 +616,253 @@ static void waitboost(int fd, bool reset)
>>>       igt_assert_lt(post_freqs[CUR], post_freqs[MAX]);
>>>   }
>>>
>>> +static uint32_t batch_create(int i915, uint64_t sz)
>>> +{
>>> +    const uint32_t bbe = MI_BATCH_BUFFER_END;
>>> +    const uint32_t chk = 0x5 << 23;
>>> +    uint32_t handle = gem_create(i915, sz);
>>> +
>>> +    for (uint64_t pg = 4096; pg + 4096 < sz; pg += 4096)
>>> +        gem_write(i915, handle, pg, &chk, sizeof(chk));
>>> +
>>> +    gem_write(i915, handle, sz - sizeof(bbe), &bbe, sizeof(bbe));
>>> +
>>> +    return handle;
>>> +}
>>> +
>>> +static uint64_t __fence_order(int i915,
>>> +                  struct drm_i915_gem_exec_object2 *obj,
>>> +                  struct drm_i915_gem_execbuffer2 *eb,
>>> +                  uint64_t flags0, uint64_t flags1,
>>> +                  double *outf)
>>> +{
>>> +    uint64_t before[2], after[2];
>>> +    struct timespec tv;
>>> +    int fd;
>>> +
>>> +    gem_quiescent_gpu(i915);
>>> +    fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
>>> +
>>> +    igt_gettime(&tv);
>>> +
>>> +    obj->flags = flags0;
>>> +    gem_execbuf(i915, eb);
>>> +
>>> +    obj->flags = flags1;
>>> +    gem_execbuf(i915, eb);
>>> +
>>> +    read(fd, before, sizeof(before));
>>> +    gem_sync(i915, obj->handle);
>>> +    read(fd, after, sizeof(after));
>>
>> imho here is better place to read elapsed time.
> 
> We would like to measure time of the whole operation, so this is correct 
> as it is now
> 
>>> +    close(fd);
>>> +
>>> +    after[0] -= before[0];
>>> +    after[1] -= before[1];
>>> +
>>> +    *outf = 1e9 * after[0] / after[1];
>>> +    return igt_nsec_elapsed(&tv);
>>
>> See comment above.
>>
>>> +}
>>> +
>>> +static void fence_order(int i915)
>>> +{
>>> +    const uint64_t sz = 512ull << 20;
>>> +    struct drm_i915_gem_exec_object2 obj[2] = {
>>> +        { .handle = gem_create(i915, 4096) },
>>> +        { .handle = batch_create(i915, sz) },
>>> +    };
>>> +    struct drm_i915_gem_execbuffer2 execbuf = {
>>> +        .buffers_ptr = to_user_pointer(obj),
>>> +        .buffer_count = ARRAY_SIZE(obj),
>>> +    };
>>> +    uint64_t wr, rw;
>>> +    int min, max;
>>> +    double freq;
>>> +    int sysfs;
>>> +
>>> +    /*
>>> +     * Check the order of fences found during GEM_WAIT does not affect
>>> +     * waitboosting.
>>> +     *
>>> +     * Internally, implicit fences are tracked within a dma-resv which
>>> +     * imposes no order on the individually fences tracked within. 
>>> Since
>>> +     * there is no defined order, the sequence of waits (and the 
>>> associated
>>> +     * waitboosts) is also undefined, undermining the consistency of 
>>> the
>>> +     * waitboost heuristic.
>>> +     *
>>> +     * In particular, we can influence the sequence of fence storage
>>> +     * within dma-resv by mixing read/write semantics for implicit 
>>> fences.
>>> +     * We can exploit this property of dma-resv to exercise that no 
>>> matter
>>> +     * the stored order, the heuristic is applied consistently for the
>>> +     * user's GEM_WAIT ioctl.
>>> +     */
>>> +
>>> +    sysfs = igt_sysfs_open(i915);
>>> +    min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
>>> +    max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
>>> +    igt_require(max > min);
>>> +
>>> +    /* Only allow ourselves to upclock via waitboosting */
>>> +    igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>>> +    igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
>>> +    igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
>>> +
>>> +    /* Warm up to bind the vma */
>>> +    __fence_order(i915, &obj[0], &execbuf, 0, 0, &freq);
>>> +
>>> +    wr = __fence_order(i915, &obj[0], &execbuf, EXEC_OBJECT_WRITE, 
>>> 0, &freq);
>>> +    igt_info("Write-then-read: %.2fms @ %.3fMHz\n", wr * 1e-6, freq);
>>
>> imho this should be igt_debug (here and below)
>> s/igt_print/igt_debug/
> 
> This information is essential in verifying the test results, so igt_info 
> is required
> 
> All the best,
> Karolina
> 
>> Regards,
>> Kamil
>>
>>> +
>>> +    rw = __fence_order(i915, &obj[0], &execbuf, 0, 
>>> EXEC_OBJECT_WRITE, &freq);
>>> +    igt_info("Read-then-write: %.2fms @ %.3fMHz\n", rw * 1e-6, freq);
>>> +
>>> +    gem_close(i915, obj[0].handle);
>>> +    gem_close(i915, obj[1].handle);
>>> +
>>> +    igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>>> +    igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
>>> +
>>> +    igt_assert(4 * rw > 3 * wr && 4 * wr > 3 * rw);
>>> +}
>>> +
>>> +static uint64_t __engine_order(int i915,
>>> +                   struct drm_i915_gem_exec_object2 *obj,
>>> +                   struct drm_i915_gem_execbuffer2 *eb,
>>> +                   unsigned int *engines0,
>>> +                   unsigned int *engines1,
>>> +                   unsigned int num_engines,
>>> +                   double *outf)
>>> +{
>>> +    uint64_t before[2], after[2];
>>> +    struct timespec tv;
>>> +    int fd;
>>> +
>>> +    gem_quiescent_gpu(i915);
>>> +    fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
>>> +
>>> +    igt_gettime(&tv);
>>> +
>>> +    obj->flags = EXEC_OBJECT_WRITE;
>>> +    for (unsigned int n = 0; n < num_engines; n++) {
>>> +        eb->flags &= ~63ull;
>>> +        eb->flags |= engines0[n];
>>> +        gem_execbuf_wr(i915, eb);
>>> +    }
>>> +
>>> +    obj->flags = 0;
>>> +    for (unsigned int n = 0; n < num_engines; n++) {
>>> +        eb->flags &= ~63ull;
>>> +        eb->flags |= engines1[n];
>>> +        gem_execbuf(i915, eb);
>>> +    }
>>> +
>>> +    read(fd, before, sizeof(before));
>>> +    gem_sync(i915, obj->handle);
>>> +    read(fd, after, sizeof(after));
>>> +    close(fd);
>>> +
>>> +    after[0] -= before[0];
>>> +    after[1] -= before[1];
>>> +
>>> +    *outf = 1e9 * after[0] / after[1];
>>> +    return igt_nsec_elapsed(&tv);
>>> +}
>>> +
>>> +static void engine_order(int i915)
>>> +{
>>> +    const uint64_t sz = 512ull << 20;
>>> +    struct drm_i915_gem_exec_object2 obj[2] = {
>>> +        { .handle = gem_create(i915, 4096) },
>>> +        { .handle = batch_create(i915, sz) },
>>> +    };
>>> +    struct drm_i915_gem_execbuffer2 execbuf = {
>>> +        .buffers_ptr = to_user_pointer(obj),
>>> +        .buffer_count = ARRAY_SIZE(obj),
>>> +    };
>>> +    const struct intel_execution_engine2 *e;
>>> +    unsigned int engines[2], reverse[2];
>>> +    uint64_t forward, backward, both;
>>> +    unsigned int num_engines;
>>> +    const intel_ctx_t *ctx;
>>> +    int min, max;
>>> +    double freq;
>>> +    int sysfs;
>>> +
>>> +    /*
>>> +     * Check the order of fences found during GEM_WAIT does not affect
>>> +     * waitboosting. (See fence_order())
>>> +     *
>>> +     * Another way we can manipulate the order of fences within the
>>> +     * dma-resv is through repeated use of the same contexts.
>>> +     */
>>> +
>>> +    num_engines = 0;
>>> +    ctx = intel_ctx_create_all_physical(i915);
>>> +    for_each_ctx_engine(i915, ctx, e) {
>>> +        /*
>>> +         * Avoid using the cmdparser as it will try to allocate
>>> +         * a new shadow batch for each submission -> oom
>>> +         */
>>> +        if (!gem_engine_has_mutable_submission(i915, e->class))
>>> +            continue;
>>> +
>>> +        engines[num_engines++] = e->flags;
>>> +        if (num_engines == ARRAY_SIZE(engines))
>>> +            break;
>>> +    }
>>> +    igt_require(num_engines > 1);
>>> +    for (unsigned int n = 0; n < num_engines; n++)
>>> +        reverse[n] = engines[num_engines - n - 1];
>>> +    execbuf.rsvd1 = ctx->id;
>>> +
>>> +    sysfs = igt_sysfs_open(i915);
>>> +    min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
>>> +    max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
>>> +    igt_require(max > min);
>>> +
>>> +    /* Only allow ourselves to upclock via waitboosting */
>>> +    igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>>> +    igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
>>> +    igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
>>> +
>>> +    /* Warm up to bind the vma */
>>> +    gem_execbuf(i915, &execbuf);
>>> +
>>> +    forward = __engine_order(i915, &obj[0], &execbuf,
>>> +                 engines, engines, num_engines,
>>> +                 &freq);
>>> +    igt_info("Forwards: %.2fms @ %.3fMhz\n", forward * 1e-6, freq);
>>> +
>>> +    backward = __engine_order(i915, &obj[0], &execbuf,
>>> +                  reverse, reverse, num_engines,
>>> +                  &freq);
>>> +    igt_info("Backwards: %.2fms @ %.3fMhz\n", backward * 1e-6, freq);
>>> +
>>> +    both = __engine_order(i915, &obj[0], &execbuf,
>>> +                  engines, reverse, num_engines,
>>> +                  &freq);
>>> +    igt_info("Bidirectional: %.2fms @ %.3fMhz\n", both * 1e-6, freq);
>>> +
>>> +    gem_close(i915, obj[0].handle);
>>> +    gem_close(i915, obj[1].handle);
>>> +    intel_ctx_destroy(i915, ctx);
>>> +
>>> +    igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>>> +    igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
>>> +
>>> +    igt_assert(4 * forward > 3 * backward && 4 * backward > 3 * 
>>> forward);
>>> +    igt_assert(4 * forward > 3 * both && 4 * both > 3 * forward);
>>> +}
>>> +
>>>   static void pm_rps_exit_handler(int sig)
>>>   {
>>> -    if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
>>> -        writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>>> -        writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>>> -    } else {
>>> -        writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>>> -        writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>>> +    if (sysfs_files[MAX].filp) {
>>> +        if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
>>> +            writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>>> +            writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>>> +        } else {
>>> +            writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>>> +            writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>>> +        }
>>>       }
>>>
>>>       if (lh.igt_proc.running)
>>> @@ -683,6 +924,14 @@ igt_main
>>>       igt_subtest("waitboost")
>>>           waitboost(drm_fd, false);
>>>
>>> +    igt_describe("Check if the order of fences does not affect 
>>> waitboosting");
>>> +    igt_subtest("fence-order")
>>> +        fence_order(drm_fd);
>>> +
>>> +    igt_describe("Check if context reuse does not affect 
>>> waitboosting");
>>> +    igt_subtest("engine-order")
>>> +        engine_order(drm_fd);
>>> +
>>>       /* Test boost frequency after GPU reset */
>>>       igt_subtest("reset") {
>>>           igt_hang_t hang = igt_allow_hang(drm_fd, 0, 0);
>>> -- 
>>> 2.25.1

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [igt-dev] [PATCH i-g-t v4] i915/i915_pm_rps: Check impact of fence ordering on waitboosting
  2022-07-18  9:46   ` Karolina Drobnik
  2022-07-25  7:41     ` Karolina Drobnik
@ 2022-07-25 17:06     ` Kamil Konieczny
  1 sibling, 0 replies; 10+ messages in thread
From: Kamil Konieczny @ 2022-07-25 17:06 UTC (permalink / raw)
  To: igt-dev; +Cc: Chris Wilson

Hi,

On 2022-07-18 at 11:46:04 +0200, Karolina Drobnik wrote:
> Hi,
> 
> On 15.07.2022 19:30, Kamil Konieczny wrote:
> > Hi,
> > 
> > On 2022-07-13 at 12:53:42 +0200, Karolina Drobnik wrote:
> > > From: Chris Wilson <chris@chris-wilson.co.uk>
> > > 
> > > Currently the wait boost heuristic is evaluated at the start of each
> > > fence wait for a series within dma-resv. There is no strict ordering of
> > > fences defined by dma-resv, and so it turns out that the same operation
> > > under different circumstances can result in different heuristics being
> > > applied, and dramatic performance variations in user applications.
> > > 
> > > Link: https://gitlab.freedesktop.org/drm/intel/-/issues/6284
> > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > Signed-off-by: Karolina Drobnik <karolina.drobnik@intel.com>
> > > ---
> > > v4:
> > >    - Added test descriptions with igt_describe()
> > > v3:
> > >    - Removed unnecessary calls to gem_write
> > > v2:
> > >    - Introduced batch_create() with arbitration points to prevent hangs
> > >    - Added engine-order subtest
> > >    - Added reporting of GPU frequency
> > >    - Changed runtimes and switched prints from us to ms
> > > 
> > >   tests/i915/i915_pm_rps.c | 261 ++++++++++++++++++++++++++++++++++++++-
> > >   1 file changed, 255 insertions(+), 6 deletions(-)
> > > 
> > > diff --git a/tests/i915/i915_pm_rps.c b/tests/i915/i915_pm_rps.c
> > > index b37f15c2..6125ea60 100644
> > > --- a/tests/i915/i915_pm_rps.c
> > > +++ b/tests/i915/i915_pm_rps.c
> > > @@ -37,8 +37,10 @@
> > >   #include <sys/wait.h>
> > > 
> > >   #include "i915/gem.h"
> > > +#include "i915/gem_create.h"
> > >   #include "igt.h"
> > >   #include "igt_dummyload.h"
> > > +#include "igt_perf.h"
> > >   #include "igt_sysfs.h"
> > > 
> > >   IGT_TEST_DESCRIPTION("Render P-States tests - verify GPU frequency changes");
> > > @@ -614,14 +616,253 @@ static void waitboost(int fd, bool reset)
> > >   	igt_assert_lt(post_freqs[CUR], post_freqs[MAX]);
> > >   }
> > > 
> > > +static uint32_t batch_create(int i915, uint64_t sz)
> > > +{
> > > +	const uint32_t bbe = MI_BATCH_BUFFER_END;
> > > +	const uint32_t chk = 0x5 << 23;
> > > +	uint32_t handle = gem_create(i915, sz);
> > > +
> > > +	for (uint64_t pg = 4096; pg + 4096 < sz; pg += 4096)
> > > +		gem_write(i915, handle, pg, &chk, sizeof(chk));
> > > +
> > > +	gem_write(i915, handle, sz - sizeof(bbe), &bbe, sizeof(bbe));
> > > +
> > > +	return handle;
> > > +}
> > > +
> > > +static uint64_t __fence_order(int i915,
> > > +			      struct drm_i915_gem_exec_object2 *obj,
> > > +			      struct drm_i915_gem_execbuffer2 *eb,
> > > +			      uint64_t flags0, uint64_t flags1,
> > > +			      double *outf)
> > > +{
> > > +	uint64_t before[2], after[2];
> > > +	struct timespec tv;
> > > +	int fd;
> > > +
> > > +	gem_quiescent_gpu(i915);
> > > +	fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
> > > +
> > > +	igt_gettime(&tv);
> > > +
> > > +	obj->flags = flags0;
> > > +	gem_execbuf(i915, eb);
> > > +
> > > +	obj->flags = flags1;
> > > +	gem_execbuf(i915, eb);
> > > +
> > > +	read(fd, before, sizeof(before));
> > > +	gem_sync(i915, obj->handle);
> > > +	read(fd, after, sizeof(after));
> > 
> > imho here is better place to read elapsed time.
> 
> We would like to measure time of the whole operation, so this is correct as
> it is now

ok.

> > > +	close(fd);
> > > +
> > > +	after[0] -= before[0];
> > > +	after[1] -= before[1];
> > > +
> > > +	*outf = 1e9 * after[0] / after[1];
> > > +	return igt_nsec_elapsed(&tv);
> > 
> > See comment above.
> > 
> > > +}
> > > +
> > > +static void fence_order(int i915)
> > > +{
> > > +	const uint64_t sz = 512ull << 20;
> > > +	struct drm_i915_gem_exec_object2 obj[2] = {
> > > +		{ .handle = gem_create(i915, 4096) },
> > > +		{ .handle = batch_create(i915, sz) },
> > > +	};
> > > +	struct drm_i915_gem_execbuffer2 execbuf = {
> > > +		.buffers_ptr = to_user_pointer(obj),
> > > +		.buffer_count = ARRAY_SIZE(obj),
> > > +	};
> > > +	uint64_t wr, rw;
> > > +	int min, max;
> > > +	double freq;
> > > +	int sysfs;
> > > +
> > > +	/*
> > > +	 * Check the order of fences found during GEM_WAIT does not affect
> > > +	 * waitboosting.
> > > +	 *
> > > +	 * Internally, implicit fences are tracked within a dma-resv which
> > > +	 * imposes no order on the individually fences tracked within. Since
> > > +	 * there is no defined order, the sequence of waits (and the associated
> > > +	 * waitboosts) is also undefined, undermining the consistency of the
> > > +	 * waitboost heuristic.
> > > +	 *
> > > +	 * In particular, we can influence the sequence of fence storage
> > > +	 * within dma-resv by mixing read/write semantics for implicit fences.
> > > +	 * We can exploit this property of dma-resv to exercise that no matter
> > > +	 * the stored order, the heuristic is applied consistently for the
> > > +	 * user's GEM_WAIT ioctl.
> > > +	 */
> > > +
> > > +	sysfs = igt_sysfs_open(i915);
> > > +	min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
> > > +	max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
> > > +	igt_require(max > min);
> > > +
> > > +	/* Only allow ourselves to upclock via waitboosting */
> > > +	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> > > +	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
> > > +	igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
> > > +
> > > +	/* Warm up to bind the vma */
> > > +	__fence_order(i915, &obj[0], &execbuf, 0, 0, &freq);
> > > +
> > > +	wr = __fence_order(i915, &obj[0], &execbuf, EXEC_OBJECT_WRITE, 0, &freq);
> > > +	igt_info("Write-then-read: %.2fms @ %.3fMHz\n", wr * 1e-6, freq);
> > 
> > imho this should be igt_debug (here and below)
> > s/igt_print/igt_debug/
> 
> This information is essential in verifying the test results, so igt_info is
> required
> 
> All the best,
> Karolina

Well, verifying is done by igt_asserts; do we intend to look into
logs for verifying? If an assert fails, then the previous debug
messages will be printed. While at it, maybe it is also worth
printing out the min and max freqs.

Regards,
Kamil

> > > +
> > > +	rw = __fence_order(i915, &obj[0], &execbuf, 0, EXEC_OBJECT_WRITE, &freq);
> > > +	igt_info("Read-then-write: %.2fms @ %.3fMHz\n", rw * 1e-6, freq);
> > > +
> > > +	gem_close(i915, obj[0].handle);
> > > +	gem_close(i915, obj[1].handle);
> > > +
> > > +	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> > > +	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
> > > +
> > > +	igt_assert(4 * rw > 3 * wr && 4 * wr > 3 * rw);
> > > +}
> > > +
> > > +static uint64_t __engine_order(int i915,
> > > +			       struct drm_i915_gem_exec_object2 *obj,
> > > +			       struct drm_i915_gem_execbuffer2 *eb,
> > > +			       unsigned int *engines0,
> > > +			       unsigned int *engines1,
> > > +			       unsigned int num_engines,
> > > +			       double *outf)
> > > +{
> > > +	uint64_t before[2], after[2];
> > > +	struct timespec tv;
> > > +	int fd;
> > > +
> > > +	gem_quiescent_gpu(i915);
> > > +	fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
> > > +
> > > +	igt_gettime(&tv);
> > > +
> > > +	obj->flags = EXEC_OBJECT_WRITE;
> > > +	for (unsigned int n = 0; n < num_engines; n++) {
> > > +		eb->flags &= ~63ull;
> > > +		eb->flags |= engines0[n];
> > > +		gem_execbuf_wr(i915, eb);
> > > +	}
> > > +
> > > +	obj->flags = 0;
> > > +	for (unsigned int n = 0; n < num_engines; n++) {
> > > +		eb->flags &= ~63ull;
> > > +		eb->flags |= engines1[n];
> > > +		gem_execbuf(i915, eb);
> > > +	}
> > > +
> > > +	read(fd, before, sizeof(before));
> > > +	gem_sync(i915, obj->handle);
> > > +	read(fd, after, sizeof(after));
> > > +	close(fd);
> > > +
> > > +	after[0] -= before[0];
> > > +	after[1] -= before[1];
> > > +
> > > +	*outf = 1e9 * after[0] / after[1];
> > > +	return igt_nsec_elapsed(&tv);
> > > +}
> > > +
> > > +static void engine_order(int i915)
> > > +{
> > > +	const uint64_t sz = 512ull << 20;
> > > +	struct drm_i915_gem_exec_object2 obj[2] = {
> > > +		{ .handle = gem_create(i915, 4096) },
> > > +		{ .handle = batch_create(i915, sz) },
> > > +	};
> > > +	struct drm_i915_gem_execbuffer2 execbuf = {
> > > +		.buffers_ptr = to_user_pointer(obj),
> > > +		.buffer_count = ARRAY_SIZE(obj),
> > > +	};
> > > +	const struct intel_execution_engine2 *e;
> > > +	unsigned int engines[2], reverse[2];
> > > +	uint64_t forward, backward, both;
> > > +	unsigned int num_engines;
> > > +	const intel_ctx_t *ctx;
> > > +	int min, max;
> > > +	double freq;
> > > +	int sysfs;
> > > +
> > > +	/*
> > > +	 * Check the order of fences found during GEM_WAIT does not affect
> > > +	 * waitboosting. (See fence_order())
> > > +	 *
> > > +	 * Another way we can manipulate the order of fences within the
> > > +	 * dma-resv is through repeated use of the same contexts.
> > > +	 */
> > > +
> > > +	num_engines = 0;
> > > +	ctx = intel_ctx_create_all_physical(i915);
> > > +	for_each_ctx_engine(i915, ctx, e) {
> > > +		/*
> > > +		 * Avoid using the cmdparser as it will try to allocate
> > > +		 * a new shadow batch for each submission -> oom
> > > +		 */
> > > +		if (!gem_engine_has_mutable_submission(i915, e->class))
> > > +			continue;
> > > +
> > > +		engines[num_engines++] = e->flags;
> > > +		if (num_engines == ARRAY_SIZE(engines))
> > > +			break;
> > > +	}
> > > +	igt_require(num_engines > 1);
> > > +	for (unsigned int n = 0; n < num_engines; n++)
> > > +		reverse[n] = engines[num_engines - n - 1];
> > > +	execbuf.rsvd1 = ctx->id;
> > > +
> > > +	sysfs = igt_sysfs_open(i915);
> > > +	min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
> > > +	max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
> > > +	igt_require(max > min);
> > > +
> > > +	/* Only allow ourselves to upclock via waitboosting */
> > > +	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> > > +	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
> > > +	igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
> > > +
> > > +	/* Warm up to bind the vma */
> > > +	gem_execbuf(i915, &execbuf);
> > > +
> > > +	forward = __engine_order(i915, &obj[0], &execbuf,
> > > +				 engines, engines, num_engines,
> > > +				 &freq);
> > > +	igt_info("Forwards: %.2fms @ %.3fMhz\n", forward * 1e-6, freq);
> > > +
> > > +	backward = __engine_order(i915, &obj[0], &execbuf,
> > > +				  reverse, reverse, num_engines,
> > > +				  &freq);
> > > +	igt_info("Backwards: %.2fms @ %.3fMhz\n", backward * 1e-6, freq);
> > > +
> > > +	both = __engine_order(i915, &obj[0], &execbuf,
> > > +			      engines, reverse, num_engines,
> > > +			      &freq);
> > > +	igt_info("Bidirectional: %.2fms @ %.3fMhz\n", both * 1e-6, freq);
> > > +
> > > +	gem_close(i915, obj[0].handle);
> > > +	gem_close(i915, obj[1].handle);
> > > +	intel_ctx_destroy(i915, ctx);
> > > +
> > > +	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> > > +	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
> > > +
> > > +	igt_assert(4 * forward > 3 * backward && 4 * backward > 3 * forward);
> > > +	igt_assert(4 * forward > 3 * both && 4 * both > 3 * forward);
> > > +}
> > > +
> > >   static void pm_rps_exit_handler(int sig)
> > >   {
> > > -	if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
> > > -		writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
> > > -		writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
> > > -	} else {
> > > -		writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
> > > -		writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
> > > +	if (sysfs_files[MAX].filp) {
> > > +		if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
> > > +			writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
> > > +			writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
> > > +		} else {
> > > +			writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
> > > +			writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
> > > +		}
> > >   	}
> > > 
> > >   	if (lh.igt_proc.running)
> > > @@ -683,6 +924,14 @@ igt_main
> > >   	igt_subtest("waitboost")
> > >   		waitboost(drm_fd, false);
> > > 
> > > +	igt_describe("Check if the order of fences does not affect waitboosting");
> > > +	igt_subtest("fence-order")
> > > +		fence_order(drm_fd);
> > > +
> > > +	igt_describe("Check if context reuse does not affect waitboosting");
> > > +	igt_subtest("engine-order")
> > > +		engine_order(drm_fd);
> > > +
> > >   	/* Test boost frequency after GPU reset */
> > >   	igt_subtest("reset") {
> > >   		igt_hang_t hang = igt_allow_hang(drm_fd, 0, 0);
> > > --
> > > 2.25.1

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [igt-dev] [PATCH i-g-t v4] i915/i915_pm_rps: Check impact of fence ordering on waitboosting
  2022-07-13 10:53 [igt-dev] [PATCH i-g-t v4] i915/i915_pm_rps: Check impact of fence ordering on waitboosting Karolina Drobnik
  2022-07-15 17:30 ` Kamil Konieczny
@ 2022-07-26 10:49 ` Kamil Konieczny
  2022-07-27 12:27   ` Karolina Drobnik
  2022-07-26 14:18 ` Kamil Konieczny
  2 siblings, 1 reply; 10+ messages in thread
From: Kamil Konieczny @ 2022-07-26 10:49 UTC (permalink / raw)
  To: igt-dev; +Cc: Chris Wilson

Hi Karolina,

few more nits, see below.

On 2022-07-13 at 12:53:42 +0200, Karolina Drobnik wrote:
> From: Chris Wilson <chris@chris-wilson.co.uk>
> 
> Currently the wait boost heuristic is evaluated at the start of each
> fence wait for a series within dma-resv. There is no strict ordering of
> fences defined by dma-resv, and so it turns out that the same operation
> under different circumstances can result in different heuristics being
> applied, and dramatic performance variations in user applications.
> 
> Link: https://gitlab.freedesktop.org/drm/intel/-/issues/6284
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Karolina Drobnik <karolina.drobnik@intel.com>
> ---
> v4:
>   - Added test descriptions with igt_describe()
> v3:
>   - Removed unnecessary calls to gem_write
> v2:
>   - Introduced batch_create() with arbitration points to prevent hangs
>   - Added engine-order subtest
>   - Added reporting of GPU frequency
>   - Changed runtimes and switched prints from us to ms
> 
>  tests/i915/i915_pm_rps.c | 261 ++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 255 insertions(+), 6 deletions(-)
> 
> diff --git a/tests/i915/i915_pm_rps.c b/tests/i915/i915_pm_rps.c
> index b37f15c2..6125ea60 100644
> --- a/tests/i915/i915_pm_rps.c
> +++ b/tests/i915/i915_pm_rps.c
> @@ -37,8 +37,10 @@
>  #include <sys/wait.h>
> 
>  #include "i915/gem.h"
> +#include "i915/gem_create.h"
>  #include "igt.h"
>  #include "igt_dummyload.h"
> +#include "igt_perf.h"
>  #include "igt_sysfs.h"
> 
>  IGT_TEST_DESCRIPTION("Render P-States tests - verify GPU frequency changes");
> @@ -614,14 +616,253 @@ static void waitboost(int fd, bool reset)
>  	igt_assert_lt(post_freqs[CUR], post_freqs[MAX]);
>  }
> 
> +static uint32_t batch_create(int i915, uint64_t sz)
> +{
> +	const uint32_t bbe = MI_BATCH_BUFFER_END;
> +	const uint32_t chk = 0x5 << 23;
> +	uint32_t handle = gem_create(i915, sz);
> +
> +	for (uint64_t pg = 4096; pg + 4096 < sz; pg += 4096)
> +		gem_write(i915, handle, pg, &chk, sizeof(chk));
> +
> +	gem_write(i915, handle, sz - sizeof(bbe), &bbe, sizeof(bbe));
> +
> +	return handle;
> +}
> +
> +static uint64_t __fence_order(int i915,
> +			      struct drm_i915_gem_exec_object2 *obj,
> +			      struct drm_i915_gem_execbuffer2 *eb,
> +			      uint64_t flags0, uint64_t flags1,
> +			      double *outf)
> +{
> +	uint64_t before[2], after[2];
> +	struct timespec tv;
> +	int fd;
> +
> +	gem_quiescent_gpu(i915);
> +	fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
> +
> +	igt_gettime(&tv);
> +
> +	obj->flags = flags0;
> +	gem_execbuf(i915, eb);
> +
> +	obj->flags = flags1;
> +	gem_execbuf(i915, eb);
> +
> +	read(fd, before, sizeof(before));
> +	gem_sync(i915, obj->handle);
> +	read(fd, after, sizeof(after));
> +	close(fd);
> +
> +	after[0] -= before[0];
> +	after[1] -= before[1];

Put igt_assert here (check for zero before divide).

> +
> +	*outf = 1e9 * after[0] / after[1];
> +	return igt_nsec_elapsed(&tv);
> +}
> +
> +static void fence_order(int i915)
> +{
> +	const uint64_t sz = 512ull << 20;
> +	struct drm_i915_gem_exec_object2 obj[2] = {
> +		{ .handle = gem_create(i915, 4096) },
> +		{ .handle = batch_create(i915, sz) },
> +	};
> +	struct drm_i915_gem_execbuffer2 execbuf = {
> +		.buffers_ptr = to_user_pointer(obj),
> +		.buffer_count = ARRAY_SIZE(obj),
> +	};
> +	uint64_t wr, rw;
> +	int min, max;
> +	double freq;
> +	int sysfs;
> +
> +	/*
> +	 * Check the order of fences found during GEM_WAIT does not affect
> +	 * waitboosting.
> +	 *
> +	 * Internally, implicit fences are tracked within a dma-resv which
> +	 * imposes no order on the individually fences tracked within. Since
> +	 * there is no defined order, the sequence of waits (and the associated
> +	 * waitboosts) is also undefined, undermining the consistency of the
> +	 * waitboost heuristic.
> +	 *
> +	 * In particular, we can influence the sequence of fence storage
> +	 * within dma-resv by mixing read/write semantics for implicit fences.
> +	 * We can exploit this property of dma-resv to exercise that no matter
> +	 * the stored order, the heuristic is applied consistently for the
> +	 * user's GEM_WAIT ioctl.
> +	 */
> +
> +	sysfs = igt_sysfs_open(i915);
> +	min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
> +	max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
> +	igt_require(max > min);
> +
> +	/* Only allow ourselves to upclock via waitboosting */
> +	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> +	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
> +	igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
> +
> +	/* Warm up to bind the vma */
> +	__fence_order(i915, &obj[0], &execbuf, 0, 0, &freq);
> +
> +	wr = __fence_order(i915, &obj[0], &execbuf, EXEC_OBJECT_WRITE, 0, &freq);
> +	igt_info("Write-then-read: %.2fms @ %.3fMHz\n", wr * 1e-6, freq);
> +
> +	rw = __fence_order(i915, &obj[0], &execbuf, 0, EXEC_OBJECT_WRITE, &freq);
> +	igt_info("Read-then-write: %.2fms @ %.3fMHz\n", rw * 1e-6, freq);
> +
> +	gem_close(i915, obj[0].handle);
> +	gem_close(i915, obj[1].handle);

Close sysfs here.

> +
> +	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> +	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);

Here we should restore the previous values, not those read from RPx,
but if they are always the same it is ok.

> +
> +	igt_assert(4 * rw > 3 * wr && 4 * wr > 3 * rw);
> +}
> +
> +static uint64_t __engine_order(int i915,
> +			       struct drm_i915_gem_exec_object2 *obj,
> +			       struct drm_i915_gem_execbuffer2 *eb,
> +			       unsigned int *engines0,
> +			       unsigned int *engines1,
> +			       unsigned int num_engines,
> +			       double *outf)
> +{
> +	uint64_t before[2], after[2];
> +	struct timespec tv;
> +	int fd;
> +
> +	gem_quiescent_gpu(i915);
> +	fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
> +
> +	igt_gettime(&tv);
> +
> +	obj->flags = EXEC_OBJECT_WRITE;
> +	for (unsigned int n = 0; n < num_engines; n++) {
> +		eb->flags &= ~63ull;
> +		eb->flags |= engines0[n];
> +		gem_execbuf_wr(i915, eb);
> +	}
> +
> +	obj->flags = 0;
> +	for (unsigned int n = 0; n < num_engines; n++) {
> +		eb->flags &= ~63ull;
> +		eb->flags |= engines1[n];
> +		gem_execbuf(i915, eb);
> +	}
> +
> +	read(fd, before, sizeof(before));
> +	gem_sync(i915, obj->handle);
> +	read(fd, after, sizeof(after));
> +	close(fd);
> +
> +	after[0] -= before[0];
> +	after[1] -= before[1];

Put igt_assert here (check for zero before divide).

> +
> +	*outf = 1e9 * after[0] / after[1];
> +	return igt_nsec_elapsed(&tv);
> +}
> +
> +static void engine_order(int i915)
> +{
> +	const uint64_t sz = 512ull << 20;
> +	struct drm_i915_gem_exec_object2 obj[2] = {
> +		{ .handle = gem_create(i915, 4096) },
> +		{ .handle = batch_create(i915, sz) },
> +	};
> +	struct drm_i915_gem_execbuffer2 execbuf = {
> +		.buffers_ptr = to_user_pointer(obj),
> +		.buffer_count = ARRAY_SIZE(obj),
> +	};
> +	const struct intel_execution_engine2 *e;
> +	unsigned int engines[2], reverse[2];
> +	uint64_t forward, backward, both;
> +	unsigned int num_engines;
> +	const intel_ctx_t *ctx;
> +	int min, max;
> +	double freq;
> +	int sysfs;
> +
> +	/*
> +	 * Check the order of fences found during GEM_WAIT does not affect
> +	 * waitboosting. (See fence_order())
> +	 *
> +	 * Another way we can manipulate the order of fences within the
> +	 * dma-resv is through repeated use of the same contexts.
> +	 */
> +
> +	num_engines = 0;
> +	ctx = intel_ctx_create_all_physical(i915);
> +	for_each_ctx_engine(i915, ctx, e) {
> +		/*
> +		 * Avoid using the cmdparser as it will try to allocate
> +		 * a new shadow batch for each submission -> oom
> +		 */
> +		if (!gem_engine_has_mutable_submission(i915, e->class))
> +			continue;
> +
> +		engines[num_engines++] = e->flags;
> +		if (num_engines == ARRAY_SIZE(engines))
> +			break;
> +	}
> +	igt_require(num_engines > 1);
> +	for (unsigned int n = 0; n < num_engines; n++)
> +		reverse[n] = engines[num_engines - n - 1];
> +	execbuf.rsvd1 = ctx->id;
> +
> +	sysfs = igt_sysfs_open(i915);
> +	min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
> +	max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
> +	igt_require(max > min);
> +
> +	/* Only allow ourselves to upclock via waitboosting */
> +	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> +	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
> +	igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
> +
> +	/* Warm up to bind the vma */
> +	gem_execbuf(i915, &execbuf);
> +
> +	forward = __engine_order(i915, &obj[0], &execbuf,
> +				 engines, engines, num_engines,
> +				 &freq);
> +	igt_info("Forwards: %.2fms @ %.3fMhz\n", forward * 1e-6, freq);
> +
> +	backward = __engine_order(i915, &obj[0], &execbuf,
> +				  reverse, reverse, num_engines,
> +				  &freq);
> +	igt_info("Backwards: %.2fms @ %.3fMhz\n", backward * 1e-6, freq);
> +
> +	both = __engine_order(i915, &obj[0], &execbuf,
> +			      engines, reverse, num_engines,
> +			      &freq);
> +	igt_info("Bidirectional: %.2fms @ %.3fMhz\n", both * 1e-6, freq);
> +
> +	gem_close(i915, obj[0].handle);
> +	gem_close(i915, obj[1].handle);

Close sysfs here.

> +	intel_ctx_destroy(i915, ctx);
> +
> +	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> +	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
> +
> +	igt_assert(4 * forward > 3 * backward && 4 * backward > 3 * forward);
> +	igt_assert(4 * forward > 3 * both && 4 * both > 3 * forward);
> +}
> +
>  static void pm_rps_exit_handler(int sig)
>  {
> -	if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
> -		writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
> -		writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
> -	} else {
> -		writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
> -		writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
> +	if (sysfs_files[MAX].filp) {
> +		if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
> +			writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
> +			writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
> +		} else {
> +			writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
> +			writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
> +		}

This change looks unrelated. Please split this out as a separate patch
and post it as a fix.

Regards,
Kamil

>  	}
> 
>  	if (lh.igt_proc.running)
> @@ -683,6 +924,14 @@ igt_main
>  	igt_subtest("waitboost")
>  		waitboost(drm_fd, false);
> 
> +	igt_describe("Check if the order of fences does not affect waitboosting");
> +	igt_subtest("fence-order")
> +		fence_order(drm_fd);
> +
> +	igt_describe("Check if context reuse does not affect waitboosting");
> +	igt_subtest("engine-order")
> +		engine_order(drm_fd);
> +
>  	/* Test boost frequency after GPU reset */
>  	igt_subtest("reset") {
>  		igt_hang_t hang = igt_allow_hang(drm_fd, 0, 0);
> --
> 2.25.1

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [igt-dev] [PATCH i-g-t v4] i915/i915_pm_rps: Check impact of fence ordering on waitboosting
  2022-07-13 10:53 [igt-dev] [PATCH i-g-t v4] i915/i915_pm_rps: Check impact of fence ordering on waitboosting Karolina Drobnik
  2022-07-15 17:30 ` Kamil Konieczny
  2022-07-26 10:49 ` Kamil Konieczny
@ 2022-07-26 14:18 ` Kamil Konieczny
  2 siblings, 0 replies; 10+ messages in thread
From: Kamil Konieczny @ 2022-07-26 14:18 UTC (permalink / raw)
  To: igt-dev; +Cc: Chris Wilson

Hi Karolina,

On 2022-07-13 at 12:53:42 +0200, Karolina Drobnik wrote:
> From: Chris Wilson <chris@chris-wilson.co.uk>
> 
> Currently the wait boost heuristic is evaluated at the start of each
> fence wait for a series within dma-resv. There is no strict ordering of
> fences defined by dma-resv, and so it turns out that the same operation
> under different circumstances can result in different heuristics being
> applied, and dramatic performance variations in user applications.
> 
> Link: https://gitlab.freedesktop.org/drm/intel/-/issues/6284
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Karolina Drobnik <karolina.drobnik@intel.com>
> ---
> v4:
>   - Added test descriptions with igt_describe()
> v3:
>   - Removed unnecessary calls to gem_write
> v2:
>   - Introduced batch_create() with arbitration points to prevent hangs
>   - Added engine-order subtest
>   - Added reporting of GPU frequency
>   - Changed runtimes and switched prints from us to ms
> 
>  tests/i915/i915_pm_rps.c | 261 ++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 255 insertions(+), 6 deletions(-)
> 
> diff --git a/tests/i915/i915_pm_rps.c b/tests/i915/i915_pm_rps.c
> index b37f15c2..6125ea60 100644
> --- a/tests/i915/i915_pm_rps.c
> +++ b/tests/i915/i915_pm_rps.c
> @@ -37,8 +37,10 @@
>  #include <sys/wait.h>
> 
>  #include "i915/gem.h"
> +#include "i915/gem_create.h"
>  #include "igt.h"
>  #include "igt_dummyload.h"
> +#include "igt_perf.h"
>  #include "igt_sysfs.h"
> 
>  IGT_TEST_DESCRIPTION("Render P-States tests - verify GPU frequency changes");
> @@ -614,14 +616,253 @@ static void waitboost(int fd, bool reset)
>  	igt_assert_lt(post_freqs[CUR], post_freqs[MAX]);
>  }
> 
> +static uint32_t batch_create(int i915, uint64_t sz)
> +{
> +	const uint32_t bbe = MI_BATCH_BUFFER_END;
> +	const uint32_t chk = 0x5 << 23;
> +	uint32_t handle = gem_create(i915, sz);
> +
> +	for (uint64_t pg = 4096; pg + 4096 < sz; pg += 4096)
> +		gem_write(i915, handle, pg, &chk, sizeof(chk));
> +
> +	gem_write(i915, handle, sz - sizeof(bbe), &bbe, sizeof(bbe));
> +
> +	return handle;
> +}
> +
> +static uint64_t __fence_order(int i915,
> +			      struct drm_i915_gem_exec_object2 *obj,
> +			      struct drm_i915_gem_execbuffer2 *eb,
> +			      uint64_t flags0, uint64_t flags1,
> +			      double *outf)
> +{
> +	uint64_t before[2], after[2];
> +	struct timespec tv;
> +	int fd;
> +
> +	gem_quiescent_gpu(i915);
> +	fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
> +
> +	igt_gettime(&tv);
> +
> +	obj->flags = flags0;
> +	gem_execbuf(i915, eb);
> +
> +	obj->flags = flags1;
> +	gem_execbuf(i915, eb);
> +
> +	read(fd, before, sizeof(before));
> +	gem_sync(i915, obj->handle);
> +	read(fd, after, sizeof(after));
> +	close(fd);
> +
> +	after[0] -= before[0];
> +	after[1] -= before[1];
> +
> +	*outf = 1e9 * after[0] / after[1];
> +	return igt_nsec_elapsed(&tv);
> +}
> +
> +static void fence_order(int i915)
> +{
> +	const uint64_t sz = 512ull << 20;

I tested this on one machine with drm-tip and the test lasted for
around 300 seconds. Can we lower this to 128 MB? It would then run in
around 15 s, about 20 times faster.

Regards,
Kamil

> +	struct drm_i915_gem_exec_object2 obj[2] = {
> +		{ .handle = gem_create(i915, 4096) },
> +		{ .handle = batch_create(i915, sz) },
> +	};
> +	struct drm_i915_gem_execbuffer2 execbuf = {
> +		.buffers_ptr = to_user_pointer(obj),
> +		.buffer_count = ARRAY_SIZE(obj),
> +	};
> +	uint64_t wr, rw;
> +	int min, max;
> +	double freq;
> +	int sysfs;
> +
> +	/*
> +	 * Check the order of fences found during GEM_WAIT does not affect
> +	 * waitboosting.
> +	 *
> +	 * Internally, implicit fences are tracked within a dma-resv which
> +	 * imposes no order on the individually fences tracked within. Since
> +	 * there is no defined order, the sequence of waits (and the associated
> +	 * waitboosts) is also undefined, undermining the consistency of the
> +	 * waitboost heuristic.
> +	 *
> +	 * In particular, we can influence the sequence of fence storage
> +	 * within dma-resv by mixing read/write semantics for implicit fences.
> +	 * We can exploit this property of dma-resv to exercise that no matter
> +	 * the stored order, the heuristic is applied consistently for the
> +	 * user's GEM_WAIT ioctl.
> +	 */
> +
> +	sysfs = igt_sysfs_open(i915);
> +	min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
> +	max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
> +	igt_require(max > min);
> +
> +	/* Only allow ourselves to upclock via waitboosting */
> +	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> +	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
> +	igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
> +
> +	/* Warm up to bind the vma */
> +	__fence_order(i915, &obj[0], &execbuf, 0, 0, &freq);
> +
> +	wr = __fence_order(i915, &obj[0], &execbuf, EXEC_OBJECT_WRITE, 0, &freq);
> +	igt_info("Write-then-read: %.2fms @ %.3fMHz\n", wr * 1e-6, freq);
> +
> +	rw = __fence_order(i915, &obj[0], &execbuf, 0, EXEC_OBJECT_WRITE, &freq);
> +	igt_info("Read-then-write: %.2fms @ %.3fMHz\n", rw * 1e-6, freq);
> +
> +	gem_close(i915, obj[0].handle);
> +	gem_close(i915, obj[1].handle);
> +
> +	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> +	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
> +
> +	igt_assert(4 * rw > 3 * wr && 4 * wr > 3 * rw);
> +}
> +
> +static uint64_t __engine_order(int i915,
> +			       struct drm_i915_gem_exec_object2 *obj,
> +			       struct drm_i915_gem_execbuffer2 *eb,
> +			       unsigned int *engines0,
> +			       unsigned int *engines1,
> +			       unsigned int num_engines,
> +			       double *outf)
> +{
> +	uint64_t before[2], after[2];
> +	struct timespec tv;
> +	int fd;
> +
> +	gem_quiescent_gpu(i915);
> +	fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
> +
> +	igt_gettime(&tv);
> +
> +	obj->flags = EXEC_OBJECT_WRITE;
> +	for (unsigned int n = 0; n < num_engines; n++) {
> +		eb->flags &= ~63ull;
> +		eb->flags |= engines0[n];
> +		gem_execbuf_wr(i915, eb);
> +	}
> +
> +	obj->flags = 0;
> +	for (unsigned int n = 0; n < num_engines; n++) {
> +		eb->flags &= ~63ull;
> +		eb->flags |= engines1[n];
> +		gem_execbuf(i915, eb);
> +	}
> +
> +	read(fd, before, sizeof(before));
> +	gem_sync(i915, obj->handle);
> +	read(fd, after, sizeof(after));
> +	close(fd);
> +
> +	after[0] -= before[0];
> +	after[1] -= before[1];
> +
> +	*outf = 1e9 * after[0] / after[1];
> +	return igt_nsec_elapsed(&tv);
> +}
> +
> +static void engine_order(int i915)
> +{
> +	const uint64_t sz = 512ull << 20;
> +	struct drm_i915_gem_exec_object2 obj[2] = {
> +		{ .handle = gem_create(i915, 4096) },
> +		{ .handle = batch_create(i915, sz) },
> +	};
> +	struct drm_i915_gem_execbuffer2 execbuf = {
> +		.buffers_ptr = to_user_pointer(obj),
> +		.buffer_count = ARRAY_SIZE(obj),
> +	};
> +	const struct intel_execution_engine2 *e;
> +	unsigned int engines[2], reverse[2];
> +	uint64_t forward, backward, both;
> +	unsigned int num_engines;
> +	const intel_ctx_t *ctx;
> +	int min, max;
> +	double freq;
> +	int sysfs;
> +
> +	/*
> +	 * Check the order of fences found during GEM_WAIT does not affect
> +	 * waitboosting. (See fence_order())
> +	 *
> +	 * Another way we can manipulate the order of fences within the
> +	 * dma-resv is through repeated use of the same contexts.
> +	 */
> +
> +	num_engines = 0;
> +	ctx = intel_ctx_create_all_physical(i915);
> +	for_each_ctx_engine(i915, ctx, e) {
> +		/*
> +		 * Avoid using the cmdparser as it will try to allocate
> +		 * a new shadow batch for each submission -> oom
> +		 */
> +		if (!gem_engine_has_mutable_submission(i915, e->class))
> +			continue;
> +
> +		engines[num_engines++] = e->flags;
> +		if (num_engines == ARRAY_SIZE(engines))
> +			break;
> +	}
> +	igt_require(num_engines > 1);
> +	for (unsigned int n = 0; n < num_engines; n++)
> +		reverse[n] = engines[num_engines - n - 1];
> +	execbuf.rsvd1 = ctx->id;
> +
> +	sysfs = igt_sysfs_open(i915);
> +	min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
> +	max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
> +	igt_require(max > min);
> +
> +	/* Only allow ourselves to upclock via waitboosting */
> +	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> +	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
> +	igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
> +
> +	/* Warm up to bind the vma */
> +	gem_execbuf(i915, &execbuf);
> +
> +	forward = __engine_order(i915, &obj[0], &execbuf,
> +				 engines, engines, num_engines,
> +				 &freq);
> +	igt_info("Forwards: %.2fms @ %.3fMhz\n", forward * 1e-6, freq);
> +
> +	backward = __engine_order(i915, &obj[0], &execbuf,
> +				  reverse, reverse, num_engines,
> +				  &freq);
> +	igt_info("Backwards: %.2fms @ %.3fMhz\n", backward * 1e-6, freq);
> +
> +	both = __engine_order(i915, &obj[0], &execbuf,
> +			      engines, reverse, num_engines,
> +			      &freq);
> +	igt_info("Bidirectional: %.2fms @ %.3fMhz\n", both * 1e-6, freq);
> +
> +	gem_close(i915, obj[0].handle);
> +	gem_close(i915, obj[1].handle);
> +	intel_ctx_destroy(i915, ctx);
> +
> +	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> +	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
> +
> +	igt_assert(4 * forward > 3 * backward && 4 * backward > 3 * forward);
> +	igt_assert(4 * forward > 3 * both && 4 * both > 3 * forward);
> +}
> +
>  static void pm_rps_exit_handler(int sig)
>  {
> -	if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
> -		writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
> -		writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
> -	} else {
> -		writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
> -		writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
> +	if (sysfs_files[MAX].filp) {
> +		if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
> +			writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
> +			writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
> +		} else {
> +			writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
> +			writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
> +		}
>  	}
> 
>  	if (lh.igt_proc.running)
> @@ -683,6 +924,14 @@ igt_main
>  	igt_subtest("waitboost")
>  		waitboost(drm_fd, false);
> 
> +	igt_describe("Check if the order of fences does not affect waitboosting");
> +	igt_subtest("fence-order")
> +		fence_order(drm_fd);
> +
> +	igt_describe("Check if context reuse does not affect waitboosting");
> +	igt_subtest("engine-order")
> +		engine_order(drm_fd);
> +
>  	/* Test boost frequency after GPU reset */
>  	igt_subtest("reset") {
>  		igt_hang_t hang = igt_allow_hang(drm_fd, 0, 0);
> --
> 2.25.1

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [igt-dev] [PATCH i-g-t v4] i915/i915_pm_rps: Check impact of fence ordering on waitboosting
  2022-07-26 10:49 ` Kamil Konieczny
@ 2022-07-27 12:27   ` Karolina Drobnik
       [not found]     ` <YuIeu2vSAFE8YnB3@kamilkon-desk1>
  0 siblings, 1 reply; 10+ messages in thread
From: Karolina Drobnik @ 2022-07-27 12:27 UTC (permalink / raw)
  To: Kamil Konieczny; +Cc: igt-dev, Chris Wilson

Hi,

On 26.07.2022 12:49, Kamil Konieczny wrote:
> Hi Karolina,
> 
> few more nits, see below.
> 
> On 2022-07-13 at 12:53:42 +0200, Karolina Drobnik wrote:
>> From: Chris Wilson <chris@chris-wilson.co.uk>
>>
>> Currently the wait boost heuristic is evaluated at the start of each
>> fence wait for a series within dma-resv. There is no strict ordering of
>> fences defined by dma-resv, and so it turns out that the same operation
>> under different circumstances can result in different heuristics being
>> applied, and dramatic performance variations in user applications.
>>
>> Link: https://gitlab.freedesktop.org/drm/intel/-/issues/6284
>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>> Signed-off-by: Karolina Drobnik <karolina.drobnik@intel.com>
>> ---
>> v4:
>>    - Added test descriptions with igt_describe()
>> v3:
>>    - Removed unnecessary calls to gem_write
>> v2:
>>    - Introduced batch_create() with arbitration points to prevent hangs
>>    - Added engine-order subtest
>>    - Added reporting of GPU frequency
>>    - Changed runtimes and switched prints from us to ms
>>
>>   tests/i915/i915_pm_rps.c | 261 ++++++++++++++++++++++++++++++++++++++-
>>   1 file changed, 255 insertions(+), 6 deletions(-)
>>
>> diff --git a/tests/i915/i915_pm_rps.c b/tests/i915/i915_pm_rps.c
>> index b37f15c2..6125ea60 100644
>> --- a/tests/i915/i915_pm_rps.c
>> +++ b/tests/i915/i915_pm_rps.c
>> @@ -37,8 +37,10 @@
>>   #include <sys/wait.h>
>>
>>   #include "i915/gem.h"
>> +#include "i915/gem_create.h"
>>   #include "igt.h"
>>   #include "igt_dummyload.h"
>> +#include "igt_perf.h"
>>   #include "igt_sysfs.h"
>>
>>   IGT_TEST_DESCRIPTION("Render P-States tests - verify GPU frequency changes");
>> @@ -614,14 +616,253 @@ static void waitboost(int fd, bool reset)
>>   	igt_assert_lt(post_freqs[CUR], post_freqs[MAX]);
>>   }
>>
>> +static uint32_t batch_create(int i915, uint64_t sz)
>> +{
>> +	const uint32_t bbe = MI_BATCH_BUFFER_END;
>> +	const uint32_t chk = 0x5 << 23;
>> +	uint32_t handle = gem_create(i915, sz);
>> +
>> +	for (uint64_t pg = 4096; pg + 4096 < sz; pg += 4096)
>> +		gem_write(i915, handle, pg, &chk, sizeof(chk));
>> +
>> +	gem_write(i915, handle, sz - sizeof(bbe), &bbe, sizeof(bbe));
>> +
>> +	return handle;
>> +}
>> +
>> +static uint64_t __fence_order(int i915,
>> +			      struct drm_i915_gem_exec_object2 *obj,
>> +			      struct drm_i915_gem_execbuffer2 *eb,
>> +			      uint64_t flags0, uint64_t flags1,
>> +			      double *outf)
>> +{
>> +	uint64_t before[2], after[2];
>> +	struct timespec tv;
>> +	int fd;
>> +
>> +	gem_quiescent_gpu(i915);
>> +	fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
>> +
>> +	igt_gettime(&tv);
>> +
>> +	obj->flags = flags0;
>> +	gem_execbuf(i915, eb);
>> +
>> +	obj->flags = flags1;
>> +	gem_execbuf(i915, eb);
>> +
>> +	read(fd, before, sizeof(before));
>> +	gem_sync(i915, obj->handle);
>> +	read(fd, after, sizeof(after));
>> +	close(fd);
>> +
>> +	after[0] -= before[0];
>> +	after[1] -= before[1];
> 
> Put igt_assert here (check for zero before divide).

Hmm, isn't that more of a belts and suspenders approach? It's unlikely 
we'd get zero here. Also, don't we put asserts towards the end of the 
test whenever possible?

>> +
>> +	*outf = 1e9 * after[0] / after[1];
>> +	return igt_nsec_elapsed(&tv);
>> +}
>> +
>> +static void fence_order(int i915)
>> +{
>> +	const uint64_t sz = 512ull << 20;
>> +	struct drm_i915_gem_exec_object2 obj[2] = {
>> +		{ .handle = gem_create(i915, 4096) },
>> +		{ .handle = batch_create(i915, sz) },
>> +	};
>> +	struct drm_i915_gem_execbuffer2 execbuf = {
>> +		.buffers_ptr = to_user_pointer(obj),
>> +		.buffer_count = ARRAY_SIZE(obj),
>> +	};
>> +	uint64_t wr, rw;
>> +	int min, max;
>> +	double freq;
>> +	int sysfs;
>> +
>> +	/*
>> +	 * Check the order of fences found during GEM_WAIT does not affect
>> +	 * waitboosting.
>> +	 *
>> +	 * Internally, implicit fences are tracked within a dma-resv which
>> +	 * imposes no order on the individually fences tracked within. Since
>> +	 * there is no defined order, the sequence of waits (and the associated
>> +	 * waitboosts) is also undefined, undermining the consistency of the
>> +	 * waitboost heuristic.
>> +	 *
>> +	 * In particular, we can influence the sequence of fence storage
>> +	 * within dma-resv by mixing read/write semantics for implicit fences.
>> +	 * We can exploit this property of dma-resv to exercise that no matter
>> +	 * the stored order, the heuristic is applied consistently for the
>> +	 * user's GEM_WAIT ioctl.
>> +	 */
>> +
>> +	sysfs = igt_sysfs_open(i915);
>> +	min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
>> +	max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
>> +	igt_require(max > min);
>> +
>> +	/* Only allow ourselves to upclock via waitboosting */
>> +	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>> +	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
>> +	igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
>> +
>> +	/* Warm up to bind the vma */
>> +	__fence_order(i915, &obj[0], &execbuf, 0, 0, &freq);
>> +
>> +	wr = __fence_order(i915, &obj[0], &execbuf, EXEC_OBJECT_WRITE, 0, &freq);
>> +	igt_info("Write-then-read: %.2fms @ %.3fMHz\n", wr * 1e-6, freq);
>> +
>> +	rw = __fence_order(i915, &obj[0], &execbuf, 0, EXEC_OBJECT_WRITE, &freq);
>> +	igt_info("Read-then-write: %.2fms @ %.3fMHz\n", rw * 1e-6, freq);
>> +
>> +	gem_close(i915, obj[0].handle);
>> +	gem_close(i915, obj[1].handle);
> 
> Close sysfs here.

Good catch. I'll close it before asserts (as agreed).

>> +
>> +	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>> +	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
> 
> Here we should put previous values, not those readed from RPx,
> but if they are always the same it is ok.

We restore the values we read from sysfs.

>> +
>> +	igt_assert(4 * rw > 3 * wr && 4 * wr > 3 * rw);
>> +}
>> +
>> +static uint64_t __engine_order(int i915,
>> +			       struct drm_i915_gem_exec_object2 *obj,
>> +			       struct drm_i915_gem_execbuffer2 *eb,
>> +			       unsigned int *engines0,
>> +			       unsigned int *engines1,
>> +			       unsigned int num_engines,
>> +			       double *outf)
>> +{
>> +	uint64_t before[2], after[2];
>> +	struct timespec tv;
>> +	int fd;
>> +
>> +	gem_quiescent_gpu(i915);
>> +	fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
>> +
>> +	igt_gettime(&tv);
>> +
>> +	obj->flags = EXEC_OBJECT_WRITE;
>> +	for (unsigned int n = 0; n < num_engines; n++) {
>> +		eb->flags &= ~63ull;
>> +		eb->flags |= engines0[n];
>> +		gem_execbuf_wr(i915, eb);
>> +	}
>> +
>> +	obj->flags = 0;
>> +	for (unsigned int n = 0; n < num_engines; n++) {
>> +		eb->flags &= ~63ull;
>> +		eb->flags |= engines1[n];
>> +		gem_execbuf(i915, eb);
>> +	}
>> +
>> +	read(fd, before, sizeof(before));
>> +	gem_sync(i915, obj->handle);
>> +	read(fd, after, sizeof(after));
>> +	close(fd);
>> +
>> +	after[0] -= before[0];
>> +	after[1] -= before[1];
> 
> Put igt_assert here (check for zero before divide).
> 
>> +
>> +	*outf = 1e9 * after[0] / after[1];
>> +	return igt_nsec_elapsed(&tv);
>> +}
>> +
>> +static void engine_order(int i915)
>> +{
>> +	const uint64_t sz = 512ull << 20;
>> +	struct drm_i915_gem_exec_object2 obj[2] = {
>> +		{ .handle = gem_create(i915, 4096) },
>> +		{ .handle = batch_create(i915, sz) },
>> +	};
>> +	struct drm_i915_gem_execbuffer2 execbuf = {
>> +		.buffers_ptr = to_user_pointer(obj),
>> +		.buffer_count = ARRAY_SIZE(obj),
>> +	};
>> +	const struct intel_execution_engine2 *e;
>> +	unsigned int engines[2], reverse[2];
>> +	uint64_t forward, backward, both;
>> +	unsigned int num_engines;
>> +	const intel_ctx_t *ctx;
>> +	int min, max;
>> +	double freq;
>> +	int sysfs;
>> +
>> +	/*
>> +	 * Check the order of fences found during GEM_WAIT does not affect
>> +	 * waitboosting. (See fence_order())
>> +	 *
>> +	 * Another way we can manipulate the order of fences within the
>> +	 * dma-resv is through repeated use of the same contexts.
>> +	 */
>> +
>> +	num_engines = 0;
>> +	ctx = intel_ctx_create_all_physical(i915);
>> +	for_each_ctx_engine(i915, ctx, e) {
>> +		/*
>> +		 * Avoid using the cmdparser as it will try to allocate
>> +		 * a new shadow batch for each submission -> oom
>> +		 */
>> +		if (!gem_engine_has_mutable_submission(i915, e->class))
>> +			continue;
>> +
>> +		engines[num_engines++] = e->flags;
>> +		if (num_engines == ARRAY_SIZE(engines))
>> +			break;
>> +	}
>> +	igt_require(num_engines > 1);
>> +	for (unsigned int n = 0; n < num_engines; n++)
>> +		reverse[n] = engines[num_engines - n - 1];
>> +	execbuf.rsvd1 = ctx->id;
>> +
>> +	sysfs = igt_sysfs_open(i915);
>> +	min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
>> +	max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
>> +	igt_require(max > min);
>> +
>> +	/* Only allow ourselves to upclock via waitboosting */
>> +	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>> +	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
>> +	igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
>> +
>> +	/* Warm up to bind the vma */
>> +	gem_execbuf(i915, &execbuf);
>> +
>> +	forward = __engine_order(i915, &obj[0], &execbuf,
>> +				 engines, engines, num_engines,
>> +				 &freq);
>> +	igt_info("Forwards: %.2fms @ %.3fMhz\n", forward * 1e-6, freq);
>> +
>> +	backward = __engine_order(i915, &obj[0], &execbuf,
>> +				  reverse, reverse, num_engines,
>> +				  &freq);
>> +	igt_info("Backwards: %.2fms @ %.3fMhz\n", backward * 1e-6, freq);
>> +
>> +	both = __engine_order(i915, &obj[0], &execbuf,
>> +			      engines, reverse, num_engines,
>> +			      &freq);
>> +	igt_info("Bidirectional: %.2fms @ %.3fMhz\n", both * 1e-6, freq);
>> +
>> +	gem_close(i915, obj[0].handle);
>> +	gem_close(i915, obj[1].handle);
> 
> Close sysfs here.

Ack

> 
>> +	intel_ctx_destroy(i915, ctx);
>> +
>> +	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>> +	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
>> +
>> +	igt_assert(4 * forward > 3 * backward && 4 * backward > 3 * forward);
>> +	igt_assert(4 * forward > 3 * both && 4 * both > 3 * forward);
>> +}
>> +
>>   static void pm_rps_exit_handler(int sig)
>>   {
>> -	if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
>> -		writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>> -		writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>> -	} else {
>> -		writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>> -		writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>> +	if (sysfs_files[MAX].filp) {
>> +		if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
>> +			writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>> +			writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>> +		} else {
>> +			writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>> +			writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>> +		}
> 
> This change looks unrelated. Please split this as separate patch
> and post as a fix.

Will do so

Many thanks,
Karolina

> Regards,
> Kamil
> 
>>   	}
>>
>>   	if (lh.igt_proc.running)
>> @@ -683,6 +924,14 @@ igt_main
>>   	igt_subtest("waitboost")
>>   		waitboost(drm_fd, false);
>>
>> +	igt_describe("Check if the order of fences does not affect waitboosting");
>> +	igt_subtest("fence-order")
>> +		fence_order(drm_fd);
>> +
>> +	igt_describe("Check if context reuse does not affect waitboosting");
>> +	igt_subtest("engine-order")
>> +		engine_order(drm_fd);
>> +
>>   	/* Test boost frequency after GPU reset */
>>   	igt_subtest("reset") {
>>   		igt_hang_t hang = igt_allow_hang(drm_fd, 0, 0);
>> --
>> 2.25.1

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [igt-dev] [PATCH i-g-t v4] i915/i915_pm_rps: Check impact of fence ordering on waitboosting
       [not found]     ` <YuIeu2vSAFE8YnB3@kamilkon-desk1>
@ 2022-07-28 14:04       ` Karolina Drobnik
  2022-07-28 14:12       ` Chris Wilson
  1 sibling, 0 replies; 10+ messages in thread
From: Karolina Drobnik @ 2022-07-28 14:04 UTC (permalink / raw)
  To: Kamil Konieczny; +Cc: igt-dev, Chris Wilson

Hi,

On 28.07.2022 07:29, Kamil Konieczny wrote:
> Hi,
> 
> On 2022-07-27 at 14:27:02 +0200, Karolina Drobnik wrote:
>> Hi,
>>
>> On 26.07.2022 12:49, Kamil Konieczny wrote:
>>> Hi Karolina,
>>>
>>> few more nits, see below.
>>>
>>> On 2022-07-13 at 12:53:42 +0200, Karolina Drobnik wrote:
>>>> From: Chris Wilson <chris@chris-wilson.co.uk>
>>>>
>>>> Currently the wait boost heuristic is evaluated at the start of each
>>>> fence wait for a series within dma-resv. There is no strict ordering of
>>>> fences defined by dma-resv, and so it turns out that the same operation
>>>> under different circumstances can result in different heuristics being
>>>> applied, and dramatic performance variations in user applications.
>>>>
>>>> Link: https://gitlab.freedesktop.org/drm/intel/-/issues/6284
>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>>> Signed-off-by: Karolina Drobnik <karolina.drobnik@intel.com>
>>>> ---
>>>> v4:
>>>>     - Added test descriptions with igt_describe()
>>>> v3:
>>>>     - Removed unnecessary calls to gem_write
>>>> v2:
>>>>     - Introduced batch_create() with arbitration points to prevent hangs
>>>>     - Added engine-order subtest
>>>>     - Added reporting of GPU frequency
>>>>     - Changed runtimes and switched prints from us to ms
>>>>
>>>>    tests/i915/i915_pm_rps.c | 261 ++++++++++++++++++++++++++++++++++++++-
>>>>    1 file changed, 255 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/tests/i915/i915_pm_rps.c b/tests/i915/i915_pm_rps.c
>>>> index b37f15c2..6125ea60 100644
>>>> --- a/tests/i915/i915_pm_rps.c
>>>> +++ b/tests/i915/i915_pm_rps.c
>>>> @@ -37,8 +37,10 @@
>>>>    #include <sys/wait.h>
>>>>
>>>>    #include "i915/gem.h"
>>>> +#include "i915/gem_create.h"
>>>>    #include "igt.h"
>>>>    #include "igt_dummyload.h"
>>>> +#include "igt_perf.h"
>>>>    #include "igt_sysfs.h"
>>>>
>>>>    IGT_TEST_DESCRIPTION("Render P-States tests - verify GPU frequency changes");
>>>> @@ -614,14 +616,253 @@ static void waitboost(int fd, bool reset)
>>>>    	igt_assert_lt(post_freqs[CUR], post_freqs[MAX]);
>>>>    }
>>>>
>>>> +static uint32_t batch_create(int i915, uint64_t sz)
>>>> +{
>>>> +	const uint32_t bbe = MI_BATCH_BUFFER_END;
>>>> +	const uint32_t chk = 0x5 << 23;
>>>> +	uint32_t handle = gem_create(i915, sz);
>>>> +
>>>> +	for (uint64_t pg = 4096; pg + 4096 < sz; pg += 4096)
>>>> +		gem_write(i915, handle, pg, &chk, sizeof(chk));
>>>> +
>>>> +	gem_write(i915, handle, sz - sizeof(bbe), &bbe, sizeof(bbe));
>>>> +
>>>> +	return handle;
>>>> +}
>>>> +
>>>> +static uint64_t __fence_order(int i915,
>>>> +			      struct drm_i915_gem_exec_object2 *obj,
>>>> +			      struct drm_i915_gem_execbuffer2 *eb,
>>>> +			      uint64_t flags0, uint64_t flags1,
>>>> +			      double *outf)
>>>> +{
>>>> +	uint64_t before[2], after[2];
>>>> +	struct timespec tv;
>>>> +	int fd;
>>>> +
>>>> +	gem_quiescent_gpu(i915);
>>>> +	fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
>>>> +
>>>> +	igt_gettime(&tv);
>>>> +
>>>> +	obj->flags = flags0;
>>>> +	gem_execbuf(i915, eb);
>>>> +
>>>> +	obj->flags = flags1;
>>>> +	gem_execbuf(i915, eb);
>>>> +
>>>> +	read(fd, before, sizeof(before));
>>>> +	gem_sync(i915, obj->handle);
>>>> +	read(fd, after, sizeof(after));
>>>> +	close(fd);
>>>> +
>>>> +	after[0] -= before[0];
>>>> +	after[1] -= before[1];
>>>
>>> Put igt_assert here (check for zero before divide).
>>
>> Hmm, isn't that more of a belts and suspenders approach? It's unlikely we'd
>> get zero here. Also, don't we put asserts towards the end of the test
>> whenever possible?
> 
> It is userspace so we can afford some extra checks, so maybe
> 	if (after[1] == 0)
> 		return 0;

I agree that it won't cost much, but I just wonder if we're going to end 
up in such situation.

>>>> +
>>>> +	*outf = 1e9 * after[0] / after[1];
>>>> +	return igt_nsec_elapsed(&tv);
>>>> +}
>>>> +
>>>> +static void fence_order(int i915)
>>>> +{
>>>> +	const uint64_t sz = 512ull << 20;
>>>> +	struct drm_i915_gem_exec_object2 obj[2] = {
>>>> +		{ .handle = gem_create(i915, 4096) },
>>>> +		{ .handle = batch_create(i915, sz) },
>>>> +	};
>>>> +	struct drm_i915_gem_execbuffer2 execbuf = {
>>>> +		.buffers_ptr = to_user_pointer(obj),
>>>> +		.buffer_count = ARRAY_SIZE(obj),
>>>> +	};
>>>> +	uint64_t wr, rw;
>>>> +	int min, max;
>>>> +	double freq;
>>>> +	int sysfs;
>>>> +
>>>> +	/*
>>>> +	 * Check the order of fences found during GEM_WAIT does not affect
>>>> +	 * waitboosting.
>>>> +	 *
>>>> +	 * Internally, implicit fences are tracked within a dma-resv which
>>>> +	 * imposes no order on the individually fences tracked within. Since
>>>> +	 * there is no defined order, the sequence of waits (and the associated
>>>> +	 * waitboosts) is also undefined, undermining the consistency of the
>>>> +	 * waitboost heuristic.
>>>> +	 *
>>>> +	 * In particular, we can influence the sequence of fence storage
>>>> +	 * within dma-resv by mixing read/write semantics for implicit fences.
>>>> +	 * We can exploit this property of dma-resv to exercise that no matter
>>>> +	 * the stored order, the heuristic is applied consistently for the
>>>> +	 * user's GEM_WAIT ioctl.
>>>> +	 */
>>>> +
>>>> +	sysfs = igt_sysfs_open(i915);
>>>> +	min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
>>>> +	max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
>>>> +	igt_require(max > min);
>>>> +
>>>> +	/* Only allow ourselves to upclock via waitboosting */
>>>> +	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>>>> +	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
>>>> +	igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
>>>> +
>>>> +	/* Warm up to bind the vma */
>>>> +	__fence_order(i915, &obj[0], &execbuf, 0, 0, &freq);
>>>> +
>>>> +	wr = __fence_order(i915, &obj[0], &execbuf, EXEC_OBJECT_WRITE, 0, &freq);
>>>> +	igt_info("Write-then-read: %.2fms @ %.3fMHz\n", wr * 1e-6, freq);
>>>> +
>>>> +	rw = __fence_order(i915, &obj[0], &execbuf, 0, EXEC_OBJECT_WRITE, &freq);
>>>> +	igt_info("Read-then-write: %.2fms @ %.3fMHz\n", rw * 1e-6, freq);
>>>> +
>>>> +	gem_close(i915, obj[0].handle);
>>>> +	gem_close(i915, obj[1].handle);
>>>
>>> Close sysfs here.
>>
>> Good catch. I'll close it before asserts (as agreed).
>>
> 
> My bad, this close should be after resetting max freq, two lines
> below. Btw why not reuse code from other tests, reading old freqs
> at beginning of test or reusing freqs read at fixture in
> igt_main ?

No worries. Hm, I'd say it's because of how the fixture is likely to 
change in the future.

>>>> +
>>>> +	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>>>> +	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
>>>
>>> Here we should put previous values, not those readed from RPx,
>>> but if they are always the same it is ok.
>>
>> We restore the values we read from sysfs
>>
> 
> But we read RPx values, not gt_min|gt_max freq ones, so here
> you can use origfreqs[MIN] and origfreqs[MAX] or declare in test
> 
> int old_freqs[NUMFREQ];
> 
> and read them with read_freqs(old_freqs) and then use for restore.
I'd keep it as it is

Many thanks,
Karolina

> Regards,
> Kamil
> 
>>>> +
>>>> +	igt_assert(4 * rw > 3 * wr && 4 * wr > 3 * rw);
>>>> +}
>>>> +
>>>> +static uint64_t __engine_order(int i915,
>>>> +			       struct drm_i915_gem_exec_object2 *obj,
>>>> +			       struct drm_i915_gem_execbuffer2 *eb,
>>>> +			       unsigned int *engines0,
>>>> +			       unsigned int *engines1,
>>>> +			       unsigned int num_engines,
>>>> +			       double *outf)
>>>> +{
>>>> +	uint64_t before[2], after[2];
>>>> +	struct timespec tv;
>>>> +	int fd;
>>>> +
>>>> +	gem_quiescent_gpu(i915);
>>>> +	fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
>>>> +
>>>> +	igt_gettime(&tv);
>>>> +
>>>> +	obj->flags = EXEC_OBJECT_WRITE;
>>>> +	for (unsigned int n = 0; n < num_engines; n++) {
>>>> +		eb->flags &= ~63ull;
>>>> +		eb->flags |= engines0[n];
>>>> +		gem_execbuf_wr(i915, eb);
>>>> +	}
>>>> +
>>>> +	obj->flags = 0;
>>>> +	for (unsigned int n = 0; n < num_engines; n++) {
>>>> +		eb->flags &= ~63ull;
>>>> +		eb->flags |= engines1[n];
>>>> +		gem_execbuf(i915, eb);
>>>> +	}
>>>> +
>>>> +	read(fd, before, sizeof(before));
>>>> +	gem_sync(i915, obj->handle);
>>>> +	read(fd, after, sizeof(after));
>>>> +	close(fd);
>>>> +
>>>> +	after[0] -= before[0];
>>>> +	after[1] -= before[1];
>>>
>>> Put igt_assert here (check for zero before divide).
>>>
>>>> +
>>>> +	*outf = 1e9 * after[0] / after[1];
>>>> +	return igt_nsec_elapsed(&tv);
>>>> +}
>>>> +
>>>> +static void engine_order(int i915)
>>>> +{
>>>> +	const uint64_t sz = 512ull << 20;
>>>> +	struct drm_i915_gem_exec_object2 obj[2] = {
>>>> +		{ .handle = gem_create(i915, 4096) },
>>>> +		{ .handle = batch_create(i915, sz) },
>>>> +	};
>>>> +	struct drm_i915_gem_execbuffer2 execbuf = {
>>>> +		.buffers_ptr = to_user_pointer(obj),
>>>> +		.buffer_count = ARRAY_SIZE(obj),
>>>> +	};
>>>> +	const struct intel_execution_engine2 *e;
>>>> +	unsigned int engines[2], reverse[2];
>>>> +	uint64_t forward, backward, both;
>>>> +	unsigned int num_engines;
>>>> +	const intel_ctx_t *ctx;
>>>> +	int min, max;
>>>> +	double freq;
>>>> +	int sysfs;
>>>> +
>>>> +	/*
>>>> +	 * Check the order of fences found during GEM_WAIT does not affect
>>>> +	 * waitboosting. (See fence_order())
>>>> +	 *
>>>> +	 * Another way we can manipulate the order of fences within the
>>>> +	 * dma-resv is through repeated use of the same contexts.
>>>> +	 */
>>>> +
>>>> +	num_engines = 0;
>>>> +	ctx = intel_ctx_create_all_physical(i915);
>>>> +	for_each_ctx_engine(i915, ctx, e) {
>>>> +		/*
>>>> +		 * Avoid using the cmdparser as it will try to allocate
>>>> +		 * a new shadow batch for each submission -> oom
>>>> +		 */
>>>> +		if (!gem_engine_has_mutable_submission(i915, e->class))
>>>> +			continue;
>>>> +
>>>> +		engines[num_engines++] = e->flags;
>>>> +		if (num_engines == ARRAY_SIZE(engines))
>>>> +			break;
>>>> +	}
>>>> +	igt_require(num_engines > 1);
>>>> +	for (unsigned int n = 0; n < num_engines; n++)
>>>> +		reverse[n] = engines[num_engines - n - 1];
>>>> +	execbuf.rsvd1 = ctx->id;
>>>> +
>>>> +	sysfs = igt_sysfs_open(i915);
>>>> +	min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
>>>> +	max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
>>>> +	igt_require(max > min);
>>>> +
>>>> +	/* Only allow ourselves to upclock via waitboosting */
>>>> +	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>>>> +	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
>>>> +	igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
>>>> +
>>>> +	/* Warm up to bind the vma */
>>>> +	gem_execbuf(i915, &execbuf);
>>>> +
>>>> +	forward = __engine_order(i915, &obj[0], &execbuf,
>>>> +				 engines, engines, num_engines,
>>>> +				 &freq);
>>>> +	igt_info("Forwards: %.2fms @ %.3fMhz\n", forward * 1e-6, freq);
>>>> +
>>>> +	backward = __engine_order(i915, &obj[0], &execbuf,
>>>> +				  reverse, reverse, num_engines,
>>>> +				  &freq);
>>>> +	igt_info("Backwards: %.2fms @ %.3fMhz\n", backward * 1e-6, freq);
>>>> +
>>>> +	both = __engine_order(i915, &obj[0], &execbuf,
>>>> +			      engines, reverse, num_engines,
>>>> +			      &freq);
>>>> +	igt_info("Bidirectional: %.2fms @ %.3fMhz\n", both * 1e-6, freq);
>>>> +
>>>> +	gem_close(i915, obj[0].handle);
>>>> +	gem_close(i915, obj[1].handle);
>>>
>>> Close sysfs here.
>>
>> Ack
>>
>>>
>>>> +	intel_ctx_destroy(i915, ctx);
>>>> +
>>>> +	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>>>> +	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
>>>> +
>>>> +	igt_assert(4 * forward > 3 * backward && 4 * backward > 3 * forward);
>>>> +	igt_assert(4 * forward > 3 * both && 4 * both > 3 * forward);
>>>> +}
>>>> +
>>>>    static void pm_rps_exit_handler(int sig)
>>>>    {
>>>> -	if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
>>>> -		writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>>>> -		writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>>>> -	} else {
>>>> -		writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>>>> -		writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>>>> +	if (sysfs_files[MAX].filp) {
>>>> +		if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
>>>> +			writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>>>> +			writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>>>> +		} else {
>>>> +			writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>>>> +			writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>>>> +		}
>>>
>>> This change looks unrelated. Please split this as separate patch
>>> and post as a fix.
>>
>> Will do so
>>
>> Many thanks,
>> Karolina
>>
>>> Regards,
>>> Kamil
>>>
>>>>    	}
>>>>
>>>>    	if (lh.igt_proc.running)
>>>> @@ -683,6 +924,14 @@ igt_main
>>>>    	igt_subtest("waitboost")
>>>>    		waitboost(drm_fd, false);
>>>>
>>>> +	igt_describe("Check if the order of fences does not affect waitboosting");
>>>> +	igt_subtest("fence-order")
>>>> +		fence_order(drm_fd);
>>>> +
>>>> +	igt_describe("Check if context reuse does not affect waitboosting");
>>>> +	igt_subtest("engine-order")
>>>> +		engine_order(drm_fd);
>>>> +
>>>>    	/* Test boost frequency after GPU reset */
>>>>    	igt_subtest("reset") {
>>>>    		igt_hang_t hang = igt_allow_hang(drm_fd, 0, 0);
>>>> --
>>>> 2.25.1

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [igt-dev] [PATCH i-g-t v4] i915/i915_pm_rps: Check impact of fence ordering on waitboosting
       [not found]     ` <YuIeu2vSAFE8YnB3@kamilkon-desk1>
  2022-07-28 14:04       ` Karolina Drobnik
@ 2022-07-28 14:12       ` Chris Wilson
  1 sibling, 0 replies; 10+ messages in thread
From: Chris Wilson @ 2022-07-28 14:12 UTC (permalink / raw)
  To: Kamil Konieczny, igt-dev

Quoting Kamil Konieczny (2022-07-28 06:29:31)
> Hi,
> 
> On 2022-07-27 at 14:27:02 +0200, Karolina Drobnik wrote:
> > Hi,
> > 
> > On 26.07.2022 12:49, Kamil Konieczny wrote:
> > > Hi Karolina,
> > > 
> > > few more nits, see below.
> > > 
> > > On 2022-07-13 at 12:53:42 +0200, Karolina Drobnik wrote:
> > > > From: Chris Wilson <chris@chris-wilson.co.uk>
> > > > 
> > > > Currently the wait boost heuristic is evaluated at the start of each
> > > > fence wait for a series within dma-resv. There is no strict ordering of
> > > > fences defined by dma-resv, and so it turns out that the same operation
> > > > under different circumstances can result in different heuristics being
> > > > applied, and dramatic performance variations in user applications.
> > > > 
> > > > Link: https://gitlab.freedesktop.org/drm/intel/-/issues/6284
> > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > > Signed-off-by: Karolina Drobnik <karolina.drobnik@intel.com>
> > > > ---
> > > > v4:
> > > >    - Added test descriptions with igt_describe()
> > > > v3:
> > > >    - Removed unnecessary calls to gem_write
> > > > v2:
> > > >    - Introduced batch_create() with arbitration points to prevent hangs
> > > >    - Added engine-order subtest
> > > >    - Added reporting of GPU frequency
> > > >    - Changed runtimes and switched prints from us to ms
> > > > 
> > > >   tests/i915/i915_pm_rps.c | 261 ++++++++++++++++++++++++++++++++++++++-
> > > >   1 file changed, 255 insertions(+), 6 deletions(-)
> > > > 
> > > > diff --git a/tests/i915/i915_pm_rps.c b/tests/i915/i915_pm_rps.c
> > > > index b37f15c2..6125ea60 100644
> > > > --- a/tests/i915/i915_pm_rps.c
> > > > +++ b/tests/i915/i915_pm_rps.c
> > > > @@ -37,8 +37,10 @@
> > > >   #include <sys/wait.h>
> > > > 
> > > >   #include "i915/gem.h"
> > > > +#include "i915/gem_create.h"
> > > >   #include "igt.h"
> > > >   #include "igt_dummyload.h"
> > > > +#include "igt_perf.h"
> > > >   #include "igt_sysfs.h"
> > > > 
> > > >   IGT_TEST_DESCRIPTION("Render P-States tests - verify GPU frequency changes");
> > > > @@ -614,14 +616,253 @@ static void waitboost(int fd, bool reset)
> > > >           igt_assert_lt(post_freqs[CUR], post_freqs[MAX]);
> > > >   }
> > > > 
> > > > +static uint32_t batch_create(int i915, uint64_t sz)
> > > > +{
> > > > + const uint32_t bbe = MI_BATCH_BUFFER_END;
> > > > + const uint32_t chk = 0x5 << 23;
> > > > + uint32_t handle = gem_create(i915, sz);
> > > > +
> > > > + for (uint64_t pg = 4096; pg + 4096 < sz; pg += 4096)
> > > > +         gem_write(i915, handle, pg, &chk, sizeof(chk));
> > > > +
> > > > + gem_write(i915, handle, sz - sizeof(bbe), &bbe, sizeof(bbe));
> > > > +
> > > > + return handle;
> > > > +}
> > > > +
> > > > +static uint64_t __fence_order(int i915,
> > > > +                       struct drm_i915_gem_exec_object2 *obj,
> > > > +                       struct drm_i915_gem_execbuffer2 *eb,
> > > > +                       uint64_t flags0, uint64_t flags1,
> > > > +                       double *outf)
> > > > +{
> > > > + uint64_t before[2], after[2];
> > > > + struct timespec tv;
> > > > + int fd;
> > > > +
> > > > + gem_quiescent_gpu(i915);
> > > > + fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
> > > > +
> > > > + igt_gettime(&tv);
> > > > +
> > > > + obj->flags = flags0;
> > > > + gem_execbuf(i915, eb);
> > > > +
> > > > + obj->flags = flags1;
> > > > + gem_execbuf(i915, eb);
> > > > +
> > > > + read(fd, before, sizeof(before));
> > > > + gem_sync(i915, obj->handle);
> > > > + read(fd, after, sizeof(after));
> > > > + close(fd);
> > > > +
> > > > + after[0] -= before[0];
> > > > + after[1] -= before[1];
> > > 
> > > Put igt_assert here (check for zero before divide).
> > 
> > Hmm, isn't that more of a belts and suspenders approach? It's unlikely we'd
> > get zero here. Also, don't we put asserts towards the end of the test
> > whenever possible?
> 
> It is userspace so we can afford some extra checks, so maybe
>         if (after[1] == 0)
>                 return 0;
> > 
> > > > +
> > > > + *outf = 1e9 * after[0] / after[1];
> > > > + return igt_nsec_elapsed(&tv);
> > > > +}
> > > > +
> > > > +static void fence_order(int i915)
> > > > +{
> > > > + const uint64_t sz = 512ull << 20;
> > > > + struct drm_i915_gem_exec_object2 obj[2] = {
> > > > +         { .handle = gem_create(i915, 4096) },
> > > > +         { .handle = batch_create(i915, sz) },
> > > > + };
> > > > + struct drm_i915_gem_execbuffer2 execbuf = {
> > > > +         .buffers_ptr = to_user_pointer(obj),
> > > > +         .buffer_count = ARRAY_SIZE(obj),
> > > > + };
> > > > + uint64_t wr, rw;
> > > > + int min, max;
> > > > + double freq;
> > > > + int sysfs;
> > > > +
> > > > + /*
> > > > +  * Check the order of fences found during GEM_WAIT does not affect
> > > > +  * waitboosting.
> > > > +  *
> > > > +  * Internally, implicit fences are tracked within a dma-resv which
> > > > +  * imposes no order on the individually fences tracked within. Since
> > > > +  * there is no defined order, the sequence of waits (and the associated
> > > > +  * waitboosts) is also undefined, undermining the consistency of the
> > > > +  * waitboost heuristic.
> > > > +  *
> > > > +  * In particular, we can influence the sequence of fence storage
> > > > +  * within dma-resv by mixing read/write semantics for implicit fences.
> > > > +  * We can exploit this property of dma-resv to exercise that no matter
> > > > +  * the stored order, the heuristic is applied consistently for the
> > > > +  * user's GEM_WAIT ioctl.
> > > > +  */
> > > > +
> > > > + sysfs = igt_sysfs_open(i915);
> > > > + min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
> > > > + max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
> > > > + igt_require(max > min);
> > > > +
> > > > + /* Only allow ourselves to upclock via waitboosting */
> > > > + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> > > > + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
> > > > + igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
> > > > +
> > > > + /* Warm up to bind the vma */
> > > > + __fence_order(i915, &obj[0], &execbuf, 0, 0, &freq);
> > > > +
> > > > + wr = __fence_order(i915, &obj[0], &execbuf, EXEC_OBJECT_WRITE, 0, &freq);
> > > > + igt_info("Write-then-read: %.2fms @ %.3fMHz\n", wr * 1e-6, freq);
> > > > +
> > > > + rw = __fence_order(i915, &obj[0], &execbuf, 0, EXEC_OBJECT_WRITE, &freq);
> > > > + igt_info("Read-then-write: %.2fms @ %.3fMHz\n", rw * 1e-6, freq);
> > > > +
> > > > + gem_close(i915, obj[0].handle);
> > > > + gem_close(i915, obj[1].handle);
> > > 
> > > Close sysfs here.
> > 
> > Good catch. I'll close it before asserts (as agreed).
> > 
> 
> My bad, this close should be after resetting max freq, two lines
> below. Btw why not reuse code from other tests, reading old freqs
> at beginning of test or reusing freqs read at fixture in
> igt_main ?
> 
> > > > +
> > > > + igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
> > > > + igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
> > > 
> > > Here we should put previous values, not those readed from RPx,
> > > but if they are always the same it is ok.
> > 
> > We restore the values we read from sysfs
> > 
> 
> But we read RPx values, not gt_min|gt_max freq ones, so here
> you can use origfreqs[MIN] and origfreqs[MAX] or declare in test

Not required. igt owns the device. We completely take over the device,
restoring default properties, all previous state will be lost. Anything
that pretends otherwise is misleading.
-Chris

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2022-07-28 14:28 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-07-13 10:53 [igt-dev] [PATCH i-g-t v4] i915/i915_pm_rps: Check impact of fence ordering on waitboosting Karolina Drobnik
2022-07-15 17:30 ` Kamil Konieczny
2022-07-18  9:46   ` Karolina Drobnik
2022-07-25  7:41     ` Karolina Drobnik
2022-07-25 17:06     ` Kamil Konieczny
2022-07-26 10:49 ` Kamil Konieczny
2022-07-27 12:27   ` Karolina Drobnik
     [not found]     ` <YuIeu2vSAFE8YnB3@kamilkon-desk1>
2022-07-28 14:04       ` Karolina Drobnik
2022-07-28 14:12       ` Chris Wilson
2022-07-26 14:18 ` Kamil Konieczny

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.